2dd1c49f6a5368c25ab369da4f460cb6ff715a29
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False        # set to True once _real_initialize() has run
    _downloader = None    # FileDownloader instance, set via set_downloader()
    _WORKING = True       # subclasses set this to False for broken extractors

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        Idempotent: _real_initialize() runs at most once per instance.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default name: the class name minus its trailing "IE"
        # (e.g. "DailymotionIE" -> "Dailymotion"). Subclasses may shadow
        # this property with a plain class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note semantics: None -> default "Downloading webpage" message,
        # False -> silent, any other value -> printed as-is.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): 'sys' is not imported at the top of this file;
            # presumably it reaches this scope via 'from .utils import *' --
            # confirm, otherwise this error path raises NameError.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the charset out of e.g. "text/html; charset=ISO-8859-1";
        # fall back to UTF-8 when the header does not declare one.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # A Request object has get_full_url(); a plain string URL does not.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 so binary/garbled pages survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going on mis-declared encodings.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608.
    # They set the correct value of the '_type' key so the downloader
    # knows whether it received a video, a pointer URL, or a playlist.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
193
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (one result), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and delegate to _get_n_results().

        Raises ExtractorError on a malformed query; caps the requested
        count at _MAX_RESULTS with a warning.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            # Defensive: the regex only admits positive integers, but keep
            # the explicit check in case a subclass loosens the pattern.
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the original message ("sublclasses").
        raise NotImplementedError("This method must be implemented by subclasses")
232
233
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose pattern: group 1 covers everything before the video ID
    # (scheme, host, path, params); group 2 is the video ID itself.
    # Must always be matched with re.VERBOSE (see suitable()/_extract_id()).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forces the site language to English so scraping regexes stay stable.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Extracts the original target from age-gate redirect URLs (?next_url=...).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; unknown itags default to 'flv' at the call sites.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display dimensions; values appear to read HEIGHTxWIDTH
    # (e.g. '22': '720x1280' for 720p) -- TODO confirm the convention.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the InfoExtractor.IE_NAME property with a plain attribute.
    IE_NAME = u'youtube'
293
294     @classmethod
295     def suitable(cls, url):
296         """Receives a URL and returns True if suitable for this IE."""
297         if YoutubePlaylistIE.suitable(url): return False
298         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
299
    def report_lang(self):
        """Report the attempt to force the site language (see _LANG_URL)."""
        self.to_screen(u'Setting language')
303
    def report_login(self):
        """Report the attempt to log in to the user's account."""
        self.to_screen(u'Logging in')
307
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)
311
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download the get_video_info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
315
    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitle tracks.

        (Original docstring was a copy-paste of the video-info one.)
        """
        self.to_screen(u'%s: Checking available subtitles' % video_id)
319
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track (language + format).

        (Original docstring was a copy-paste of the video-info one.)
        """
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
323
324     def report_video_subtitles_available(self, video_id, sub_lang_list):
325         """Report available subtitles."""
326         sub_lang = ",".join(list(sub_lang_list.keys()))
327         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
328
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)
332
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for the video.

        (Original docstring said "Report extracted video URL", which is wrong.)
        """
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
336
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
340
341     def _get_available_subtitles(self, video_id):
342         self.report_video_subtitles_download(video_id)
343         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
344         try:
345             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
346         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
347             return (u'unable to download video subtitles: %s' % compat_str(err), None)
348         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
349         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
350         if not sub_lang_list:
351             return (u'video doesn\'t have subtitles', None)
352         return sub_lang_list
353
354     def _list_available_subtitles(self, video_id):
355         sub_lang_list = self._get_available_subtitles(video_id)
356         self.report_video_subtitles_available(video_id, sub_lang_list)
357
358     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
359         """
360         Return tuple:
361         (error_message, sub_lang, sub)
362         """
363         self.report_video_subtitles_request(video_id, sub_lang, format)
364         params = compat_urllib_parse.urlencode({
365             'lang': sub_lang,
366             'name': sub_name,
367             'v': video_id,
368             'fmt': format,
369         })
370         url = 'http://www.youtube.com/api/timedtext?' + params
371         try:
372             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
374             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
375         if not sub:
376             return (u'Did not fetch video subtitles', None, None)
377         return (None, sub_lang, sub)
378
379     def _extract_subtitle(self, video_id):
380         """
381         Return a list with a tuple:
382         [(error_message, sub_lang, sub)]
383         """
384         sub_lang_list = self._get_available_subtitles(video_id)
385         sub_format = self._downloader.params.get('subtitlesformat')
386         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
387             return [(sub_lang_list[0], None, None)]
388         if self._downloader.params.get('subtitleslang', False):
389             sub_lang = self._downloader.params.get('subtitleslang')
390         elif 'en' in sub_lang_list:
391             sub_lang = 'en'
392         else:
393             sub_lang = list(sub_lang_list.keys())[0]
394         if not sub_lang in sub_lang_list:
395             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
396
397         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
398         return [subtitle]
399
400     def _extract_all_subtitles(self, video_id):
401         sub_lang_list = self._get_available_subtitles(video_id)
402         sub_format = self._downloader.params.get('subtitlesformat')
403         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
404             return [(sub_lang_list[0], None, None)]
405         subtitles = []
406         for sub_lang in sub_lang_list:
407             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
408             subtitles.append(subtitle)
409         return subtitles
410
411     def _print_formats(self, formats):
412         print('Available formats:')
413         for x in formats:
414             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
415
    def _real_initialize(self):
        """Set the site language and, if credentials are available, log in
        and confirm age.

        Credentials come from the --username/--password options or, with
        --netrc, from the 'youtube' machine entry in ~/.netrc. Every
        network failure before the age gate is reported as a warning and
        aborts initialization; only the age confirmation itself raises.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language (response body is irrelevant; the request sets cookies)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Fetch the login page first to scrape the GALX/dsh form tokens.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in: replicate the hidden fields of Google's login form.
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age (POST to _AGE_URL; the response body is not inspected)
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
518
519     def _extract_id(self, url):
520         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
521         if mobj is None:
522             raise ExtractorError(u'Invalid URL: %s' % url)
523         video_id = mobj.group(2)
524         return video_id
525
526     def _real_extract(self, url):
527         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
528         mobj = re.search(self._NEXT_URL_RE, url)
529         if mobj:
530             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
531         video_id = self._extract_id(url)
532
533         # Get video webpage
534         self.report_video_webpage_download(video_id)
535         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
536         request = compat_urllib_request.Request(url)
537         try:
538             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
539         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
540             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
541
542         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
543
544         # Attempt to extract SWF player URL
545         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
546         if mobj is not None:
547             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
548         else:
549             player_url = None
550
551         # Get video info
552         self.report_video_info_webpage_download(video_id)
553         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
554             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
555                     % (video_id, el_type))
556             video_info_webpage = self._download_webpage(video_info_url, video_id,
557                                     note=False,
558                                     errnote='unable to download video info webpage')
559             video_info = compat_parse_qs(video_info_webpage)
560             if 'token' in video_info:
561                 break
562         if 'token' not in video_info:
563             if 'reason' in video_info:
564                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
565             else:
566                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
567
568         # Check for "rental" videos
569         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
570             raise ExtractorError(u'"rental" videos not supported')
571
572         # Start extracting information
573         self.report_information_extraction(video_id)
574
575         # uploader
576         if 'author' not in video_info:
577             raise ExtractorError(u'Unable to extract uploader name')
578         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
579
580         # uploader_id
581         video_uploader_id = None
582         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
583         if mobj is not None:
584             video_uploader_id = mobj.group(1)
585         else:
586             self._downloader.report_warning(u'unable to extract uploader nickname')
587
588         # title
589         if 'title' not in video_info:
590             raise ExtractorError(u'Unable to extract video title')
591         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
592
593         # thumbnail image
594         if 'thumbnail_url' not in video_info:
595             self._downloader.report_warning(u'unable to extract video thumbnail')
596             video_thumbnail = ''
597         else:   # don't panic if we can't find it
598             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
599
600         # upload date
601         upload_date = None
602         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
603         if mobj is not None:
604             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
605             upload_date = unified_strdate(upload_date)
606
607         # description
608         video_description = get_element_by_id("eow-description", video_webpage)
609         if video_description:
610             video_description = clean_html(video_description)
611         else:
612             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
613             if fd_mobj:
614                 video_description = unescapeHTML(fd_mobj.group(1))
615             else:
616                 video_description = u''
617
618         # subtitles
619         video_subtitles = None
620
621         if self._downloader.params.get('writesubtitles', False):
622             video_subtitles = self._extract_subtitle(video_id)
623             if video_subtitles:
624                 (sub_error, sub_lang, sub) = video_subtitles[0]
625                 if sub_error:
626                     self._downloader.report_error(sub_error)
627
628         if self._downloader.params.get('allsubtitles', False):
629             video_subtitles = self._extract_all_subtitles(video_id)
630             for video_subtitle in video_subtitles:
631                 (sub_error, sub_lang, sub) = video_subtitle
632                 if sub_error:
633                     self._downloader.report_error(sub_error)
634
635         if self._downloader.params.get('listsubtitles', False):
636             sub_lang_list = self._list_available_subtitles(video_id)
637             return
638
639         if 'length_seconds' not in video_info:
640             self._downloader.report_warning(u'unable to extract video duration')
641             video_duration = ''
642         else:
643             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
644
645         # token
646         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
647
648         # Decide which formats to download
649         req_format = self._downloader.params.get('format', None)
650
651         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
652             self.report_rtmp_download()
653             video_url_list = [(None, video_info['conn'][0])]
654         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
655             url_map = {}
656             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
657                 url_data = compat_parse_qs(url_data_str)
658                 if 'itag' in url_data and 'url' in url_data:
659                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
660                     if not 'ratebypass' in url: url += '&ratebypass=yes'
661                     url_map[url_data['itag'][0]] = url
662
663             format_limit = self._downloader.params.get('format_limit', None)
664             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
665             if format_limit is not None and format_limit in available_formats:
666                 format_list = available_formats[available_formats.index(format_limit):]
667             else:
668                 format_list = available_formats
669             existing_formats = [x for x in format_list if x in url_map]
670             if len(existing_formats) == 0:
671                 raise ExtractorError(u'no known formats available for video')
672             if self._downloader.params.get('listformats', None):
673                 self._print_formats(existing_formats)
674                 return
675             if req_format is None or req_format == 'best':
676                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
677             elif req_format == 'worst':
678                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
679             elif req_format in ('-1', 'all'):
680                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
681             else:
682                 # Specific formats. We pick the first in a slash-delimeted sequence.
683                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
684                 req_formats = req_format.split('/')
685                 video_url_list = None
686                 for rf in req_formats:
687                     if rf in url_map:
688                         video_url_list = [(rf, url_map[rf])]
689                         break
690                 if video_url_list is None:
691                     raise ExtractorError(u'requested format not available')
692         else:
693             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
694
695         results = []
696         for format_param, video_real_url in video_url_list:
697             # Extension
698             video_extension = self._video_extensions.get(format_param, 'flv')
699
700             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
701                                               self._video_dimensions.get(format_param, '???'))
702
703             results.append({
704                 'id':       video_id,
705                 'url':      video_real_url,
706                 'uploader': video_uploader,
707                 'uploader_id': video_uploader_id,
708                 'upload_date':  upload_date,
709                 'title':    video_title,
710                 'ext':      video_extension,
711                 'format':   video_format,
712                 'thumbnail':    video_thumbnail,
713                 'description':  video_description,
714                 'player_url':   player_url,
715                 'subtitles':    video_subtitles,
716                 'duration':     video_duration
717             })
718         return results
719
720
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and submit the family-filter form so
        that age-restricted videos are served in later requests."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the video URL, title and uploader for a metacafe page.

        Returns a single-element list with the info dictionary, or a
        url_result delegating to the YouTube extractor for yt-* ids.
        Raises ExtractorError when any required field cannot be found.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # The last three characters of the media URL are its extension.
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: the media data lives inside the "flashvars" value.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # _download_webpage returns text, so regex groups are already text
        # strings; the previous .decode('utf-8') calls raised
        # AttributeError on Python 3 (str has no .decode) and were removed.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
816
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract video URL, title, uploader and date for a Dailymotion page."""
        # The video id is the first path component, stripped of any title
        # suffix ("_...") and query string ("?...").
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Request the page with the family filter disabled so that
        # age-gated videos are served as well.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the first (i.e. best) quality key present in flashvars.
        max_quality = None
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self.to_screen(u'Using %s' % quality_key)
                break
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # Uploader: try the regular owner markup first, then the official
        # author markup; warn (but continue) when neither matches.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = official_match.group(1)

        # The page shows DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
891
892
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract video info from a photobucket page.

        First tries the JSON blob embedded in the page javascript; falls
        back to the older <link rel="video_src"> markup. Raises
        ExtractorError when neither path yields a media URL.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # The webpage is already a text string, so the groups are text;
        # the previous .decode('utf-8') calls raised AttributeError on
        # Python 3 (str has no .decode) and were removed.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
954
955
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract a Yahoo Screen video.

        Two code paths exist: when the page exposes a Media CONTENT_ID the
        YQL JSON API is queried; otherwise the legacy cosmos MRSS endpoint
        is scraped with regexes. Raises ExtractorError on failure.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # The feed gives MM/DD/YYYY; normalize to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # Check the match BEFORE dereferencing it: the original code
            # called m_rest.group() first, so a failed match raised
            # AttributeError instead of the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP wrapper before parsing.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1023
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract a Vimeo video from its embedded config JSON.

        Raises ExtractorError for invalid URLs, embed-restricted videos,
        or when no known codec/quality combination is found.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Narrowed from a bare "except:", which also swallowed
            # KeyboardInterrupt and SystemExit.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Take the first available entry, best quality first.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1125
1126
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body.

        Raises ExtractorError on network failure or an invalid URL.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and search it with *regex* (compiled with *regexFlags*).

        matchTuples is a list of (group_index, key, error_message) tuples;
        each matched group is stored under *key* in the returned dict.
        Raises ExtractorError when the regex does not match at all or when
        any required group is None (using that tuple's error message).
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream javascript to find the rtmp URL.

        NOTE(review): video_url is computed but never returned, so callers
        (see _real_extract) get None for live streams — looks like dead or
        unfinished code; confirm before relying on live-stream support.
        """
        # The language code is embedded in the URL path.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through two XML indirections and return
        the info dictionary for the HD stream."""
        # The language code is embedded in the URL path.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus7 extraction path based on
        whether the URL's last component matches _LIVE_URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns nothing, so live
            # URLs currently yield no result.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1246
1247
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that the generic extractor is in use (suppressed in tests).
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Returns False when *url* does not redirect. Raises ExtractorError
        for unsupported URL protocols.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Use HEAD so only headers, not the body, are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # Same URL back means no redirect happened.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Best-effort extraction: follow redirects, then look for common
        embedded-player patterns (JW Player / SWFObject) in the page."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # The extension is whatever follows the last dot of the file name;
        # the id is the file name without it.
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fixed copy-paste bug: this failure concerns the uploader
            # (domain name), not the title.
            raise ExtractorError(u'Unable to extract uploader')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1381
1382
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Each GData API page returns at most 50 results; keep paging until
        # we have enough, or the API reports fewer items than requested.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids += [video['id'] for video in api_response['items']]

            # The API may have fewer results than the caller asked for
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us past n; trim the surplus
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1425
1426
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # Trim surplus results from the final page so at most n
                # entries are returned (consistent with the other search IEs,
                # e.g. YoutubeSearchIE, which truncate to n).
                res['entries'] = res['entries'][:n]
                return res
1457
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # Bug fix: the original referenced the loop variable after the
            # loop, raising NameError when a page carried no results.
            reached_limit = False
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    reached_limit = True
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Bug fix: skip results whose URL cannot be located
                    # instead of crashing with AttributeError on mobj.group().
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if reached_limit or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1491
1492
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Return True if this IE can handle the given URL.

        _VALID_URL is written in verbose mode, so the flag must be
        passed explicitly here.
        """
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is captured by either alternative of the pattern
        playlist_id = mobj.group(1) or mobj.group(2)

        # Page through the GData API, gathering (position, video URL) pairs
        entries = []
        for page_num in itertools.count(1):
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            entries.extend(
                (entry['yt$position']['$t'], entry['content']['src'])
                for entry in response['feed']['entry']
                if 'content' in entry)

            # A short page means we have reached the end of the playlist
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break

        # Order by playlist position, then keep only the URLs
        ordered_urls = [src for _position, src in sorted(entries)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1558
1559
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, in order of first
        appearance and without duplicates."""
        found = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        """Collect all videos of a channel and return them as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)

        # The first page is plain HTML
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Any further pages come from the JSON-based channel_ajax endpoint
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                # The "load more" widget disappears on the last page
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1617
1618
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Return every upload of a YouTube user as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE video ids,
        # so keep requesting pages until one comes back short - that page
        # must be the last one.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect this page's ids, preserving order, dropping duplicates
            ids_in_page = []
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = id_match.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page that is not "full" is the last one; stop querying
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title=username)]
1675
1676
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return all videos of a blip.tv user as a playlist.

        Raises ExtractorError when the URL does not match or when the
        numeric user id cannot be found on the user page.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Bug fix: previously this crashed with AttributeError on pages
            # without a users id; report a proper extraction error instead.
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Bug fix: unescape BEFORE the duplicate check. The original
                # compared the raw id but stored the unescaped one, so two
                # differently-escaped forms of the same id both got through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(blip_url, 'BlipTV') for blip_url in urls]
        return [self.playlist_result(url_entries, playlist_title=username)]
1735
1736
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title for a depositfiles file.

        Raises ExtractorError when the page cannot be retrieved, when the
        site reports a download restriction, or when URL/title extraction
        fails.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice.
                # Fix: use a raw string - '\s' in a non-raw literal is an
                # invalid escape sequence (DeprecationWarning on Python 3).
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): the .decode() calls below assume Python 2 byte
        # strings (webpage was never decoded); they would fail on a
        # Python 3 str - confirm the target runtime before changing.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1784
1785
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come either from --username/--password or, with
        --netrc, from the 'facebook' machine entry in ~/.netrc. Login
        failures only emit warnings; extraction proceeds anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials; skip the login step entirely
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login form, the login failed.
            # (Typo fix in the message: 'exceded' -> 'exceeded'.)
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract URL, title, duration and thumbnail of a Facebook video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two literal code fragments
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1882
1883
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Any blip.tv path; group(1) is the path part, used for status reporting
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension of a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        /play/ URLs are first resolved to their canonical 'a-<file_id>'
        form and extraction restarts. Otherwise metadata is requested via
        the site's JSON API - unless the server answers with the media
        file itself ("direct download"), in which case the URL is
        returned as-is together with the open urlhandle.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose fragment carries the file id;
        # rebuild the canonical URL from it and recurse.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Request JSON metadata, pretending to be iTunes (see also the
        # 'user_agent' field in the returned info dict below)
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode() assumes a Python 2 byte string;
                # it would fail on a Python 3 str - confirm target runtime
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # hand the open handle to the downloader for reuse
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # Reuse the handle already opened by the probe above
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or given directly
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp like '10-31-11 08:29AM' -> 'YYYYMMDD'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    # must match the User-Agent used for the metadata request
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
1977
1978
1979 class MyVideoIE(InfoExtractor):
1980     """Information Extractor for myvideo.de."""
1981
1982     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1983     IE_NAME = u'myvideo'
1984
1985 #     Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
1986 #     Copyright (C) 2013 Tristan Fischer (sphere@dersphere.de) - GPLv3
    def __rc4crypt(self,data, key):
        """Decrypt/encrypt *data* with RC4 using *key*, returning a str.

        Standard RC4: the first loop is the key-scheduling algorithm
        (KSA) initializing the 256-entry permutation box from the key;
        the second loop is the PRGA, XORing the keystream with each
        input byte.

        # assumes *data* is a Python 3 bytes object (iteration yields
        # ints) and *key* is a str - the commented-out line below was
        # the Python 2 str variant; TODO confirm target runtime.
        """
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = []
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
#            out.append(chr(ord(char) ^ box[(box[x] + box[y]) % 256]))
            out.append(chr(char ^ box[(box[x] + box[y]) % 256]))
        return ''.join(out)
2003
2004     def __md5(self,s):
2005         return hashlib.md5(s).hexdigest()
2006
2007     def _real_extract(self,url):
2008         mobj = re.match(self._VALID_URL, url)
2009         if mobj is None:
2010             raise ExtractorError(u'invalid URL: %s' % url)
2011
2012         video_id = mobj.group(1)
2013
2014         GK = (
2015           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2016           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2017           b'TnpsbA0KTVRkbU1tSTRNdz09'
2018         )
2019
2020         # Get video webpage
2021         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2022         webpage = self._download_webpage(webpage_url, video_id)
2023
2024         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2025         if mobj is not None:
2026             self.report_extraction(video_id)
2027             video_url = mobj.group(1) + '.flv'
2028
2029             mobj = re.search('<title>([^<]+)</title>', webpage)
2030             if mobj is None:
2031                 raise ExtractorError(u'Unable to extract title')
2032             video_title = mobj.group(1)
2033
2034             mobj = re.search('[.](.+?)$', video_url)
2035             if mobj is None:
2036                 raise ExtractorError(u'Unable to extract extention')
2037             video_ext = mobj.group(1)
2038
2039             return [{
2040                 'id':       video_id,
2041                 'url':      video_url,
2042                 'uploader': None,
2043                 'upload_date':  None,
2044                 'title':    video_title,
2045                 'ext':      u'flv',
2046             }]
2047
2048         # try encxml
2049         params = {}
2050         encxml = ''
2051         sec = re.search('var flashvars={(.+?)}', webpage).group(1)
2052         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2053             if not a == '_encxml':
2054                 params[a] = b
2055             else:
2056                 encxml = compat_urllib_parse.unquote(b)
2057         if not params.get('domain'):
2058             params['domain'] = 'www.myvideo.de'
2059         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2060         if 'flash_playertype=MTV' in xmldata_url:
2061             self._downloader.report_warning(u'avoiding MTV player')
2062             xmldata_url = (
2063                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2064                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2065             ) % video_id
2066
2067         # get enc data
2068         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2069         enc_data_b = binascii.unhexlify(enc_data)
2070         sk = self.__md5( 
2071             base64.b64decode(base64.b64decode(GK)) + 
2072             self.__md5( 
2073                 str(video_id).encode('utf-8') 
2074             ).encode('utf-8') 
2075         )
2076         dec_data = self.__rc4crypt(enc_data_b, sk)
2077
2078         # extracting infos
2079         self.report_extraction(video_id)
2080
2081         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2082         if mobj is None:
2083             raise ExtractorError(u'unable to extract rtmpurl')
2084         video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
2085         if 'myvideo2flash' in video_rtmpurl:
2086             self._downloader.report_warning(u'forcing RTMPT ...')
2087             video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
2088
2089         # extract non rtmp videos
2090         if (video_rtmpurl is None) or (video_rtmpurl == ''):
2091             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2092             if mobj is None:
2093                 raise ExtractorError(u'unable to extract url')
2094             video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2095
2096         mobj = re.search('source=\'(.*?)\'', dec_data)
2097         if mobj is None:
2098             raise ExtractorError(u'unable to extract swfobj')
2099         video_file     = compat_urllib_parse.unquote(mobj.group(1))
2100
2101 #        mobj = re.search('path=\'(.*?)\'', dec_data)
2102 #        if mobj is None:
2103 #            raise ExtractorError(u'unable to extract filepath')
2104 #        video_filepath = mobj.group(1)
2105
2106         if not video_file.endswith('f4m'):
2107             ppath, prefix = video_file.split('.')
2108             video_playpath = '%s:%s' % (prefix, ppath)
2109             video_hls_playlist = ''
2110         else:
2111             video_playpath = ''
2112             video_hls_playlist = (
2113                 video_filepath + video_file
2114             ).replace('.f4m', '.m3u8')
2115
2116         mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
2117         if mobj is None:
2118             raise ExtractorError(u'unable to extract swfobj')
2119         video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
2120
2121         mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
2122         if mobj is None:
2123             raise ExtractorError(u'unable to extract title')
2124         video_title = mobj.group(1)
2125
2126         return [{
2127             'id':                 video_id,
2128             'url':                video_rtmpurl,
2129             'tc_url':             video_rtmpurl,
2130             'uploader':           None,
2131             'upload_date':        None,
2132             'title':              video_title,
2133             'ext':                u'flv',
2134             'play_path':          video_playpath,
2135             'video_file':         video_file,
2136 #            'file_path':          video_filepath,
2137             'video_hls_playlist': video_hls_playlist,
2138             'player_url':         video_swfobj,
2139         }]
2140
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # Named groups: 'shortname' for abbreviation URLs, 'episode' for full
    # episodes, 'clip'/'clipID'/'cntitle'/'tdstitle' for individual clips.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate labels, lowest quality last; used only for --list-formats
    # output together with the two dicts below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate label -> container extension (always mp4 here)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate label -> display resolution, shown in --list-formats
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE, which the
        # default implementation does not apply.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print each available format with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part of the episode/clip at url."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Abbreviation URLs (":tds" etc.) redirect to the show's
        # full-episodes page; rebuild the URL and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "newest episode": the site will
            # redirect us to the concrete episode page below.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Follow the redirect to learn which episode "newest" actually is.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # Find the mgid-style URI embedded in the player parameters.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The MRSS index lists one <item> per part of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like "...:<show>.com:...:<shortMediaId>"
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part config XML lists the available renditions
            # (bitrate + RTMP source URL).
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL to the equivalent progressive-download
            # HTTP URL on the llnwd.net CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2307
2308
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the video by reading the player's configuration file."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        webpage = self._download_webpage(url, show_name)

        # Description, thumbnail and player URL all live in <meta> tags.
        description = unescapeHTML(
            re.search('<meta name="description" content="([^"]*)"', webpage).group(1))
        thumbnail = unescapeHTML(
            re.search('<meta property="og:image" content="([^"]*)"', webpage).group(1))
        player_url = unescapeHTML(
            re.search('<meta property="og:video" content="([^"]*)"', webpage).group(1))
        # The player URL carries the (urlencoded) config file location.
        config_url = compat_urllib_parse.unquote(
            re.search('config=(.*)$', player_url).group(1))

        config_json = self._download_webpage(config_url, show_name,
                                             u'Downloading configuration',
                                             u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # The second playlist entry holds the actual video.
        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumbnail,
            'description': description,
            'player_url': player_url,
        }]
2362
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract the video via the moogaloop metadata XML and the
        Adobe HDS (f4m) manifest it points at."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        # First request: metadata XML with title, description, thumbnail
        # and the manifest location.
        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Second request: the f4m manifest, which yields the fragment base
        # name and the real video id used to build the final URL.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # This message previously said 'video info XML' (copy-paste from
            # the first fetch); name the manifest so failures are tellable apart.
            raise ExtractorError(u'Unable to download video manifest XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2424
2425
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from the video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The video URL is URL-encoded inside the flashvars.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the page <title> element.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        # Thumbnail: the whole match (group 0) is the image URL.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2475
2476
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Announce that the permalink is being resolved to a track id."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve the permalink via the API, then fetch the stream URLs."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The URL path is "<uploader>/<slug>".
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # resolve.json maps the permalink to the track's metadata record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2533
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Announce that the set permalink is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve the set permalink, then extract every track in it."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The URL path is "<uploader>/sets/<slug>".
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            # Report every API error, then give up on the whole set.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2596
2597
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Build the rtmpe URL from the base64-encoded path in the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in a JS variable.
        url_match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(url_match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        title_match = re.search(r'contentTitle = "(.*?)";', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        # Description is optional; fall back to a placeholder.
        desc_match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = desc_match.group(1) if desc_match is not None else u'No description available.'

        # Derive id and extension from the final path component.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2644
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format name -> {bitrate: url list}, or directly to a
        url list when no bitrate information is present (hence the
        TypeError fallback).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the cloudcast API and pick a working stream URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE: re.match on a text URL already yields text groups; the old
        # .decode('utf-8') calls here crashed on Python 3 (str has no decode).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # No explicit request: take the first format with a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # check_urls returns None when every candidate URL is dead; fail
        # with a clear error instead of an AttributeError on .split below.
        if file_url is None:
            raise ExtractorError(u'Unable to find a working stream URL')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2749
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: the site root, a course page
    # (?course=...) and a single video page (?course=...&video=...).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on the URL type.

        A video URL yields a single-entry result list; a course URL or
        the site root yields a flattened list built by recursively
        calling self.extract() on every referenced page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the video.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                # <videoFile> is relative to the course's videos/ folder.
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Fall back to the course id when the page has no <h1> title.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect each video page link once, preserving page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each referenced URL matches the video branch.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect each course page link once, then recurse into the
            # course branch above for every one of them.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2850
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a music video from an mtv.com page.

        Reads the mtv_vt (song), mtv_an (performer) and mtvn_uri meta
        tags plus the default playlist id from the page, queries the
        mediaGen service for renditions, and returns the highest
        quality one.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE: _download_webpage already returns decoded text, so no
        # extra .decode('iso-8859-1') is applied here (calling .decode
        # on a str would raise AttributeError on Python 3).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Fixed message: previously read "Unable to mtvn_uri".
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Guard against an IndexError on renditions[-1] below.
            raise ExtractorError(u'Unable to find any renditions')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # The MIME type ("video/mp4") yields the file extension.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2919
2920
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: ms timestamp + two random ints."""
        millis = int(time.time() * 1000)
        rnd_a = random.randint(1000, 1998)
        rnd_b = random.randint(1000, 9999)
        return "%d%d%d" % (millis, rnd_a, rnd_b)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle Youku's alphabet with a seed-keyed linear congruential step."""
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        while alphabet:
            state = (state * 211 + 30031) % 65536
            pos = int(math.floor(state / 65536 * len(alphabet)))
            # Every alphabet character is unique, so popping by position
            # matches the original remove-by-value behaviour exactly.
            shuffled.append(alphabet.pop(pos))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index list into the real file id."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(part)] for part in fileId.split('*') if part)

    def _real_extract(self, url):
        """Return one info dict per flv/mp4 segment of the video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            available = list(entry['streamfileids'].keys())

            if requested is None or requested == 'best':
                # Prefer HD when offered; the container is flv either way.
                format = 'hd2' if 'hd2' in available else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the decoded file id encode the segment
        # number; substitute them per segment when building each URL.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3013
3014
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail of an xnxx video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        def search_or_die(pattern, field):
            # All three fields are mandatory; raise with a message that
            # names the missing field when one is absent from the page.
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(u'Unable to extract %s' % field)
            return found.group(1)

        video_url = compat_urllib_parse.unquote(search_or_die(self.VIDEO_URL_RE, u'video url'))
        video_title = search_or_die(self.VIDEO_TITLE_RE, u'video title')
        video_thumbnail = search_or_die(self.VIDEO_THUMB_RE, u'video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3058
3059
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the extracted upload date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the extracted uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the extracted title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract a video from a Google+ post.

        Downloads the post page for date/uploader/title, follows the
        photo page the player points at, and returns the
        highest-resolution stream listed there.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title: the first line of the Description meta tag
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes as (resolution, url) pairs
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort by resolution NUMERICALLY. The captured resolution is a
        # string, and the previous plain lexicographic sort ranked e.g.
        # '720' above '1080', picking a lower-quality stream.
        links = sorted(mobj, key=lambda link: int(link[0]))

        # The last entry is the highest resolution; keep only its URL.
        video_url = links[-1][1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3169
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Extract an NBA video.

        The MP4 URL is derived directly from the page path; the page
        itself is only used for title, date and description.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First capture group of rexp in the page, unescaped, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was mistyped 'uploader_date', which is not a
            # recognized info field (see the field list in the module
            # docstring) and was silently ignored.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3204
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size of the archive API; also the upper bound per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns (total_items_on_page, infos); clips without a
        video_file_url are counted but skipped, so the caller must use
        the count (not len(infos)) to decide whether more pages exist.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On error the API returns an object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-like; keep YYYYMMDD from its date part.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract a channel archive (paged), a chapter, or a single video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole-channel URL: page through the archive API below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The chapter page embeds the id of the archive it belongs to.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: `a` keeps the matching <archive> element after break.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single broadcast URL: one unpaged API request.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3337
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract the MP4 source, title and description of a funnyordie.com video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        source = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not source:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source.group('url'))

        # Prefer the player heading; fall back to the document <title>.
        heading = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not heading:
            heading = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not heading:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(heading.group('title'))

        # The description is optional.
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3375
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose syntax, so the default suitable()
        # (which compiles without re.VERBOSE) cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return every trailer on a game's video page as one playlist."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = match.group('gameID')
        # Pre-filled age-gate parameters let us skip the age-check form.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # Movies, their display names and their thumbnails appear in the
        # same order on the page, so the three iterators are zipped.
        movie_iter = re.finditer(r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},", webpage)
        name_iter = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_iter = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)

        videos = []
        for movie, name, thumb in zip(movie_iter, name_iter, thumb_iter):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3420
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract a recorded ustream.tv video."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Recordings are served from a fixed CDN path keyed by video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        try:
            # A failed re.search returns None, whose .group raises
            # AttributeError; all three lookups funnel into one error.
            title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
            uploader_match = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
                                       webpage, re.DOTALL)
            uploader = unescapeHTML(uploader_match.group('uploader').strip())
            thumb = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage).group('thumb')
        except AttributeError:
            raise ExtractorError(u'Unable to extract info')
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumb,
        }
3450
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the flash player's file URL, title and thumbnail."""
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        source = re.search(r'so\.addVariable\("file","(.*?)"\)', page)
        if source is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = source.group(1)
        # Container sniffing: anything that is not mp4 is served as flv.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", page)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', page)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3499
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract show metadata from the inline gon.show JSON blob."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        blob = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not blob:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(blob.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the Akamai URL.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3534
3535
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; this cookie bypasses it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; warn instead of failing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional; warn instead of failing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes resolution and bitrate, e.g. "480p_370k_8004515".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this previously tested `result` (the last regex match,
            # which is always non-None here) instead of `format`, so a request
            # for an unavailable format returned [None] instead of raising.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3650
3651
3652
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the video id and the (display) title come from the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the message previously said "video title" although it
            # is the upload date that could not be extracted here.
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3691
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = url_match.group('videoid')

        # Fetch the public watch page; it holds the title and a link to the
        # embed page where the actual media URL is set up.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        # The embed page uses a numeric id which may differ from the slug.
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flv URL is passed to the Flash player via addVariable("file", ...).
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3736
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (returned as a track list)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as "PAGE.mix = {...};".
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(mix_match.group(1))

        # A random session id is all the play/next API needs.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        # Walk the play/next API until it reports the last track of the mix.
        for index in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track = api_data[u'set']['track']
            entries.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return entries
3780
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs are derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))

        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }]
3804
class TEDIE(InfoExtractor):
    # Matches both single talks and playlists; group names decide which
    # branch _real_extract takes. Requires re.VERBOSE (see suitable()).
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag, which the
        # base-class matcher does not apply.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: single talk vs. playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): [.\s] in these patterns matches a literal dot or
        # whitespace (not "any char") — presumably intended as separators
        # between the <li> attributes; confirm against the live markup.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Pair each talk's id match with its name/link match positionally;
        # each talk URL is delegated back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # Pull the numeric id and mediaSlug out of the inline talkDetails JS.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3883
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch used the undefined name `ext`, raising
            # NameError whenever <format_id> was absent; fall back to the
            # file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional in the XML.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3937
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document lists the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> entry is the best quality; take its file name
        # and duration.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        return [{
            'id': video_id,
            'url': video_url,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3970
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = url_match.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The player config carries the direct media URL in a `file:` entry.
        file_match = re.search(r'file: "(.*?)",', webpage)
        if not file_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = file_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        # Strip the site-name prefix LiveLeak puts in og:title.
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
4017
class ARDIE(InfoExtractor):
    # Information extractor for the ARD Mediathek / daserste.de.
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # Each addMediaStream(...) call in the page JS describes one stream:
    # media type, quality level, an RTMP URL (may be empty) and a video URL.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # NOTE(review): `assert` is stripped under python -O, and an
            # AssertionError here would mask the friendlier error below;
            # it only documents the expected age-restriction marker.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            # play_path tells rtmpdump which stream to fetch on the server.
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4056
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on *.tumblr.com."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The media URL is embedded in escaped (\x22-quoted) player markup.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # BUG FIX: the message previously read "No video founded".
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4090
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # BUG FIX: the message previously read "No free songs founded".
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # `video_id` renamed from `id`, which shadowed the builtin.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
4136
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = url_match.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The direct mp4 URL sits in an HTML5 <source> tag.
        source_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if source_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = source_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4167         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # All metadata comes from the player's MRSS feed, not the HTML page.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        media_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if media_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      media_match.group(1),
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4196
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Rebuild the canonical page URL from the numeric id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        file_match = re.search(r'\'file\': "(http://mobile-media\.howcast\.com/\d+\.mp4)"', webpage)
        if file_match is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = file_match.group(1)

        # og:title / description may use either double or single quotes.
        title_match = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1) or title_match.group(2)

        desc_match = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if desc_match is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = desc_match.group(1) or desc_match.group(2)

        thumb_match = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumb_match.group(1),
        }]
4240
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        # _VALID_URL has already matched via suitable(), so re.match cannot fail here.
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        # Helper: run a mandatory regex search and abort with a clear error on failure.
        def _require(pattern, errmsg, flags=0):
            found = re.search(pattern, webpage, flags)
            if found is None:
                raise ExtractorError(errmsg)
            return found

        stream_match = _require(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            u'Unable to extract video URL')
        title_match = _require(
            r'<meta property="og:title" content="(.+?)"',
            u'Unable to extract title')
        # Strip any query string off the thumbnail URL via the optional second group.
        thumb_match = _require(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            u'Unable to extract thumbnail')
        # Uploader name lives in an <h2> inside the user div; DOTALL spans newlines.
        uploader_match = _require(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            u'Unable to extract uploader', re.DOTALL)

        return [{
            'id':        video_id,
            'url':       stream_match.group(1),
            'ext':       'mp4',
            'title':     title_match.group(1),
            'thumbnail': thumb_match.group(1),
            'uploader':  uploader_match.group(1),
        }]
4283
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # GenericIE must stay last: it is the catch-all fallback.
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]
4345
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention at module level.
    class_name = ie_name + 'IE'
    return globals()[class_name]