BlipTV: accept URLs in the format http://a.blip.tv/api.swf#{id} (closes #857)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) receives a URL and pulls out everything
    needed to download the video(s) it refers to: the real media URL, the
    title, the uploader and so on.  The result is a dictionary which is
    handed to the FileDownloader, which in turn may download the video to
    the file system or act on it in some other way.

    Mandatory fields in every dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses re-define the _real_initialize() and _real_extract()
    methods and define a _VALID_URL regexp; normally they are also added
    to the list of extractors.  _real_extract() must return a *list* of
    information dictionaries as described above.  Broken IEs should set
    the _WORKING attribute to False so users are warned and the test
    suite skips them.
    """

    # Class-level defaults; instances shadow _ready and _downloader.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor and attach the (optional) downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True when this IE can handle the given URL."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Return the _WORKING flag (False marks a known-broken IE)."""
        return cls._WORKING

    def initialize(self):
        """Run _real_initialize() exactly once (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if necessary, then return the list of info dicts for url."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader used for reporting and parameter access."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # By convention extractor classes are named FooIE; drop the suffix.
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note=None prints the default download message, note=False stays
        silent, and any other value is shown as-is.  Network failures are
        re-raised as ExtractorError, using errnote when given.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        if errnote is None:
            errnote = u'Unable to download webpage'
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """Return a (page content as string, URL handle) tuple."""
        url_handle = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset declared in the Content-Type header, if any.
        charset = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
                           url_handle.headers.get('Content-Type', ''))
        encoding = charset.group(1) if charset else 'utf-8'
        raw_page = url_handle.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                page_url = url_or_request.get_full_url()
            except AttributeError:
                # A plain string URL was passed instead of a Request object.
                page_url = url_or_request
            self.to_screen(u'Dumping request to ' + page_url)
            self._downloader.to_screen(base64.b64encode(raw_page).decode('ascii'))
        return (raw_page.decode(encoding, 'replace'), url_handle)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Return the data of the page as a string."""
        content, _ = self._download_webpage_handle(url_or_request, video_id, note, errnote)
        return content

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'."""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Helpers for #608: they stamp the proper '_type' on the result dict.
    def video_result(self, video_info):
        """Mark video_info as a plain video result and return it."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Return a reference to a page that should itself be processed."""
        # TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Return a playlist result wrapping the given entries."""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
193
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (one result), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True when url matches this extractor's search scheme."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search query prefix and fetch that many results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare key -> single best result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum instead of failing.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the error message: "sublclasses" -> "subclasses".
        raise NotImplementedError("This method must be implemented by subclasses")
232
233
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose-mode regex (always matched with re.VERBOSE); group 1 is the
    # optional URL prelude, group 2 the video ID (see _extract_id).  The
    # '#' comments inside the raw string are part of the pattern itself.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # URL fetched by _real_initialize to force the English interface.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Pulls the next_url parameter out of age-verification redirect URLs
    # (applied in _real_extract before extracting the video ID).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Machine name looked up in ~/.netrc for stored credentials.
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension used when naming the downloaded file
    # (anything missing from this table defaults to 'flv' in _real_extract).
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display-size string shown by --list-formats and in the
    # 'format' field (looks like height x width — TODO confirm ordering).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Overrides the IE_NAME property inherited from InfoExtractor with a
    # plain attribute.
    IE_NAME = u'youtube'
293
294     @classmethod
295     def suitable(cls, url):
296         """Receives a URL and returns True if suitable for this IE."""
297         if YoutubePlaylistIE.suitable(url): return False
298         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
299
    # Thin reporting helpers: each just prefixes a status line via to_screen.
    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to list the available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
340
341     def _get_available_subtitles(self, video_id):
342         self.report_video_subtitles_download(video_id)
343         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
344         try:
345             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
346         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
347             return (u'unable to download video subtitles: %s' % compat_str(err), None)
348         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
349         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
350         if not sub_lang_list:
351             return (u'video doesn\'t have subtitles', None)
352         return sub_lang_list
353
354     def _list_available_subtitles(self, video_id):
355         sub_lang_list = self._get_available_subtitles(video_id)
356         self.report_video_subtitles_available(video_id, sub_lang_list)
357
358     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
359         """
360         Return tuple:
361         (error_message, sub_lang, sub)
362         """
363         self.report_video_subtitles_request(video_id, sub_lang, format)
364         params = compat_urllib_parse.urlencode({
365             'lang': sub_lang,
366             'name': sub_name,
367             'v': video_id,
368             'fmt': format,
369         })
370         url = 'http://www.youtube.com/api/timedtext?' + params
371         try:
372             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
374             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
375         if not sub:
376             return (u'Did not fetch video subtitles', None, None)
377         return (None, sub_lang, sub)
378
379     def _extract_subtitle(self, video_id):
380         """
381         Return a list with a tuple:
382         [(error_message, sub_lang, sub)]
383         """
384         sub_lang_list = self._get_available_subtitles(video_id)
385         sub_format = self._downloader.params.get('subtitlesformat')
386         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
387             return [(sub_lang_list[0], None, None)]
388         if self._downloader.params.get('subtitleslang', False):
389             sub_lang = self._downloader.params.get('subtitleslang')
390         elif 'en' in sub_lang_list:
391             sub_lang = 'en'
392         else:
393             sub_lang = list(sub_lang_list.keys())[0]
394         if not sub_lang in sub_lang_list:
395             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
396
397         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
398         return [subtitle]
399
400     def _extract_all_subtitles(self, video_id):
401         sub_lang_list = self._get_available_subtitles(video_id)
402         sub_format = self._downloader.params.get('subtitlesformat')
403         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
404             return [(sub_lang_list[0], None, None)]
405         subtitles = []
406         for sub_lang in sub_lang_list:
407             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
408             subtitles.append(subtitle)
409         return subtitles
410
411     def _print_formats(self, formats):
412         print('Available formats:')
413         for x in formats:
414             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
415
    def _real_initialize(self):
        """Set the interface language and, when credentials are available,
        log in to YouTube and confirm age.  Each step is a network request
        and the order matters: language -> login page -> login POST -> age
        confirmation.  Most failures only warn and abort initialization."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Fetch the login page to scrape the hidden form tokens below.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # GALX and dsh are hidden <input> values on the login page that
        # must be echoed back in the login POST.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Unlike the steps above, a failed age confirmation is fatal.
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
518
519     def _extract_id(self, url):
520         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
521         if mobj is None:
522             raise ExtractorError(u'Invalid URL: %s' % url)
523         video_id = mobj.group(2)
524         return video_id
525
526     def _real_extract(self, url):
527         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
528         mobj = re.search(self._NEXT_URL_RE, url)
529         if mobj:
530             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
531         video_id = self._extract_id(url)
532
533         # Get video webpage
534         self.report_video_webpage_download(video_id)
535         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
536         request = compat_urllib_request.Request(url)
537         try:
538             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
539         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
540             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
541
542         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
543
544         # Attempt to extract SWF player URL
545         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
546         if mobj is not None:
547             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
548         else:
549             player_url = None
550
551         # Get video info
552         self.report_video_info_webpage_download(video_id)
553         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
554             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
555                     % (video_id, el_type))
556             video_info_webpage = self._download_webpage(video_info_url, video_id,
557                                     note=False,
558                                     errnote='unable to download video info webpage')
559             video_info = compat_parse_qs(video_info_webpage)
560             if 'token' in video_info:
561                 break
562         if 'token' not in video_info:
563             if 'reason' in video_info:
564                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
565             else:
566                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
567
568         # Check for "rental" videos
569         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
570             raise ExtractorError(u'"rental" videos not supported')
571
572         # Start extracting information
573         self.report_information_extraction(video_id)
574
575         # uploader
576         if 'author' not in video_info:
577             raise ExtractorError(u'Unable to extract uploader name')
578         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
579
580         # uploader_id
581         video_uploader_id = None
582         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
583         if mobj is not None:
584             video_uploader_id = mobj.group(1)
585         else:
586             self._downloader.report_warning(u'unable to extract uploader nickname')
587
588         # title
589         if 'title' not in video_info:
590             raise ExtractorError(u'Unable to extract video title')
591         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
592
593         # thumbnail image
594         if 'thumbnail_url' not in video_info:
595             self._downloader.report_warning(u'unable to extract video thumbnail')
596             video_thumbnail = ''
597         else:   # don't panic if we can't find it
598             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
599
600         # upload date
601         upload_date = None
602         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
603         if mobj is not None:
604             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
605             upload_date = unified_strdate(upload_date)
606
607         # description
608         video_description = get_element_by_id("eow-description", video_webpage)
609         if video_description:
610             video_description = clean_html(video_description)
611         else:
612             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
613             if fd_mobj:
614                 video_description = unescapeHTML(fd_mobj.group(1))
615             else:
616                 video_description = u''
617
618         # subtitles
619         video_subtitles = None
620
621         if self._downloader.params.get('writesubtitles', False):
622             video_subtitles = self._extract_subtitle(video_id)
623             if video_subtitles:
624                 (sub_error, sub_lang, sub) = video_subtitles[0]
625                 if sub_error:
626                     self._downloader.report_error(sub_error)
627
628         if self._downloader.params.get('allsubtitles', False):
629             video_subtitles = self._extract_all_subtitles(video_id)
630             for video_subtitle in video_subtitles:
631                 (sub_error, sub_lang, sub) = video_subtitle
632                 if sub_error:
633                     self._downloader.report_error(sub_error)
634
635         if self._downloader.params.get('listsubtitles', False):
636             sub_lang_list = self._list_available_subtitles(video_id)
637             return
638
639         if 'length_seconds' not in video_info:
640             self._downloader.report_warning(u'unable to extract video duration')
641             video_duration = ''
642         else:
643             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
644
645         # token
646         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
647
648         # Decide which formats to download
649         req_format = self._downloader.params.get('format', None)
650
651         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
652             self.report_rtmp_download()
653             video_url_list = [(None, video_info['conn'][0])]
654         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
655             url_map = {}
656             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
657                 url_data = compat_parse_qs(url_data_str)
658                 if 'itag' in url_data and 'url' in url_data:
659                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
660                     if not 'ratebypass' in url: url += '&ratebypass=yes'
661                     url_map[url_data['itag'][0]] = url
662
663             format_limit = self._downloader.params.get('format_limit', None)
664             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
665             if format_limit is not None and format_limit in available_formats:
666                 format_list = available_formats[available_formats.index(format_limit):]
667             else:
668                 format_list = available_formats
669             existing_formats = [x for x in format_list if x in url_map]
670             if len(existing_formats) == 0:
671                 raise ExtractorError(u'no known formats available for video')
672             if self._downloader.params.get('listformats', None):
673                 self._print_formats(existing_formats)
674                 return
675             if req_format is None or req_format == 'best':
676                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
677             elif req_format == 'worst':
678                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
679             elif req_format in ('-1', 'all'):
680                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
681             else:
682                 # Specific formats. We pick the first in a slash-delimeted sequence.
683                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
684                 req_formats = req_format.split('/')
685                 video_url_list = None
686                 for rf in req_formats:
687                     if rf in url_map:
688                         video_url_list = [(rf, url_map[rf])]
689                         break
690                 if video_url_list is None:
691                     raise ExtractorError(u'requested format not available')
692         else:
693             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
694
695         results = []
696         for format_param, video_real_url in video_url_list:
697             # Extension
698             video_extension = self._video_extensions.get(format_param, 'flv')
699
700             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
701                                               self._video_dimensions.get(format_param, '???'))
702
703             results.append({
704                 'id':       video_id,
705                 'url':      video_real_url,
706                 'uploader': video_uploader,
707                 'uploader_id': video_uploader_id,
708                 'upload_date':  upload_date,
709                 'title':    video_title,
710                 'ext':      video_extension,
711                 'format':   video_format,
712                 'thumbnail':    video_thumbnail,
713                 'description':  video_description,
714                 'player_url':   player_url,
715                 'subtitles':    video_subtitles,
716                 'duration':     video_duration
717             })
718         return results
719
720
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and submit the age-confirmation form so
        that family-filtered videos are reachable for the session."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract video info from a metacafe.com watch URL.

        Returns a one-element list with the info dict, or delegates to the
        YouTube extractor for 'yt-' prefixed ids.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube (ids of the form 'yt-<youtube id>')
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available (access key appended to the media URL)
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob embedded in the player markup
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            # JSON-escaped slashes need unescaping
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # webpage is already decoded text; no further .decode() is needed
        # (str.decode does not exist on Python 3 and would raise AttributeError)
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
816
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best available video URL, title, uploader and upload
        date from a Dailymotion video page."""
        # The video id is the URL path component up to the first '_' or '?'
        m_url = re.match(self._VALID_URL, url)
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so age-restricted
        # videos are visible
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m_flash = re.search(r'\s*var flashvars = (.*)', webpage)
        if m_flash is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(m_flash.group(1))

        # Pick the highest quality whose key appears in the flashvars blob
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')
        self.to_screen(u'Using %s' % max_quality)

        m_media = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m_media is None:
            raise ExtractorError(u'Unable to extract video URL')

        # JSON-escaped slashes need unescaping after URL-unquoting
        video_url = compat_urllib_parse.unquote(m_media.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m_title = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m_title is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(m_title.group('title'))

        # Uploader: try the owner span first, then the official-user markup
        video_uploader = None
        m_owner = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m_owner is not None:
            video_uploader = m_owner.group(1)
        else:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is not None:
                video_uploader = mobj_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; reassemble as YYYYMMDD
        video_upload_date = None
        m_date = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m_date is not None:
            video_upload_date = m_date.group(3) + m_date.group(2) + m_date.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
891
892
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract video info from a photobucket.com URL.

        First tries the JSON blob the page's javascript registers; falls back
        to scraping the video_src link and the page title.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # webpage is already decoded text; the regex groups are str, so no
        # .decode() is needed (it would fail with AttributeError on Python 3)
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
954
955
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video info from a screen.yahoo.com URL.

        If the page exposes a Media CONTENT_ID, the newer YQL JSON API is
        queried; otherwise the legacy cosmos.bcst.yahoo.com mRSS endpoints
        are scraped.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # Legacy path: query the mRSS endpoints with the short video id.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # Check for a failed match BEFORE accessing the groups; calling
            # .group() on None would raise AttributeError instead of the
            # intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP callback wrapper to get at the JSON payload
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1023
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a vimeo.com URL via the page's embedded
        player config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # HLS redirect links point at the canonical video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # A bare 'except:' would also swallow KeyboardInterrupt/SystemExit;
        # only parse failures (IndexError/ValueError/...) should fall here.
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty quality bucket, in preference order
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1125
1126
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Distinguishes live-stream pages (URL ends in 'index-<n>.html') from
    regular 'Plus 7' catch-up videos and scrapes each via chained regex
    lookups over successively fetched pages.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body, converting network
        errors into ExtractorError."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and apply *regex* to it.

        matchTuples is a list of (group_index, key, error_message); each
        matched group is stored in the returned dict under its key, and a
        missing group raises ExtractorError with its message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        # No match at all is treated as a bad URL
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream javascript to locate the rtmp url.

        NOTE(review): video_url is assembled on the last line but never
        returned, and _real_extract discards this call's result — the live
        path currently yields no downloadable result. TODO: confirm whether
        this is intentional or an unfinished feature.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The JS file contains the stream path, the swf player url and the
        # rtmp server url, matched in one DOTALL pass
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve a 'Plus 7' page through its videoref XML chain and return
        the info dict for the HD stream."""
        video_lang = url.split('/')[-3]
        # Step 1: the page's player param points at a videoref XML url
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the <video> ref matching the page language
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: extract id, title, date and the hd-quality url from the
        # final video XML
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # NOTE(review): .decode assumes the title is a bytestring; on
            # Python 3, str has no .decode — confirm against fetch_webpage's
            # return type (urlopen().read() returns bytes).
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        # Dispatch on URL shape: live streams vs. Plus 7 catch-up pages.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live path returns None (see extractLiveStream note above)
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1246
1247
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download, warning that the generic extractor is
        being used (except in test mode)."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # Same URL after following redirects means there was no redirect
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        # Follow URL-shortener style redirects first and re-dispatch
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fixed copy-paste error: this failure concerns the uploader
            # (domain name), not the title
            raise ExtractorError(u'Unable to extract uploader')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1381
1382
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    # GData v2 search endpoint; %s = quoted query, %i = 1-based start index.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # BUGFIX: on Python 3 the query is already a str and has no
        # .decode(); only decode when we actually got raw bytes.
        if isinstance(query, bytes):
            query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Pages hold 50 results each; keep fetching until we have enough
        # or the API says there is nothing more.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request past the total hit count the API reports.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
1425
1426
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        pagenum = 0
        while True:
            pagenum += 1
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # Every result link on the page becomes a bare URL entry.
            found = [{'_type': 'url', 'url': m.group(1)}
                     for m in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage)]
            playlist['entries'].extend(found)

            # Stop once enough results are covered or no next-page link remains.
            if pagenum * 10 > n or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return playlist
1457
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # BUGFIX: 'i' used to be read after the loop even when 'results'
            # was empty, raising NameError; initialize it explicitly.
            i = -1
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # BUGFIX: a result without a recognizable video link used
                    # to crash with AttributeError; skip it instead.
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when enough results were seen or the API reports the
            # last page ('m' carries the paging metadata).
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1491
1492
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Matches playlist/course/artist/watch URLs carrying a p/a/list parameter,
    # bare /p/<id> paths, and naked playlist ids (PL/EC/UU prefixes).
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData v2 playlist feed: %s = playlist id, %i = page size, %i = 1-based start index
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []  # accumulates (playlist position, video URL) pairs

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Entries without 'content' (e.g. deleted/private videos) are skipped;
            # the position is kept so the final list can be ordered.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Order by playlist position, then drop the position.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1558
1559
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from *page*, in order of
        first appearance."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = match.group(1)
            if vid not in found:
                found.append(vid)
        return found

    def _real_extract(self, url):
        """Resolve a channel URL into a playlist of all its videos."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is served as plain HTML.
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Any further pages come from the JSON-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                # The "load more" widget disappears on the final page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                   for vid in video_ids]
        return [self.playlist_result(entries, channel_id)]
1617
1618
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Resolve a user URL into a playlist of all the user's uploads."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # request consecutive pages until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect this page's ids, de-duplicated in order of appearance.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in ids_in_page:
                    ids_in_page.append(vid)
            video_ids.extend(ids_in_page)

            # A short page means there is nothing left to fetch.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break
            pagenum += 1

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                       for vid in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1675
1676
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Resolve a blip.tv user URL into a playlist of the user's videos."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # The numeric user id embedded in the page is needed for the Ajax API.
        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # BUGFIX: previously crashed with AttributeError on .group(1)
            # when the attribute was missing from the page.
            raise ExtractorError(u'Unable to extract blip.tv user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1735
1736
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (a POST with gateway_result=1 simulates pressing the button).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        # NOTE(review): 'webpage' is raw bytes (no .decode() above); the str
        # regex patterns and the .decode('utf-8') calls below look
        # Python-2-only -- confirm before running this under Python 3.
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace so the site's message reads as one line.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1784
1785
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in to Facebook using --username/--password or .netrc.

        Extraction proceeds anonymously when no credentials are available."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials: skip login entirely.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Being served the login form again means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The SWF setup script embeds a JSON array of [name, value] pairs
        # between these two exact markers; its 'params' entry holds the
        # URL-encoded video data.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1882
1883
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Accepts ordinary page URLs, /play/ URLs and a.blip.tv/api.swf#<id> URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # Rewrite api.swf#<id> URLs to the equivalent /play/ form first.
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # real file URL; recurse on the canonical /a/a-<id> form.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # The skin=json parameter makes blip.tv return the metadata as JSON.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The API either wraps the record in 'Post' or returns it bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
1981
1982
1983 class MyVideoIE(InfoExtractor):
1984     """Information Extractor for myvideo.de."""
1985
1986     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1987     IE_NAME = u'myvideo'
1988
1989     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
1990     # Released into the Public Domain by Tristan Fischer on 2013-05-19
1991     # https://github.com/rg3/youtube-dl/pull/842
1992     def __rc4crypt(self,data, key):
1993         x = 0
1994         box = list(range(256))
1995         for i in list(range(256)):
1996             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
1997             box[i], box[x] = box[x], box[i]
1998         x = 0
1999         y = 0
2000         out = ''
2001         for char in data:
2002             x = (x + 1) % 256
2003             y = (y + box[x]) % 256
2004             box[x], box[y] = box[y], box[x]
2005             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2006         return out
2007
2008     def __md5(self,s):
2009         return hashlib.md5(s).hexdigest().encode()
2010
2011     def _real_extract(self,url):
2012         mobj = re.match(self._VALID_URL, url)
2013         if mobj is None:
2014             raise ExtractorError(u'invalid URL: %s' % url)
2015
2016         video_id = mobj.group(1)
2017
2018         GK = (
2019           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2020           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2021           b'TnpsbA0KTVRkbU1tSTRNdz09'
2022         )
2023
2024         # Get video webpage
2025         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2026         webpage = self._download_webpage(webpage_url, video_id)
2027
2028         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2029         if mobj is not None:
2030             self.report_extraction(video_id)
2031             video_url = mobj.group(1) + '.flv'
2032
2033             mobj = re.search('<title>([^<]+)</title>', webpage)
2034             if mobj is None:
2035                 raise ExtractorError(u'Unable to extract title')
2036             video_title = mobj.group(1)
2037
2038             mobj = re.search('[.](.+?)$', video_url)
2039             if mobj is None:
2040                 raise ExtractorError(u'Unable to extract extention')
2041             video_ext = mobj.group(1)
2042
2043             return [{
2044                 'id':       video_id,
2045                 'url':      video_url,
2046                 'uploader': None,
2047                 'upload_date':  None,
2048                 'title':    video_title,
2049                 'ext':      u'flv',
2050             }]
2051
2052         # try encxml
2053         mobj = re.search('var flashvars={(.+?)}', webpage)
2054         if mobj is None:
2055             raise ExtractorError(u'Unable to extract video')
2056
2057         params = {}
2058         encxml = ''
2059         sec = mobj.group(1)
2060         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2061             if not a == '_encxml':
2062                 params[a] = b
2063             else:
2064                 encxml = compat_urllib_parse.unquote(b)
2065         if not params.get('domain'):
2066             params['domain'] = 'www.myvideo.de'
2067         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2068         if 'flash_playertype=MTV' in xmldata_url:
2069             self._downloader.report_warning(u'avoiding MTV player')
2070             xmldata_url = (
2071                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2072                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2073             ) % video_id
2074
2075         # get enc data
2076         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2077         enc_data_b = binascii.unhexlify(enc_data)
2078         sk = self.__md5(
2079             base64.b64decode(base64.b64decode(GK)) +
2080             self.__md5(
2081                 str(video_id).encode('utf-8')
2082             )
2083         )
2084         dec_data = self.__rc4crypt(enc_data_b, sk)
2085
2086         # extracting infos
2087         self.report_extraction(video_id)
2088
2089         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2090         if mobj is None:
2091             raise ExtractorError(u'unable to extract rtmpurl')
2092         video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
2093         if 'myvideo2flash' in video_rtmpurl:
2094             self._downloader.report_warning(u'forcing RTMPT ...')
2095             video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
2096
2097         # extract non rtmp videos
2098         if (video_rtmpurl is None) or (video_rtmpurl == ''):
2099             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2100             if mobj is None:
2101                 raise ExtractorError(u'unable to extract url')
2102             video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2103
2104         mobj = re.search('source=\'(.*?)\'', dec_data)
2105         if mobj is None:
2106             raise ExtractorError(u'unable to extract swfobj')
2107         video_file     = compat_urllib_parse.unquote(mobj.group(1))
2108
2109         if not video_file.endswith('f4m'):
2110             ppath, prefix = video_file.split('.')
2111             video_playpath = '%s:%s' % (prefix, ppath)
2112             video_hls_playlist = ''
2113         else:
2114             video_playpath = ''
2115             video_hls_playlist = (
2116                 video_filepath + video_file
2117             ).replace('.f4m', '.m3u8')
2118
2119         mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
2120         if mobj is None:
2121             raise ExtractorError(u'unable to extract swfobj')
2122         video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
2123
2124         mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
2125         if mobj is None:
2126             raise ExtractorError(u'unable to extract title')
2127         video_title = mobj.group(1)
2128
2129         return [{
2130             'id':                 video_id,
2131             'url':                video_rtmpurl,
2132             'tc_url':             video_rtmpurl,
2133             'uploader':           None,
2134             'upload_date':        None,
2135             'title':              video_title,
2136             'ext':                u'flv',
2137             'play_path':          video_playpath,
2138             'video_file':         video_file,
2139             'video_hls_playlist': video_hls_playlist,
2140             'player_url':         video_swfobj,
2141         }]
2142
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates (kbps), highest to lowest; used for --list-formats
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame dimensions (display only)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print the available bitrates with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode or clip.

        Returns a list of info dicts, one per video part, since full
        episodes are split into multiple "acts" in the index feed.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Shortcut forms like ":tds" mean "the newest full episode"
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Full-episode URL; an empty episode part means "newest episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The site redirects /full-episodes/ to the newest episode;
            # re-parse the redirected URL to get the concrete episode title
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # Look for the mtvnservices media URI inside the player embed
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Fetch the MRSS index: one <item> per part of the episode
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part configuration XML lists the available renditions
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into the equivalent plain-HTTP URL on
            # the CDN, which can be downloaded directly
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2309
2310
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract video info from an Escapist video page.

        Reads the description/thumbnail/player <meta> tags, then fetches
        the player configuration (JavaScript masquerading as JSON) to get
        the actual video URL. Raises ExtractorError with a descriptive
        message when any expected tag is missing, instead of crashing
        with AttributeError on a failed re.search.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            raise ExtractorError(u'Unable to extract description')
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract config URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # The second playlist entry carries the actual video URL
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2364
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract video info via the moogaloop metadata XML and the
        Adobe HDS (f4m) manifest it points to."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: metadata XML with title/description/manifest URL
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Second request: the f4m manifest referenced by the metadata
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Build the fragment URL from the manifest id and media url.
        # NOTE(review): the [:-2] slice presumably strips a two-character
        # suffix from the manifest id -- confirm against a live manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2426
2427
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Pull video URL, title and thumbnail out of an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The video URL is URL-encoded inside the flashvars
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The title comes from the page <title>, minus the site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        # The whole matched URL (group 0) is the thumbnail address
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2477
2478
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a track page through the Soundcloud API and return the
        info dict for its 128 kbps MP3 stream.

        Removed the unused local ``simple_title`` (dead code).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title (also in the url)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the track page URL to its API representation
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Ask the CDN for the available streams of this track
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2535
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set (playlist) page through the Soundcloud API and
        return one info dict per track in the set.

        Removed the unused local ``simple_title`` (dead code).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set page URL to its API representation
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # API-level failure: report every error and bail out
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            # Ask the CDN for the available streams of this track
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2598
2599
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP stream URL and metadata from an InfoQ page."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in the page's JavaScript
        match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title is assigned to a JS variable on the page
        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = match.group(1)

        # Description is optional; fall back to a placeholder
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = match.group(1) if match is not None else u'No description available.'

        # Derive id and extension from the final path component of the URL
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2646
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If bitrate is None, 'best' or not offered for the requested
        format, the highest available bitrate is chosen.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a cloudcast via the Mixcloud JSON API.

        Fix: the former ``.decode('utf-8')`` calls on regex groups and
        JSON-derived strings crashed on Python 3, where ``str`` has no
        ``decode`` method; those values are already text, so the calls
        were removed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until a live URL is found
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2751
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page (course + video),
    a course page (expanded into 'reference' entries for each video page),
    and the site root (expanded into 'reference' entries for each course
    page). Reference entries are resolved recursively via self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        # Dispatch on which named groups matched: video page, course page,
        # or the site root.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # Title and file name come from a per-video XML descriptor that
            # lives next to the video files.
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every linked VideoPage becomes a 'reference' entry that is
            # resolved recursively through self.extract() below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Same scheme as the course branch: every CoursePage link is a
            # reference resolved via self.extract().
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2852
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com music videos.

    Reads the mtv_vt/mtv_an/mtvn_uri meta tags and the default playlist
    id off the video page, then queries the mediaGen XML service for the
    actual rendition URLs.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # webpage is already a decoded (unicode) string, so the meta tag
        # contents must not be .decode()d again: doing so crashed on
        # Python 3 (str has no decode) and on non-ASCII titles under
        # Python 2 (implicit ascii encode).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message fixed: previously read 'Unable to mtvn_uri'.
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2921
2922
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in segments: the playlist JSON carries a scrambled
    file id that must be descrambled with a per-video seed before the
    per-segment download URLs can be built, so one info dict per segment
    is returned.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: millisecond timestamp followed by two random numbers.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Build the descrambling table: deterministically permute `source`
        # by repeatedly drawing an index from a linear-congruential
        # sequence seeded with `seed` and moving that character over.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # The scrambled fileId is a '*'-separated list of indices into the
        # mix table; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # 'best' (or no preference) picks hd2 when available, else flv;
            # 'worst' maps to mp4; any other requested format falls back
            # to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3015
3016
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Fetch the page once; every field below is scraped out of it.
        webpage = self._download_webpage(url, video_id)

        def _extract(pattern, errmsg):
            # First capture group of pattern, or abort with errmsg.
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(errmsg)
            return found.group(1)

        video_url = compat_urllib_parse.unquote(
            _extract(self.VIDEO_URL_RE, u'Unable to extract video url'))
        video_title = _extract(self.VIDEO_TITLE_RE, u'Unable to extract video title')
        video_thumbnail = _extract(self.VIDEO_THUMB_RE, u'Unable to extract video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3060
3061
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com video posts.

    Scrapes the post page for metadata, follows the photo/video page, and
    picks the highest-resolution googlevideo redirector link found there.
    """

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry upload date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date (optional; left as None when not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        # NOTE(review): sorted() is ascending and [-1] takes the LAST
        # element, so the wording above ("lowest") looks stale -- confirm
        # which end really is the highest resolution.
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3171
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos.

    The video file URL is built directly from the page path; title, date
    and description are scraped from the HTML.
    """

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Key fixed: was misspelled 'uploader_date'; the documented
            # optional field is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3206
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one page of the JSON archive API and return
        (raw item count, info dicts for clips that have a video_file_url)."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with a YYYY-MM-DD date; strip the
                # dashes to get the YYYYMMDD upload_date format.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # A whole channel: page through its archives below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # A chapter (/c/...): resolve it to the archive containing it
            # and return a single info dict directly.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else triggers only when no archive matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # A single broadcast (/b/...).
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # Stop when the API returns fewer items than a full page, or
            # immediately for non-paged (single broadcast) requests.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3339
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The direct file URL sits in the second <source> of the player.
        url_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not url_match:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(url_match.group('url'))

        # Prefer the player headline; fall back to the page <title>.
        title_match = (re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
                       or re.search(r'<title>(?P<title>[^<]+?)</title>', webpage))
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3377
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers.

    Always goes through the age-gate URL for the game, then scrapes every
    movie entry from the page into one playlist result.
    """
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Pass the age check by submitting a fixed (1970) birth date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # Movie URLs, display names and thumbnails are matched with three
        # separate finditer() passes and zipped together positionally.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
                  }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
3422
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        try:
            # A failed search returns None, so the chained .group() raises
            # AttributeError, which is folded into one ExtractorError.
            title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
            uploader_match = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
                                       webpage, re.DOTALL)
            uploader = unescapeHTML(uploader_match.group('uploader').strip())
            thumb = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage).group('thumb')
        except AttributeError:
            raise ExtractorError(u'Unable to extract info')
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumb,
        }
3452
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The file URL is handed to the flash player via so.addVariable().
        file_m = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if file_m is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = file_m.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_m = re.search(r"<title>(.*)</title>", webpage_src)
        if title_m is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_m.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_m = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_m is not None:
            thumbnail = thumb_m.group(1)
        else:
            thumbnail = None
            candy_m = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_m is not None:
                title = candy_m.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
3501
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JSON object assigned to gon.show.
        json_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3536
3537
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry in formats whose 'format' matches req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Bypass the age gate with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; warn instead of failing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional; warn instead of failing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The fifth path component looks like "<size>p_<bitrate>k_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: this previously tested the stale 'result' variable,
            # which is never None here (it holds the download-list match),
            # so an unknown requested format silently returned [None].
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3652
3653
3654
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both id and title come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL (a percent-encoded .flv link in the player config)
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: this error previously misreported the failure as
            # 'Unable to extract video title' (copy-paste mistake).
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3693
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The watch page carries only the title; the actual stream URL
        # lives on a separate embed page.
        watch_page = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', watch_page)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', watch_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)

        # The player receives the file URL via a flashvars assignment.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', embed_page)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3738
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment in the page.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(mix_match.group(1))

        # A random session id is required by the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        # Walk the play/next API until it reports the last track.
        for track_number in itertools.count(1):
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track = api_data[u'set']['track']
            entries.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return entries
3782
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media files live at CDN locations derived from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_match.group('uploader'))
        }]
3806
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Matches playlist pages (playlists/<id>) and single talk pages
    # (talks/<name>), optionally with a /lang/<lang> segment in between.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction based on the URL type."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # One <li id="talk_..."> per talk carries the numeric id and media slug.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # The talk's relative URL and display name come from a separate <p> element.
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Pair up the two match streams; assumes they appear in the same
        # order on the page — TODO confirm against current markup.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Defer per-talk extraction back to this IE via url_result.
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is an inline JS object holding the numeric id and media slug.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3885
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name 'ext'
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Description and preview image are optional.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3939
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document describes the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last listed <type> entry, as before.
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3972
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        file_match = re.search(r'file: "(.*?)",', webpage)
        if not file_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = file_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        # Strip the site prefix that LiveLeak prepends to every og:title.
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
4019
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek and mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page headline, used as the video title.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # JS call registering one stream: (media_type, quality, rtmp_url, video_url, _).
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # prefer the numeric documentId query parameter when present
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # no streams at all: presumably an age-restricted ("fsk") video
            # only served in the evening — TODO confirm
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4058
class TumblrIE(InfoExtractor):
    """Information extractor for Tumblr video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Normalize to the canonical post URL before fetching.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded with JS-escaped quotes (\x22).
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video_match = re.search(re_video, webpage)
        if video_match is None:
            self.to_screen("No video found")
            return []

        # We pick the first poster as the thumbnail.
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title_match = re.search(r'<title>(?P<title>.*?)</title>', webpage, re.DOTALL)

        return [{'id': video_id,
                 'url': video_match.group('video_url'),
                 'title': unescapeHTML(title_match.group('title')),
                 'thumbnail': thumb,
                 'ext': video_match.group('ext')
                 }]
4092
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # BUG FIX: message was the ungrammatical 'No free songs founded'
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # BUG FIX: this local was named 'id', shadowing the builtin
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
4138
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The mp4 stream is referenced by a plain <source> tag.
        source_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if source_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = source_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4169         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Metadata is served as an MRSS feed keyed by the video id.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        url_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      url_match.group(1),
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4198
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Always fetch the canonical watch page for this id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        url_match = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = url_match.group(1)

        # The meta tags may use either quote style, hence the two groups.
        title_match = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1) or title_match.group(2)

        # Description is optional: warn and continue when absent.
        desc_match = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if desc_match is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = desc_match.group(1) or desc_match.group(2)

        thumb_match = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumb_match.group(1),
        }]
4242
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Extract the stream URL, title, thumbnail and uploader from a
        Vine video page.

        Raises ExtractorError when the URL does not match or any of the
        mandatory fields cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Guard against a non-matching URL; consistent with TeamcocoIE.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Canonicalize the page URL so variants all fetch the same page.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # Twitter player card carries the direct stream URL.
        mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # Strip any query string from the thumbnail URL (second group).
        mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        # Uploader name is rendered inside the user <div>; DOTALL lets
        # the pattern span line breaks in the markup.
        mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = mobj.group(1)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
4285
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Extract a Flickr video by following its two-step playlist flow:
        the page gives a secret, the first XML gives a node id, and the
        second XML gives the stream app/path pair.

        Raises ExtractorError when the URL does not match or a mandatory
        field cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Guard against a non-matching URL; consistent with TeamcocoIE.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        # Canonicalize the page URL from the captured groups.
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-video secret is required by the playlist endpoints below.
        mobj = re.search(r"photo_secret: '(\w+)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video secret')
        secret = mobj.group(1)

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract node_id')
        node_id = mobj.group(1)

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # Final URL is the APP prefix plus the HTML-escaped FULLPATH.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # og: tags may be quoted with either " or '; one group matches.
        mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1) or mobj.group(2)

        # Description is optional: warn and continue instead of failing.
        mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1) or mobj.group(2)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
4347
class TeamcocoIE(InfoExtractor):
    """Information Extractor for Teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract a Teamcoco video: the page supplies the numeric id and
        the og: metadata, a secondary XML endpoint supplies the video URL.

        Raises ExtractorError when the URL does not match or a mandatory
        field cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
        if mobj is None:
            # Previously this fell through to .group(1) and crashed with
            # AttributeError; fail with a descriptive error instead.
            raise ExtractorError(u'Unable to extract video id')
        video_id = mobj.group(1)

        self.report_extraction(video_id)

        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract description')
        description = mobj.group(1)

        # The CVP XML endpoint lists the available files; take the "high" one.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': description,
        }]
4393
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this tuple ordered: more specific extractors must precede the
    # more generic ones (GenericIE is always last as the catch-all).
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
4457
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention and live
    # at module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]