Simplify generic search IE (Closes #839)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Whether _real_initialize() has already run for this instance.
    _ready = False
    # The FileDownloader instance this IE reports to (set via set_downloader).
    _downloader = None
    # Subclasses set this to False when the extractor is known to be broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing "IE" suffix.
        # Subclasses may shadow this with a plain class attribute instead.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note semantics: None -> default "Downloading webpage" message,
        # False -> print nothing, any other value -> custom message.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): relies on `sys` being in scope (pulled in via the
            # star-import from .utils) — an explicit `import sys` is safer.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset from the Content-Type header; default to UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # Request objects expose get_full_url(); plain strings do not.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps the dump printable regardless of page content.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids raising on pages that lie about their encoding.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
190
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regexp matching this extractor's search pseudo-URLs.

        The prefix group is empty (first result only), a positive integer
        (that many results) or the literal 'all'.
        """
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the result-count prefix and delegate to _get_n_results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No prefix: fetch only the first result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            # Defensive: the regex only admits positive integers, but keep
            # the guard in case _make_valid_url is overridden.
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the service's maximum rather than failing.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")
230
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex (compiled with re.VERBOSE by callers); group 1 is the
    # URL scaffolding, group 2 is the video ID (see _extract_id).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forces English UI + US region so the scraping regexes below match.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the target of a next_url redirect parameter (age gate etc.).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything unlisted defaults to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string used in the --list-formats table.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the base-class IE_NAME property with a fixed name.
    IE_NAME = u'youtube'
290
291     @classmethod
292     def suitable(cls, url):
293         """Receives a URL and returns True if suitable for this IE."""
294         if YoutubePlaylistIE.suitable(url): return False
295         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
296
    def report_lang(self):
        """Report attempt to set language (status message only)."""
        self.to_screen(u'Setting language')
300
    def report_login(self):
        """Report attempt to log in (status message only)."""
        self.to_screen(u'Logging in')
304
    def report_video_webpage_download(self, video_id):
        """Report attempt to download the video watch page."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)
308
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download the get_video_info page."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
312
    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)
316
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
320
321     def report_video_subtitles_available(self, video_id, sub_lang_list):
322         """Report available subtitles."""
323         sub_lang = ",".join(list(sub_lang_list.keys()))
324         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
325
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)
329
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
333
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
337
    def _get_available_subtitles(self, video_id):
        """List the subtitle tracks available for a video.

        Returns a dict mapping language code -> track name on success.
        On failure returns a tuple (error_message, None); callers detect
        the error case with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Pull (name, lang_code) attribute pairs out of the track listing,
        # then invert into {lang_code: name}.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list
350
351     def _list_available_subtitles(self, video_id):
352         sub_lang_list = self._get_available_subtitles(video_id)
353         self.report_video_subtitles_available(video_id, sub_lang_list)
354
355     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
356         """
357         Return tuple:
358         (error_message, sub_lang, sub)
359         """
360         self.report_video_subtitles_request(video_id, sub_lang, format)
361         params = compat_urllib_parse.urlencode({
362             'lang': sub_lang,
363             'name': sub_name,
364             'v': video_id,
365             'fmt': format,
366         })
367         url = 'http://www.youtube.com/api/timedtext?' + params
368         try:
369             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
370         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
371             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
372         if not sub:
373             return (u'Did not fetch video subtitles', None, None)
374         return (None, sub_lang, sub)
375
376     def _extract_subtitle(self, video_id):
377         """
378         Return a list with a tuple:
379         [(error_message, sub_lang, sub)]
380         """
381         sub_lang_list = self._get_available_subtitles(video_id)
382         sub_format = self._downloader.params.get('subtitlesformat')
383         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
384             return [(sub_lang_list[0], None, None)]
385         if self._downloader.params.get('subtitleslang', False):
386             sub_lang = self._downloader.params.get('subtitleslang')
387         elif 'en' in sub_lang_list:
388             sub_lang = 'en'
389         else:
390             sub_lang = list(sub_lang_list.keys())[0]
391         if not sub_lang in sub_lang_list:
392             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
393
394         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
395         return [subtitle]
396
397     def _extract_all_subtitles(self, video_id):
398         sub_lang_list = self._get_available_subtitles(video_id)
399         sub_format = self._downloader.params.get('subtitlesformat')
400         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
401             return [(sub_lang_list[0], None, None)]
402         subtitles = []
403         for sub_lang in sub_lang_list:
404             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
405             subtitles.append(subtitle)
406         return subtitles
407
408     def _print_formats(self, formats):
409         print('Available formats:')
410         for x in formats:
411             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
412
    def _real_initialize(self):
        """Force the site language to English and, when credentials are
        available (options or .netrc), log in and confirm age.

        Every failure except age confirmation is reported as a warning and
        aborts initialization without raising.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Fetch the login page first to scrape the GALX/dsh hidden fields.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        # These fields mirror what the Google login form submits; the
        # values other than the credentials and tokens are fixed.
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
515
516     def _extract_id(self, url):
517         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
518         if mobj is None:
519             raise ExtractorError(u'Invalid URL: %s' % url)
520         video_id = mobj.group(2)
521         return video_id
522
523     def _real_extract(self, url):
524         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
525         mobj = re.search(self._NEXT_URL_RE, url)
526         if mobj:
527             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
528         video_id = self._extract_id(url)
529
530         # Get video webpage
531         self.report_video_webpage_download(video_id)
532         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
533         request = compat_urllib_request.Request(url)
534         try:
535             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
536         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
537             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
538
539         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
540
541         # Attempt to extract SWF player URL
542         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
543         if mobj is not None:
544             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
545         else:
546             player_url = None
547
548         # Get video info
549         self.report_video_info_webpage_download(video_id)
550         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
551             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
552                     % (video_id, el_type))
553             video_info_webpage = self._download_webpage(video_info_url, video_id,
554                                     note=False,
555                                     errnote='unable to download video info webpage')
556             video_info = compat_parse_qs(video_info_webpage)
557             if 'token' in video_info:
558                 break
559         if 'token' not in video_info:
560             if 'reason' in video_info:
561                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
562             else:
563                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
564
565         # Check for "rental" videos
566         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
567             raise ExtractorError(u'"rental" videos not supported')
568
569         # Start extracting information
570         self.report_information_extraction(video_id)
571
572         # uploader
573         if 'author' not in video_info:
574             raise ExtractorError(u'Unable to extract uploader name')
575         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
576
577         # uploader_id
578         video_uploader_id = None
579         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
580         if mobj is not None:
581             video_uploader_id = mobj.group(1)
582         else:
583             self._downloader.report_warning(u'unable to extract uploader nickname')
584
585         # title
586         if 'title' not in video_info:
587             raise ExtractorError(u'Unable to extract video title')
588         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
589
590         # thumbnail image
591         if 'thumbnail_url' not in video_info:
592             self._downloader.report_warning(u'unable to extract video thumbnail')
593             video_thumbnail = ''
594         else:   # don't panic if we can't find it
595             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
596
597         # upload date
598         upload_date = None
599         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
600         if mobj is not None:
601             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
602             upload_date = unified_strdate(upload_date)
603
604         # description
605         video_description = get_element_by_id("eow-description", video_webpage)
606         if video_description:
607             video_description = clean_html(video_description)
608         else:
609             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
610             if fd_mobj:
611                 video_description = unescapeHTML(fd_mobj.group(1))
612             else:
613                 video_description = u''
614
615         # subtitles
616         video_subtitles = None
617
618         if self._downloader.params.get('writesubtitles', False):
619             video_subtitles = self._extract_subtitle(video_id)
620             if video_subtitles:
621                 (sub_error, sub_lang, sub) = video_subtitles[0]
622                 if sub_error:
623                     self._downloader.report_error(sub_error)
624
625         if self._downloader.params.get('allsubtitles', False):
626             video_subtitles = self._extract_all_subtitles(video_id)
627             for video_subtitle in video_subtitles:
628                 (sub_error, sub_lang, sub) = video_subtitle
629                 if sub_error:
630                     self._downloader.report_error(sub_error)
631
632         if self._downloader.params.get('listsubtitles', False):
633             sub_lang_list = self._list_available_subtitles(video_id)
634             return
635
636         if 'length_seconds' not in video_info:
637             self._downloader.report_warning(u'unable to extract video duration')
638             video_duration = ''
639         else:
640             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
641
642         # token
643         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
644
645         # Decide which formats to download
646         req_format = self._downloader.params.get('format', None)
647
648         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
649             self.report_rtmp_download()
650             video_url_list = [(None, video_info['conn'][0])]
651         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
652             url_map = {}
653             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
654                 url_data = compat_parse_qs(url_data_str)
655                 if 'itag' in url_data and 'url' in url_data:
656                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
657                     if not 'ratebypass' in url: url += '&ratebypass=yes'
658                     url_map[url_data['itag'][0]] = url
659
660             format_limit = self._downloader.params.get('format_limit', None)
661             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
662             if format_limit is not None and format_limit in available_formats:
663                 format_list = available_formats[available_formats.index(format_limit):]
664             else:
665                 format_list = available_formats
666             existing_formats = [x for x in format_list if x in url_map]
667             if len(existing_formats) == 0:
668                 raise ExtractorError(u'no known formats available for video')
669             if self._downloader.params.get('listformats', None):
670                 self._print_formats(existing_formats)
671                 return
672             if req_format is None or req_format == 'best':
673                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
674             elif req_format == 'worst':
675                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
676             elif req_format in ('-1', 'all'):
677                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
678             else:
679                 # Specific formats. We pick the first in a slash-delimeted sequence.
680                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
681                 req_formats = req_format.split('/')
682                 video_url_list = None
683                 for rf in req_formats:
684                     if rf in url_map:
685                         video_url_list = [(rf, url_map[rf])]
686                         break
687                 if video_url_list is None:
688                     raise ExtractorError(u'requested format not available')
689         else:
690             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
691
692         results = []
693         for format_param, video_real_url in video_url_list:
694             # Extension
695             video_extension = self._video_extensions.get(format_param, 'flv')
696
697             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
698                                               self._video_dimensions.get(format_param, '???'))
699
700             results.append({
701                 'id':       video_id,
702                 'url':      video_real_url,
703                 'uploader': video_uploader,
704                 'uploader_id': video_uploader_id,
705                 'upload_date':  upload_date,
706                 'title':    video_title,
707                 'ext':      video_extension,
708                 'format':   video_format,
709                 'thumbnail':    video_thumbnail,
710                 'description':  video_description,
711                 'player_url':   player_url,
712                 'subtitles':    video_subtitles,
713                 'duration':     video_duration
714             })
715         return results
716
717
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter endpoints: the disclaimer page is fetched first, then the
    # filter form is POSTed so age-restricted videos become accessible.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and confirm age before any extraction runs."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            # Response body is unused; the request is made only for its
            # server-side effect (presumably setting a filter cookie —
            # TODO confirm).
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the video URL, title and uploader from a metacafe watch page.

        Videos whose id starts with 'yt-' are mirrored from YouTube and are
        delegated to the Youtube extractor via url_result().
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # Preferred path: a plain '&mediaURL=' query fragment embedded in the page.
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # NOTE(review): assumes a 3-character file extension — confirm.
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback path: parse the flashvars form field and read the
            # JSON-ish 'mediaData' entry out of it.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            # Undo JSON-style escaping of slashes in the URL.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode('utf-8') assumes a Python 2 byte string; this
        # raises if the page text is already unicode — confirm what
        # _download_webpage returns here.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): same Python 2 byte-string assumption applies to the
        # .decode calls below — confirm.
        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
813
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best-quality stream URL plus metadata for one video."""
        # Validate the URL and reduce it to the bare video id: drop the
        # human-readable title suffix after '_' and any query string.
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Request the page with the family filter switched off so that
        # age-restricted videos are served too.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The player configuration is a JS "flashvars" assignment.
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Choose the highest quality the page advertises.
        max_quality = None
        for candidate in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if candidate in flashvars:
                max_quality = candidate
                break
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')
        self.to_screen(u'Using %s' % max_quality)

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            raise ExtractorError(u'Unable to extract video URL')

        # Undo URL-quoting and JSON-style slash escaping.
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(m.group('title'))

        # Uploader: regular owner span first, official author span second;
        # only warn (don't abort) when neither matches.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; normalize it to YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
888
889
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract video info, preferring the JSON media descriptor when present."""
        # Both the id and the extension come straight out of the URL.
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = m.group('id')
        video_extension = m.group('ext')

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Preferred path: a JSON media blob registered in the page javascript.
        m = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if m is not None:
            info = json.loads(m.group('json'))
            return [{
                'id': video_id,
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # Fallback path: the <link rel="video_src"> tag carries the file URL.
        m = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(m.group(1))

        # Title and uploader are both taken from one <title> pattern.
        m = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = m.group(1).decode('utf-8')
        video_uploader = m.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
951
952
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract metadata and the (rtmp/http) stream URL for a Yahoo video.

        Two paths exist: when the page exposes a Media CONTENT_ID, the newer
        YQL JSON API is queried; otherwise the legacy cosmos.bcst mRSS
        endpoints are scraped.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # Legacy path: scrape the mRSS feeds.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            # Normalize the MM/DD/YYYY feed date to YYYYMMDD.
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUG FIX: the None check must run before .group() is called;
            # previously a failed match raised AttributeError instead of the
            # intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The response is JSONP; strip the callback wrapper before parsing.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            # The YQL response carries no usable upload date.
            video_date = None

        return {
            'id': video_id,
            'url': video_url,
            'play_path': video_path,
            'title': video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
1020
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract the play_redirect URL plus metadata from the page's config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize HLS redirect links to the canonical video page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player javascript.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUG FIX: narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. The try body can only raise
        # IndexError (split marker missing) or ValueError (invalid JSON).
        except (IndexError, ValueError):
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available (quality, codec) combination in preference order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1122
1123
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in 'index-<n>.html'; matched against the last URL segment.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the raw response body, wrapping network errors."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and return a dict built from matchTuples.

        matchTuples is a list of (group_index, key, error_message); a missing
        match group raises ExtractorError with that message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream javascript to locate the rtmp URL.

        NOTE(review): video_url below is computed but never returned or
        stored, and _real_extract discards this method's result — live
        streams therefore produce no downloadable info dict. Confirm whether
        live support is intentionally unfinished.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through two XML hops to the final hd video URL."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # NOTE(review): .decode assumes a Python 2 byte string from
            # fetch_webpage's raw read() — confirm.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus7 extraction path by URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # NOTE(review): the live branch returns None (extractLiveStream
        # produces nothing) — see extractLiveStream above.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1243
1244
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Matches anything; this IE must be tried last.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that we fell through to the generic extractor (suppressed in tests).
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the 405 response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following redirects means there was no redirect.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Best-effort extraction: follow redirects, then scrape common player embeds."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        # (split off the extension, then strip it from the id)
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1376
1377
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): .decode() assumes a byte-string query; on Python 3
        # the query is already unicode and this would raise — confirm callers.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData API (50 results per request) until n
        results are collected or the API reports no more items.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # idiom fix: 'x not in y' instead of 'not x in y'
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids += [video['id'] for video in api_response['items']]

            # The API may report fewer total results than requested
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
1420
1421
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Scrapes Google Video result pages until n entries have been
        collected or no further pages are advertised.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)
                # bug fix: stop exactly at n entries; previously whole
                # pages were appended and the playlist could exceed n
                if len(res['entries']) >= n:
                    return res

            # No "next" link: return whatever was found (possibly < n)
            if not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
1452
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        The endpoint returns JSON with pagination metadata under 'm'
        and HTML result snippets under 'results'.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # bug fix: an empty results page previously left `i` unbound,
            # crashing with NameError in the loop-exit test below
            if not results:
                break
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                # bug fix: skip snippets without a recognizable URL instead
                # of crashing on mobj.group of None
                if mobj is None:
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when n results are reached or the last page was served
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1486
1487
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the flag is mandatory
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # The playlist id sits in whichever alternative of _VALID_URL matched
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        playlist_id = mobj.group(1) or mobj.group(2)

        # Page through the GData API, collecting (position, video_url) pairs
        collected = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            feed_entries = response['feed']['entry']
            collected.extend((entry['yt$position']['$t'], entry['content']['src'])
                             for entry in feed_entries
                             if 'content' in entry)

            if len(feed_entries) < self._MAX_RESULTS:
                break

        # Restore playlist order, then wrap each entry as a url result
        ordered_urls = [pair[1] for pair in sorted(collected)]
        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1553
1554
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids found in a channel page, in order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML
        first_url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(first_url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Further pages come from the JSON-based channel_ajax endpoint
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1
                ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(ajax_url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1612
1613
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE video ids,
        # so request consecutive windows until a short page signals that
        # every id has been seen.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect ids from this page, keeping only first occurrences
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one — no need to
            # query any further.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1670
1671
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # bug fix: raise a descriptive ExtractorError instead of crashing
        # with AttributeError when the page has no data-users-id attribute
        if mobj is None:
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                # bug fix: deduplicate on the unescaped id — the list stores
                # unescaped values, so a raw href could never match them
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1730
1731
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # bug fix: decode the response once here; the old code matched
            # str patterns against raw bytes and then called .decode() on
            # str objects, both of which break on Python 3
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the site's restriction notice into one line
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
1779
1780
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Optional login step: pulls credentials from downloader params or
        # ~/.netrc and posts them to the mobile login form. Any failure is
        # reported as a warning and extraction proceeds unauthenticated.
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: stay anonymous
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means we are still on the
            # login page, i.e. authentication failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Normalize to the canonical video.php URL before downloading
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as JSON between these two
        # JavaScript fragments in the page source
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON carrying the stream descriptors
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1877
1878
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to an embed page whose URL fragment contains
        # the real file id; resolve it and restart extraction from there
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Request JSON metadata by appending the skin parameters
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode assumes a Python 2 byte string;
                # on Python 3 this line would raise — confirm intended.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # hand the open response to the downloader for reuse
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert 'mm-dd-yy HH:MM(am|pm)' into YYYYMMDD
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
1972
1973
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Fetch the watch page for this video
        watch_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(watch_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media base URL; the flv file
        # lives next to it, named after the video id
        thumb_match = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = thumb_match.group(1) + ('/%s.flv' % video_id)

        title_match = re.search('<title>([^<]+)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        video_title = title_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2012
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE: verbose-mode regex, must always be matched with re.VERBOSE
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate labels, ordered from highest to lowest quality
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate label -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate label -> frame dimensions (shown by _print_formats)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }
2048
2049     @classmethod
2050     def suitable(cls, url):
2051         """Receives a URL and returns True if suitable for this IE."""
2052         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2053
2054     def _print_formats(self, formats):
2055         print('Available formats:')
2056         for x in formats:
2057             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2058
2059
    def _real_extract(self, url):
        """Resolve a show/episode/clip URL (or ':shortname') into a list of
        per-part video info dicts.

        Episodes are split into several parts; one info dict is returned
        for each <item> of the show's MRSS index feed.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # A bare shortname (":tds" etc.) means "newest full episode":
        # rewrite it to the show's full-episodes page and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Pick a provisional title and decide whether we still need to
        # follow a redirect to find the newest episode (dlNewest).
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The full-episodes page redirects to the latest episode; the
            # final URL must now name a specific episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # Locate the media.mtvnservices.com URI embedded in the player markup.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The MRSS index feed lists one <item> per part of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid is colon-separated; last component is the media id,
            # second-to-last the show's domain (".com" stripped for display).
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part configuration XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp(e) URL to the plain-HTTP mirror on llnwd.net.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2179
2180
class EscapistIE(InfoExtractor):
    """Information extractor for escapistmagazine.com videos."""

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Page metadata lives in <meta> tags; a missing tag surfaces as an
        # AttributeError on the failed match, same as before.
        description = unescapeHTML(
            re.search('<meta name="description" content="([^"]*)"', webPage).group(1))
        imgUrl = unescapeHTML(
            re.search('<meta property="og:image" content="([^"]*)"', webPage).group(1))
        playerUrl = unescapeHTML(
            re.search('<meta property="og:video" content="([^"]*)"', webPage).group(1))

        # The player URL carries a percent-encoded pointer to its config.
        configUrl = compat_urllib_parse.unquote(
            re.search('config=(.*)$', playerUrl).group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically the config is JavaScript with single quotes, not JSON.
        try:
            config = json.loads(configJSON.replace("'", '"'))
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        videoUrl = config['playlist'][1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
2234
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract the video: first a metadata XML document, then the f4m
        (Adobe HDS) manifest it points at, from which the fragment URL of
        the actual media is assembled.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Step 1: metadata XML with title/description/thumbnail + manifest URL.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Step 2: fetch the HDS manifest (hdcore parameter is required by the CDN).
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Build the first-fragment URL from the manifest location.
        # NOTE(review): video_id[:-2] drops the manifest id's last two
        # characters — presumably a version/quality suffix; confirm against
        # a live manifest before changing.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2296
2297
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        def _search(pattern, errnote, group):
            # Small local helper: search the page, fail loudly if absent.
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(errnote)
            return found.group(group)

        # The media URL is percent-encoded inside a flashvars parameter.
        video_url = compat_urllib_parse.unquote(
            _search(r'flv_url=(.+?)&', u'Unable to extract video url', 1))

        # Title from the page <title>, minus the site suffix.
        video_title = _search(r'<title>(.*?)\s+-\s+XVID',
                              u'Unable to extract video title', 1)

        # Thumbnail: the whole matched URL is used (group 0), not a capture.
        video_thumbnail = _search(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            u'Unable to extract video thumbnail', 0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2347
2348
class SoundcloudIE(InfoExtractor):
    """Information extractor for individual soundcloud.com tracks.

    The track URL is resolved through the public API into a JSON
    description, after which the stream definitions are fetched to obtain
    the MP3 media URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the track slug are part of the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical track URL into its JSON description.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title, u'Downloading info JSON'))

        video_id = info['id']
        self.report_extraction(full_title)

        # The stream definitions hold the actual downloadable media URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')
        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2405
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set URL and return one info dict per contained track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set URL into its JSON description via the public API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        # API-level errors are reported through the downloader; note this
        # path returns None, not an empty result list.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            # Each track needs its own streams lookup to get the media URL.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2468
2469
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in an inline script variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # The title is assigned to a JavaScript variable on the page.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Description is optional; fall back to a placeholder.
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = mobj.group(1) if mobj is not None else u'No description available.'

        # Derive id and extension from the final path component of the URL.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2516
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Return the candidate URL list for *fmt* from the audio_formats data.

        If the format entry is keyed by bitrate, pick the requested bitrate
        (or the highest for 'best'/unknown); if there is no bitrate level at
        all, the format maps directly to a URL list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Return the first URL in *url_list* that answers, or None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass
        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # (re returns text here; the old .decode('utf-8') calls raised
        # AttributeError on Python 3, where str has no decode method)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON; decode the raw bytes explicitly so Python 3 < 3.6
        # json.loads (which requires text) also works
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        format_param = None
        if req_format is None or req_format == 'best':
            # probe each format until one of its URLs actually answers
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # previously a dead URL crashed with AttributeError on None.decode()
        if file_url is None:
            raise ExtractorError(u'Unable to extract media URL')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2621
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on URL specificity: a single video, a course page, or
        the site root.  Course and root pages are handled as playlists whose
        entries are fed back through self.extract() recursively.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata XML lives next to the media files.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title from the page heading; fall back to the course id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect every video page link and extract each one in turn.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Every course page found on the root page becomes a sub-playlist.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2722
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract song metadata from the page, then the media URL from the
        mediaGen rendition feed."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The webpage is already text; the old .decode('iso-8859-1') calls on
        # these match groups raised AttributeError on Python 3.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message used to read 'Unable to mtvn_uri'
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        # The mediaGen feed lists the available renditions for this video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # previously an empty feed crashed with IndexError
            raise ExtractorError(u'Unable to extract renditions')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2791
2792
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as several segments; this IE returns one info
    dict per segment, with ids of the form '<video_id>_partNN'.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-random session id (ms timestamp + two random ints)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the file-id alphabet permuted by Youku's seeded LCG.

        The server-provided 'seed' drives a linear congruential generator
        ((seed * 211 + 30031) mod 65536) that deterministically shuffles the
        character set, so client and server agree on the index -> character
        mapping used by _get_file_id().
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated fileId ('*'-separated indices) through the
        seed-shuffled alphabet produced by _get_file_ID_mix_string()."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Return a list of per-segment info dicts for a Youku video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's requested format onto Youku's stream names:
            # 'hd2' (when available) for best, 'mp4' for worst, 'flv' otherwise.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One key per segment; each key is required to build that
            # segment's download URL below.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
2885
2886
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # All fields below are scraped from the page HTML.
        webpage = self._download_webpage(url, video_id)

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video url')
        # The flv URL is percent-encoded inside the flashvars.
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = thumb_match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
2930
2931
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading the post entry page."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the extracted entry date."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the extracted uploader name."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the extracted title."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video behind a Google+ post in two steps:
        scrape the post page for metadata, then scrape the photo/video
        page it links to for the actual stream URLs."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date (optional: left as None when not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        # NOTE(review): the resolution group is a string, so this sort is
        # lexicographic, not numeric — e.g. '99' > '1080'. Confirm whether
        # the site's codes make this ordering correct before changing it.
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3041
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the info dict for an nba.com video page.

        The mp4 URL is derived directly from the page path on Turner's CDN;
        title/date/description are scraped from the page HTML.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First unescaped group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')

        # Normalize the human-readable page date to YYYYMMDD, as expected
        # for the 'upload_date' field.
        upload_date = _findProp(r'<b>Date:</b> (.*?)</div>')
        if upload_date is not None:
            upload_date = unified_strdate(upload_date)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: this key was misspelled 'uploader_date', so the date
            # was silently ignored ('upload_date' is the documented field).
            'upload_date': upload_date,
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3076
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Items requested per API call when paging through a channel archive.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one page of the JSON API and build an info dict for each
        clip that has a video_file_url.  Returns (raw_item_count, info_list);
        the raw count (not the filtered one) drives the paging loop."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API answers with a dict carrying an 'error' key.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; dropping the dashes yields
                # the YYYYMMDD upload_date format.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: whole channel (paged), chapter ('/c/'),
        or single broadcast ('/b/')."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through its archive below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Single chapter of a broadcast: resolve it to its archive.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element matching the page's archive id;
            # the for/else raises when no element matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Chapter title/description/channel come from the Twitch API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single broadcast ('/b/' URL): one API page is enough.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the end of the archive was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3209
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the mp4 source URL, title and description from the page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        # The stream URL is the second <source> inside the <video> tag.
        source_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not source_match:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source_match.group('url'))

        # Prefer the player headline; fall back to the page <title>.
        title_match = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not title_match:
            title_match = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not title_match:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3247
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailers (playlist per game)."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        gameID = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')
        # Request the age-gated listing with a canned birth date to skip the gate.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # Movie URLs, display names and thumbnails appear in the same order
        # in the page, so the three match streams are zipped together.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        videos = []
        for movie, name, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3292
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The flv is served straight from the CDN, keyed by video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3314
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage_src = self._download_webpage(url, video_id)

        # The stream URL lives in a flashvars assignment.
        src_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if src_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = src_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3363
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JSON blob in an inline <script>.
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Ask the CDN for the 256 kbps rendition.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3398
3399
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Scrape metadata and every downloadable rendition, then pick the
        rendition(s) matching the user's requested format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (a missing date is not fatal)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (a missing uploader is not fatal)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Path component 4 is '<size>_<bitrate>_<id>'; size and bitrate
            # joined with '-' form the user-visible format name.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # NOTE(review): assumes the download list is ordered best-first — confirm.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # Bug fix: this branch previously tested the stale `result`
            # variable (always non-None at this point, left over from the
            # download-list search above), so an unavailable requested
            # format silently returned [None] instead of raising.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3514
3515
3516
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return the flv info dict for a pornotube.com video page; the
        title comes from the URL itself, the rest from the page HTML."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: this error previously (and wrongly) said 'video title'.
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3555
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Extract the video url from the embed page referenced by the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        # NOTE: the redundant "ERROR: " prefixes were dropped from the messages
        # below for consistency with the other extractors in this file
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page; the numeric id used there replaces the slug id
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the player setup script
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
3600
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (audio playlists)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a javascript object
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        mix_data = json.loads(mix_match.group(1))

        # A random session id is required by the play API
        session = str(random.randint(0, 1000000000))
        mix_id = mix_data['id']
        track_count = mix_data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_num = 0
        # Walk the API's "next track" chain until it reports the last track
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_num+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': mix_data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            track_num += 1
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3644
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail urls follow a fixed CDN pattern based on the id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        video_title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        video_uploader = clean_html(uploader_match.group('uploader'))

        return [{
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': video_title,
                'thumbnail': thumbnail,
                'uploader': video_uploader
        }]
3668
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE, which the
        # base-class matcher would not apply
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL matched:
        # a single talk vs a playlist of talks
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the download video link for that mediaSlug.'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Return a playlist result with one url_result entry per talk.'''
        # NOTE(review): [.\s] in these verbose patterns matches only a literal
        # dot or whitespace -- [\s\S] (any char) may have been intended;
        # confirm against the current page markup before changing.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # The two iterators are consumed pairwise below; this assumes the
        # <li> entries and the talk-title links appear in the same order
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Delegate each talk back to this extractor via its canonical url
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video info dict for the talk in the url."""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The numeric talk id and the media slug live in an embedded
        # javascript object ("talkDetails")
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3747
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its XML metadata API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: previously referenced the undefined name `ext`,
            # raising NameError whenever <format_id> was missing
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata document
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3801
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos via their flash XML descriptor."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Every video has an XML descriptor listing the available encodings
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last format entry of the descriptor, as the original did
        # (presumably the best quality -- confirm against the feed)
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3834
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        url_match = re.search(r'file: "(.*?)",', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to find video url')
        video_url = url_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find video title')
        # The site prefixes every og:title with "LiveLeak.com -"; strip it
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
3881
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek (RTMP stream or HTTP mp4)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId query parameter as the video id,
        # falling back to the last path component
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # Age-restricted ("fsk") videos are only served in the evening
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # Keep the default media type (0) at the highest listed quality
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # RTMP stream: url plus an "mp4:" play path
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            # Plain HTTP download of the mp4 file
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
3920
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post url before downloading
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # Not every post contains a video; report and return empty
            # (message fixed from the ungrammatical "No video founded")
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
3954
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks (mp3-320 only)."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Numeric track id (local renamed from `id`, which shadowed the builtin)
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
4000
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Direct mp4 url from the HTML5 <source> tag
        url_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = url_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4031         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    # Dot after "www" is now escaped; previously it matched any character
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # Guard added for consistency with the other extractors in this file
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The mrss feed exposes both the direct mp4 url and the title
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group('mp4url')

        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group('titre')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
4060
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Ordered tuple of extractor classes; GenericIE must stay last because it
    # is the catch-all fallback
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
4120
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<name>IE' naming convention
    return globals()['%sIE' % ie_name]