Add support for Howcast.com - closes #835
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    An information extractor takes a URL and produces a list of
    dictionaries describing the video(s) the URL refers to: the real
    video URL, the title, the uploader and so on.  Each dictionary is
    handed to the FileDownloader, which may then download the file(s).

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should redefine _real_initialize() and _real_extract()
    and define a _VALID_URL regexp; they should probably also be added
    to the list of extractors.  _real_extract() must return a *list* of
    information dictionaries as described above.  Broken extractors
    should set _WORKING to False so users are warned and tests skip them.
    """

    # Class-level defaults; instances shadow these as needed.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self.set_downloader(downloader)
        self._ready = False

    @classmethod
    def suitable(cls, url):
        """Return True when this extractor can handle *url*."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Return the _WORKING flag for this extractor."""
        return cls._WORKING

    def initialize(self):
        """Run _real_initialize() at most once (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach *downloader* to this extractor."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # The extractor name is the class name minus the trailing "IE".
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as exc:
            message = errnote if errnote is not None else u'Unable to download webpage'
            # Preserve the original traceback when re-raising.
            raise ExtractorError(u'%s: %s' % (message, compat_str(exc)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        # Decode with the charset from the Content-Type header, if any.
        content_type = handle.headers.get('Content-Type', '')
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        raw = handle.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            self._downloader.to_screen(base64.b64encode(raw).decode('ascii'))
        return (raw.decode(encoding, 'replace'), handle)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Helpers for #608: they tag results with the correct '_type' value.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url', 'url': url, 'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        result = {'_type': 'playlist', 'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
190
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches "<key>:<query>", "<key><count>:<query>" and "<key>all:<query>".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the count prefix of *query* and delegate to _get_n_results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare "<key>:query" downloads just the first result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            # Defensive: the regexp already rejects 0 and negatives.
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp over-large requests to the extractor's maximum.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the exception message ("sublclasses").
        raise NotImplementedError("This method must be implemented by subclasses")
229
230
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp matching full watch/embed/short URLs as well as bare
    # video ids; group 2 captures the video id itself (see _extract_id).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forces the English UI so scraped strings are predictable.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the next_url parameter of age-verification redirect URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; formats missing here fall back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display dimensions; NOTE(review): values appear to be written
    # as height x width (e.g. '22': '720x1280') — confirm before relying on it.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the base-class property with a fixed name.
    IE_NAME = u'youtube'
290
291     @classmethod
292     def suitable(cls, url):
293         """Receives a URL and returns True if suitable for this IE."""
294         if YoutubePlaylistIE.suitable(url): return False
295         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
296
    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report that the list of available subtitles is being fetched."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
337
338     def _get_available_subtitles(self, video_id):
339         self.report_video_subtitles_download(video_id)
340         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
341         try:
342             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
343         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
344             return (u'unable to download video subtitles: %s' % compat_str(err), None)
345         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
346         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
347         if not sub_lang_list:
348             return (u'video doesn\'t have subtitles', None)
349         return sub_lang_list
350
351     def _list_available_subtitles(self, video_id):
352         sub_lang_list = self._get_available_subtitles(video_id)
353         self.report_video_subtitles_available(video_id, sub_lang_list)
354
355     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
356         """
357         Return tuple:
358         (error_message, sub_lang, sub)
359         """
360         self.report_video_subtitles_request(video_id, sub_lang, format)
361         params = compat_urllib_parse.urlencode({
362             'lang': sub_lang,
363             'name': sub_name,
364             'v': video_id,
365             'fmt': format,
366         })
367         url = 'http://www.youtube.com/api/timedtext?' + params
368         try:
369             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
370         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
371             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
372         if not sub:
373             return (u'Did not fetch video subtitles', None, None)
374         return (None, sub_lang, sub)
375
376     def _extract_subtitle(self, video_id):
377         """
378         Return a list with a tuple:
379         [(error_message, sub_lang, sub)]
380         """
381         sub_lang_list = self._get_available_subtitles(video_id)
382         sub_format = self._downloader.params.get('subtitlesformat')
383         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
384             return [(sub_lang_list[0], None, None)]
385         if self._downloader.params.get('subtitleslang', False):
386             sub_lang = self._downloader.params.get('subtitleslang')
387         elif 'en' in sub_lang_list:
388             sub_lang = 'en'
389         else:
390             sub_lang = list(sub_lang_list.keys())[0]
391         if not sub_lang in sub_lang_list:
392             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
393
394         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
395         return [subtitle]
396
397     def _extract_all_subtitles(self, video_id):
398         sub_lang_list = self._get_available_subtitles(video_id)
399         sub_format = self._downloader.params.get('subtitlesformat')
400         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
401             return [(sub_lang_list[0], None, None)]
402         subtitles = []
403         for sub_lang in sub_lang_list:
404             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
405             subtitles.append(subtitle)
406         return subtitles
407
408     def _print_formats(self, formats):
409         print('Available formats:')
410         for x in formats:
411             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
412
    def _real_initialize(self):
        """Set the UI language, log in and confirm age on YouTube.

        Credentials come from the downloader options or from .netrc.
        Language/login failures are reported as warnings and abort
        initialization; only a failed age confirmation raises.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language (forces English pages so later scraping is predictable)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden form tokens the login POST must echo back.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        # NOTE(review): field names/values mimic Google's login form as it
        # existed at the time; the non-ASCII '_utf8' value is presumably the
        # form's UTF-8 detection probe — confirm against the live form.
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is served again, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
515
516     def _extract_id(self, url):
517         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
518         if mobj is None:
519             raise ExtractorError(u'Invalid URL: %s' % url)
520         video_id = mobj.group(2)
521         return video_id
522
523     def _real_extract(self, url):
524         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
525         mobj = re.search(self._NEXT_URL_RE, url)
526         if mobj:
527             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
528         video_id = self._extract_id(url)
529
530         # Get video webpage
531         self.report_video_webpage_download(video_id)
532         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
533         request = compat_urllib_request.Request(url)
534         try:
535             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
536         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
537             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
538
539         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
540
541         # Attempt to extract SWF player URL
542         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
543         if mobj is not None:
544             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
545         else:
546             player_url = None
547
548         # Get video info
549         self.report_video_info_webpage_download(video_id)
550         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
551             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
552                     % (video_id, el_type))
553             video_info_webpage = self._download_webpage(video_info_url, video_id,
554                                     note=False,
555                                     errnote='unable to download video info webpage')
556             video_info = compat_parse_qs(video_info_webpage)
557             if 'token' in video_info:
558                 break
559         if 'token' not in video_info:
560             if 'reason' in video_info:
561                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
562             else:
563                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
564
565         # Check for "rental" videos
566         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
567             raise ExtractorError(u'"rental" videos not supported')
568
569         # Start extracting information
570         self.report_information_extraction(video_id)
571
572         # uploader
573         if 'author' not in video_info:
574             raise ExtractorError(u'Unable to extract uploader name')
575         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
576
577         # uploader_id
578         video_uploader_id = None
579         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
580         if mobj is not None:
581             video_uploader_id = mobj.group(1)
582         else:
583             self._downloader.report_warning(u'unable to extract uploader nickname')
584
585         # title
586         if 'title' not in video_info:
587             raise ExtractorError(u'Unable to extract video title')
588         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
589
590         # thumbnail image
591         if 'thumbnail_url' not in video_info:
592             self._downloader.report_warning(u'unable to extract video thumbnail')
593             video_thumbnail = ''
594         else:   # don't panic if we can't find it
595             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
596
597         # upload date
598         upload_date = None
599         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
600         if mobj is not None:
601             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
602             upload_date = unified_strdate(upload_date)
603
604         # description
605         video_description = get_element_by_id("eow-description", video_webpage)
606         if video_description:
607             video_description = clean_html(video_description)
608         else:
609             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
610             if fd_mobj:
611                 video_description = unescapeHTML(fd_mobj.group(1))
612             else:
613                 video_description = u''
614
615         # subtitles
616         video_subtitles = None
617
618         if self._downloader.params.get('writesubtitles', False):
619             video_subtitles = self._extract_subtitle(video_id)
620             if video_subtitles:
621                 (sub_error, sub_lang, sub) = video_subtitles[0]
622                 if sub_error:
623                     self._downloader.report_error(sub_error)
624
625         if self._downloader.params.get('allsubtitles', False):
626             video_subtitles = self._extract_all_subtitles(video_id)
627             for video_subtitle in video_subtitles:
628                 (sub_error, sub_lang, sub) = video_subtitle
629                 if sub_error:
630                     self._downloader.report_error(sub_error)
631
632         if self._downloader.params.get('listsubtitles', False):
633             sub_lang_list = self._list_available_subtitles(video_id)
634             return
635
636         if 'length_seconds' not in video_info:
637             self._downloader.report_warning(u'unable to extract video duration')
638             video_duration = ''
639         else:
640             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
641
642         # token
643         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
644
645         # Decide which formats to download
646         req_format = self._downloader.params.get('format', None)
647
648         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
649             self.report_rtmp_download()
650             video_url_list = [(None, video_info['conn'][0])]
651         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
652             url_map = {}
653             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
654                 url_data = compat_parse_qs(url_data_str)
655                 if 'itag' in url_data and 'url' in url_data:
656                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
657                     if not 'ratebypass' in url: url += '&ratebypass=yes'
658                     url_map[url_data['itag'][0]] = url
659
660             format_limit = self._downloader.params.get('format_limit', None)
661             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
662             if format_limit is not None and format_limit in available_formats:
663                 format_list = available_formats[available_formats.index(format_limit):]
664             else:
665                 format_list = available_formats
666             existing_formats = [x for x in format_list if x in url_map]
667             if len(existing_formats) == 0:
668                 raise ExtractorError(u'no known formats available for video')
669             if self._downloader.params.get('listformats', None):
670                 self._print_formats(existing_formats)
671                 return
672             if req_format is None or req_format == 'best':
673                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
674             elif req_format == 'worst':
675                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
676             elif req_format in ('-1', 'all'):
677                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
678             else:
679                 # Specific formats. We pick the first in a slash-delimeted sequence.
680                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
681                 req_formats = req_format.split('/')
682                 video_url_list = None
683                 for rf in req_formats:
684                     if rf in url_map:
685                         video_url_list = [(rf, url_map[rf])]
686                         break
687                 if video_url_list is None:
688                     raise ExtractorError(u'requested format not available')
689         else:
690             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
691
692         results = []
693         for format_param, video_real_url in video_url_list:
694             # Extension
695             video_extension = self._video_extensions.get(format_param, 'flv')
696
697             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
698                                               self._video_dimensions.get(format_param, '???'))
699
700             results.append({
701                 'id':       video_id,
702                 'url':      video_real_url,
703                 'uploader': video_uploader,
704                 'uploader_id': video_uploader_id,
705                 'upload_date':  upload_date,
706                 'title':    video_title,
707                 'ext':      video_extension,
708                 'format':   video_format,
709                 'thumbnail':    video_thumbnail,
710                 'description':  video_description,
711                 'player_url':   player_url,
712                 'subtitles':    video_subtitles,
713                 'duration':     video_duration
714             })
715         return results
716
717
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the age confirmation so that
        family-filtered videos are reachable for the rest of the session."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the video info dict from a metacafe.com watch URL.

        YouTube-hosted videos (ids starting with ``yt-``) are delegated to the
        YouTube extractor via url_result().
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # Extension is taken from the last three characters of the URL
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob embedded in the page
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # _download_webpage returns a unicode string, so the previous
        # .decode('utf-8') calls were wrong: they raise AttributeError on
        # Python 3 and implicitly ASCII-encode (crashing on non-ASCII
        # titles) on Python 2.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
813
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract a single video info dict from a Dailymotion video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id may carry a trailing title slug and/or query string; drop both
        video_id = mobj.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(flashvars_mobj.group(1))

        # Pick the first (i.e. best) quality key present in flashvars
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self.to_screen(u'Using %s' % quality_key)
                break
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')

        url_mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_mobj is None:
            raise ExtractorError(u'Unable to extract video URL')
        # JSON-escaped slashes need unescaping after URL-decoding
        video_url = compat_urllib_parse.unquote(url_mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_mobj.group('title'))

        # Uploader: regular owner markup first, then the "official user" markup
        video_uploader = None
        owner_mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_mobj is not None:
            video_uploader = owner_mobj.group(1)
        else:
            official_mobj = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_mobj is not None:
                video_uploader = official_mobj.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; reassemble as YYYYMMDD
        video_upload_date = None
        date_mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_mobj is not None:
            video_upload_date = date_mobj.group(3) + date_mobj.group(2) + date_mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
888
889
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the video info dict from a photobucket.com page.

        Prefers the JSON metadata embedded in the page's javascript; falls
        back to scraping the HTML when that blob is absent.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # The webpage (and thus these match groups) is already a unicode
        # string; the previous .decode('utf-8') calls would raise
        # AttributeError on Python 3 and mis-handle non-ASCII on Python 2.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
951
952
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video info from screen.yahoo.com.

        Two code paths exist: pages exposing a YUI CONTENT_ID use the YQL
        JSON API; older pages fall back to the cosmos.bcst.yahoo.com MRSS
        endpoints.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Date arrives as MM/DD/YYYY; normalize to YYYYMMDD
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # Check the match *before* touching its groups; the previous code
            # called m_rest.group() first, so a failed match raised
            # AttributeError instead of the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP wrapper before parsing
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1020
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract a Vimeo video by parsing the player config JSON embedded
        in the page, then build the play_redirect download URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # Catch Exception, not a bare except: a bare except also swallows
        # KeyboardInterrupt/SystemExit, hiding user aborts behind a bogus
        # "unable to extract" error.
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1122
1123
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return its raw body; network and value errors
        are converted to ExtractorError."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* with *regexFlags*, and return a dict
        built from *matchTuples*.

        Each tuple is (group_index, key, error_message); if the regex does
        not match at all, or a listed group is empty, ExtractorError is
        raised with the corresponding message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream javascript to locate the rtmp URL.

        NOTE(review): this method builds video_url but never returns it (and
        _real_extract discards the call's result), so live URLs currently
        yield no downloadable info — looks unfinished; confirm intent.
        """
        # Language code is embedded in the URL path (e.g. /fr/ or /de/)
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The js file contains the stream path, the swf player and the
        # rtmp host, in that order
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): computed but unused/unreturned — see docstring.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page to its final video info dict by following
        two levels of XML indirection (videoref file, then language-specific
        <video> ref)."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between the live-stream and arte+7 extraction paths
        based on the URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns None, so live URLs
            # produce no result here — confirm whether this is intentional.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1243
1244
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn the user that no specialized extractor matched (skipped in
        # test mode to keep test output clean)
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request carries no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # Same URL after following redirects means there was no redirect
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Last-resort extraction: follow redirects, then scrape the page
        for common flash-player file/source patterns."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        # derive extension and id from the media file's basename
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1378
1379
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    # GData v2 JSON-C search endpoint; takes the quoted query and a
    # 1-based start index, returns up to 50 results per page.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): decode() assumes *query* is a byte string; on
        # Python 3 this would raise AttributeError — presumably only
        # reached on 2.x. Verify before relying on it under 3.x.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData API 50 results at a time, stopping when
        either *n* results were collected or the API reports no more.
        Raises ExtractorError on network failure or empty result sets.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the true total; never page past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
1422
1423
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Scrapes the Google Video result pages until either *n* results
        are collected or there is no "next page" link.
        """
        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for page_index in itertools.count(1):
            page_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), page_index*10)
            page = self._download_webpage(page_url, u'gvsearch:' + query,
                                          note='Downloading result page ' + str(page_index))

            # Each result is an <h3 class="r"> heading wrapping the link.
            playlist['entries'].extend(
                {'_type': 'url', 'url': match.group(1)}
                for match in re.finditer(r'<h3 class="r"><a href="([^"]+)"', page))

            no_next_page = not re.search(self._MORE_PAGES_INDICATOR, page)
            if (page_index * 10 > n) or no_next_page:
                return playlist
1454
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the Yahoo JSON search endpoint 30 results at a
        time, stopping at *n* collected results or the reported last page.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            # 'm' carries paging metadata, 'results' the HTML snippets.
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # BUG FIX: the original dereferenced mobj.group()
                    # unconditionally, crashing on unparseable snippets.
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # BUG FIX: the original tested "pagenum * 30 + i >= n" here,
            # raising NameError when *results* was empty (``i`` unbound).
            # Counting collected entries is equivalent and always defined.
            if len(res['entries']) >= n or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1488
1489
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Fetch all playlist entries through the GData API and return a
        playlist result ordered by the videos' playlist position."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id may come from either alternative of the pattern.
        playlist_id = mobj.group(1) or mobj.group(2)
        positioned_urls = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            positioned_urls.extend(
                (entry['yt$position']['$t'], entry['content']['src'])
                for entry in response['feed']['entry']
                if 'content' in entry)

            # A short page is the last page.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break

        # Sort by playlist position, then keep only the URLs.
        video_urls = [pair[1] for pair in sorted(positioned_urls)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in video_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1555
1556
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from *page*, in order of
        first appearance."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        """Collect every video of a channel and return a playlist result."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML.
        first_url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(first_url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Any further pages come from the JSON-based channel_ajax query.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw_page = self._download_webpage(ajax_url, channel_id,
                                                  u'Downloading page #%s' % pagenum)
                payload = json.loads(raw_page)
                video_ids.extend(self.extract_videos_from_page(payload['content_html']))

                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1614
1615
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Return a playlist with all uploads of the given user.

        The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        pages are requested one by one until a short page signals the end.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the ids on this page, dropping duplicates.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id not in page_ids:
                    page_ids.append(video_id)

            video_ids.extend(page_ids)

            # A page shorter than the page size must be the last one —
            # no need to query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1672
1673
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist of every episode posted by the given user.

        Raises ExtractorError for malformed URLs or when the numeric
        user id cannot be found on the profile page.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # BUG FIX: the original called mobj.group(1) without a None check,
        # crashing with an opaque AttributeError when the attribute is
        # missing from the page.
        if mobj is None:
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # BUG FIX: the original checked the raw match for
                # membership but appended the unescaped form, so the
                # duplicate test could miss ids containing HTML entities.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1732
1733
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the real download URL and title for a depositfiles link.

        Simulates pressing the 'Free download' button, then scrapes the
        resulting page.  Raises ExtractorError with the site's restriction
        message when the download is refused.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # FIX: use a raw string for the regex ('\s+' relies on an
                # undefined-escape fallback and warns on newer Pythons).
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): these decode() calls assume Python 2 byte strings
        # (webpage is undecoded bytes here) — they would fail on Python 3.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1781
1782
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Matches /video/video.php?v=<id> and /photo.php?v=<id> links.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from the downloader's username/password params
        or from the 'facebook' entry in ~/.netrc.  Every failure path is
        a warning followed by return, so extraction proceeds
        unauthenticated rather than aborting.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available: skip the login step entirely.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login <form>, authentication
            # failed.  NOTE(review): login_results is undecoded bytes while
            # the pattern is str — presumably only exercised on Python 2;
            # verify before assuming this works on 3.x.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video stream URL and metadata from a Facebook page.

        Prefers the HD stream ('hd_src') and falls back to SD ('sd_src').
        Returns a single-element list with the info dictionary; raises
        ExtractorError when the embedded player data cannot be located.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Always fetch through the canonical video.php URL.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player configuration is a JSON object sandwiched between
        # these two exact javascript fragments in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON carrying the actual stream descriptors.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1879
1880
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        Handles three cases: /play/ URLs (resolved to the canonical
        episode URL and re-extracted), direct media responses, and the
        regular JSON metadata response requested via skin=json.
        Returns a single-element list with the info dictionary.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a player page whose URL fragment holds
        # the real file id; extract it and recurse on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the site for JSON metadata instead of the HTML page.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The iTunes user agent is required for the JSON skin to respond.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): decode() assumes a Python 2 byte string;
                # would raise on Python 3 where title is already str.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the open response to the downloader so the body
                    # is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # NOTE: urlh was bound inside the try block above; it is
                # guaranteed to exist here because any urlopen failure
                # already raised ExtractorError.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp example: "04-26-13 07:30PM" -> "20130426"
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
1974
1975
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Derive the flv media URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Fetch the watch page for this video id.
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media base path; the flv lives
        # next to it under the video id.
        media_match = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if media_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = media_match.group(1) + ('/%s.flv' % video_id)

        title_match = re.search('<title>([^<]+)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2014
2015 class ComedyCentralIE(InfoExtractor):
2016     """Information extractor for The Daily Show and Colbert Report """
2017
2018     # urls can be abbreviations like :thedailyshow or :colbert
2019     # urls for episodes like:
2020     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2021     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2022     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2023     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2024                       |(https?://)?(www\.)?
2025                           (?P<showname>thedailyshow|colbertnation)\.com/
2026                          (full-episodes/(?P<episode>.*)|
2027                           (?P<clip>
2028                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2029                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2030                      $"""
2031
    # Bitrate format ids offered by the feed, listed best-first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension for each format id.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame dimensions (WxH) per format id, shown by _print_formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }
2050
2051     @classmethod
2052     def suitable(cls, url):
2053         """Receives a URL and returns True if suitable for this IE."""
2054         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2055
2056     def _print_formats(self, formats):
2057         print('Available formats:')
2058         for x in formats:
2059             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2060
2061
    def _real_extract(self, url):
        """Extract all parts of a Daily Show / Colbert Report episode or clip.

        Returns a list of info dicts, one per <item> (video part) in the
        show's MRSS feed.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # A bare shortname (e.g. ':tds') means "newest full episode" of the
        # corresponding show; rewrite it to the full-episodes URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Single-clip URL: the title group name depends on the show.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Episode URL; an empty episode component means "newest episode".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The site redirects the full-episodes URL to the newest episode;
            # re-parse the URL we actually landed on.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # The media URI is embedded either in a player <param> tag or in a
        # 'var url = "..."' JavaScript assignment.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS feed lists one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid's last ':'-separated field is the short media id and
            # the one before it carries the show's domain name.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part configuration XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The feed only carries RTMP URLs; rewrite the final path segment
            # onto the llnwd.net HTTP base below.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2181
2182
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads the page's <meta> tags for description/thumbnail/player, then
    fetches the player configuration to find the actual media URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        webpage = self._download_webpage(url, show_name)

        # Page metadata lives in <meta> tags.
        description = unescapeHTML(
            re.search('<meta name="description" content="([^"]*)"', webpage).group(1))
        thumbnail = unescapeHTML(
            re.search('<meta property="og:image" content="([^"]*)"', webpage).group(1))
        player_url = unescapeHTML(
            re.search('<meta property="og:video" content="([^"]*)"', webpage).group(1))
        # The player URL carries an URL-encoded 'config=' parameter.
        config_url = compat_urllib_parse.unquote(
            re.search('config=(.*)$', player_url).group(1))

        config_json = self._download_webpage(config_url, show_name,
                                             u'Downloading configuration',
                                             u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumbnail,
            'description': description,
            'player_url': player_url,
        }]
2236
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): extractor is currently flagged as not working.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract video info via the moogaloop metadata XML and the f4m
        manifest; the resulting URL points at the first fragment of an
        HDS ('f4f') stream.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: site metadata (title, description, manifest URL).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            # findall(...)[0] raises IndexError when an expected tag is missing.
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Second request: the f4m manifest holding the real media id.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m elements live in the Adobe f4m XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Build the fragment URL from the manifest location and media id.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2298
2299
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the flash video URL, title and thumbnail from the page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media address is URL-encoded in a 'flv_url' query parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the page <title>, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        # The whole matched URL (group 0) is the thumbnail address.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2349
2350
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    The track URL is resolved through the public resolve API; the media
    address then comes from the track's stream definitions.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The URL carries both the uploader name and the track slug.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2407
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    Resolves the set URL through the public resolve API and then fetches
    the stream definitions for every track in the set.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the set id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader name and set slug are both encoded in the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            # The API reported failures; surface each one and bail out.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2470
2471
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the base64 'jsclassref' reference into an RTMP media URL."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64-encoded in the page source.
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder.
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # The last path component encodes both the id and the extension.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2518
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry maps bitrates to URL lists, select *bitrate*
        (falling back to the highest available one for 'best'/None/unknown
        values); when it is a flat list, there is no bitrate info and the
        list is returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Return the first URL in *url_list* that answers, or None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format, with bitrate when known."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a cloudcast via the public API.

        Fix: the regex groups and API strings are already text; the previous
        .decode('utf-8') calls raised AttributeError on Python 3, where str
        has no decode method.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2623
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a single video, a course page, or the site root.

        Course and root pages expand recursively: every linked page is fed
        back through self.extract() and the results are concatenated.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata XML lives next to the course's videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # findall(...)[0] raises IndexError when the tag is missing.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title; fall back to the course id when absent.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked VideoPage is extracted recursively.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Each linked CoursePage is extracted recursively (which in turn
            # expands its videos via the branch above).
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2724
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a music video from an mtv.com video page.

        Reads the mtv_vt/mtv_an/mtvn_uri meta tags and the default playlist
        id from the page, fetches the mediaGen metadata XML, and picks the
        highest-quality rendition.

        Fixes: _download_webpage already returns decoded text, so the old
        .decode('iso-8859-1') calls were removed (on Python 3 they raised
        AttributeError, since str has no decode method); the mtvn_uri error
        message was also garbled ('Unable to mtvn_uri').
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        # The mediaGen feed describes the available renditions for the video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2793
2794
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as multiple segments; each segment is returned
    as a separate info dict with ids '<video_id>_part00', '_part01', ...
    The real file id of each segment must first be decoded with a
    seed-driven character permutation (see _get_file_ID_mix_string).
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-random session id: millisecond timestamp
        followed by two random numbers, concatenated as a string."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the character lookup table used to decode file ids.

        A linear congruential generator, seeded with the server-provided
        'seed', repeatedly picks (and removes) one character from a fixed
        alphabet; the resulting ordering is the decode table used by
        _get_file_id.  Returns the shuffled characters as a list.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # LCG step; constants must match Youku's player exactly.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the real file id
        by looking each index up in the seed-derived mix string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # JSON playlist endpoint with title, seed, stream file ids and
        # per-segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user's format request onto what Youku offers:
            # 'best' prefers hd2 when available (else flv), 'worst'
            # picks mp4, and any other request falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One download key per video segment.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the (hex) segment number into characters 8-9 of the
            # decoded file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
2887
2888
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Fetch the page once; everything is extracted from it.
        webpage = self._download_webpage(url, video_id)

        def _extract(pattern, errnote):
            # Return the first capture group of pattern, or abort.
            match = re.search(pattern, webpage)
            if match is None:
                raise ExtractorError(errnote)
            return match.group(1)

        video_url = compat_urllib_parse.unquote(
            _extract(self.VIDEO_URL_RE, u'Unable to extract video url'))
        video_title = _extract(self.VIDEO_TITLE_RE, u'Unable to extract video title')
        video_thumbnail = _extract(self.VIDEO_THUMB_RE, u'Unable to extract video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2932
2933
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report that the post entry page is being downloaded."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the upload date extracted from the entry."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader extracted from the entry."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title extracted from the entry."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report that the video page is being extracted."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date (optional; leave None if not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional; leave None if not found)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution (each item is a (resolution, url) tuple)
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3043
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first (HTML-unescaped) group of rexp, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed: the key was misspelled 'uploader_date'; the downloader
            # only recognizes 'upload_date' (see the field list at the top
            # of this file), so the date was silently dropped.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3078
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Matches a whole channel, an archived broadcast (/b/<id>) or a
    # chapter of a broadcast (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of clips the justin.tv API returns per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one JSON API page and convert its clips to info dicts.

        Returns (number of clips in the response, list of info dicts for
        the clips that actually have a video URL).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API returns a dict carrying an 'error' key
            # instead of the expected list of clips.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' starts with YYYY-MM-DD; drop the dashes to
                # obtain the YYYYMMDD upload_date format.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through its archives below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter of a broadcast: resolve the enclosing archive first,
            # then download the whole archive file (see warning below).
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Pick the <archive> element matching the archive id found on
            # the page; 'a' keeps the matched element after the loop.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        # Page through the API until a short (or single) page signals the end.
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3211
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The stream URL sits in the second <source> tag of the player markup.
        url_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not url_match:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(url_match.group('url'))

        # Prefer the player headline; fall back to the page <title>.
        title_match = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not title_match:
            title_match = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not title_match:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        # The description is optional.
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3249
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on store.steampowered.com."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode syntax, so matching needs re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Go through the age gate, which lists every movie for the game.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        movie_matches = re.finditer(movie_re, webpage)
        title_matches = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_matches = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)
        videos = []
        # The three patterns hit the page in the same order, so zip pairs
        # each movie entry with its title and thumbnail.
        for movie, name, thumb in zip(movie_matches, title_matches, thumb_matches):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3294
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV is served straight from the CDN by recorded-video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        try:
            # Each .group() call raises AttributeError when its search
            # returned None; the except below maps that to ExtractorError.
            title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
            uploader_match = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
                                       webpage, re.DOTALL)
            uploader = unescapeHTML(uploader_match.group('uploader').strip())
            thumb = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage).group('thumb')
        except AttributeError:
            raise ExtractorError(u'Unable to extract info')
        # NOTE(review): unlike sibling extractors this returns a bare dict,
        # not a one-element list; preserved for the existing caller contract.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumb,
        }
3324
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The player receives the stream URL via a flashvars assignment.
        source_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if source_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = source_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)
            thumbnail = None

        return [{
                'id': video_id,
                'url': video_url,
                'title': title,
                'thumbnail': thumbnail,
                'ext': ext,
                }]
3373
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON assignment in a <script> tag.
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream; the extension comes from the URL path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
                'id': video_id,
                'url': video_url,
                'ext': video_ext,
                'title': data['title'],
                'description': data.get('teaser_text'),
                'location': data.get('country_of_origin'),
                'uploader': host.get('name'),
                'uploader_id': host.get('slug'),
                'thumbnail': data.get('image', {}).get('large_url_2x'),
                'duration': data.get('duration'),
        }]
3408
3409
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format,
        or None when no such format exists."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The site only serves the page once age verification is claimed.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = clean_html(result.group('uploader').strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 4th path component encodes resolution and bitrate,
            # e.g. '480p_370k_8004515' -> size '480p', bitrate '370k'.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # Fixed: this used to test the stale regex match 'result'
            # (always non-None at this point), so an unavailable format was
            # never reported and [None] was returned. Test the lookup result.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3524
3525
3526
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Fixed: the message wrongly said 'Unable to extract video title'
            # although this branch fails on the upload date.
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3565
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            # Dropped the redundant "ERROR: " prefix for consistency with the
            # other extractors in this file (ExtractorError already flags it)
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
3610
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of songs)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JS assignment
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        # NOTE(review): relies on `json` being in scope via
        # `from .utils import *` — no direct `import json` at file top; confirm
        data = json.loads(json_like)

        # A random session id is required by the play API and must be reused
        # across all requests for this mix
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # Walk the mix one track at a time; each response tells us whether we
        # reached the last track and which track id to request next
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3654
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Media and thumbnail URLs follow a fixed CDN pattern built from the id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        video_title = unescapeHTML(title_match.group('title'))

        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        video_uploader = clean_html(uploader_match.group('uploader'))

        return [{
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': video_title,
                'thumbnail': thumbnail,
                'uploader': video_uploader
        }]
3678
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE, which the
        # base class match does not apply
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk or a playlist of talks
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each playlist entry carries a data-id and data-mediaslug attribute
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Pair ids with talk URLs and delegate each talk back to this IE;
        # assumes both regexes match in the same document order
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talk details are embedded as a JS object; we only need the
        # numeric id and the media slug used to build the download URL
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3757
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: previously `format = ext`, but no variable `ext` exists
            # in this scope (NameError); fall back to the file extension
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3811
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The flash player configuration is served as a separate XML document
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> entry holds the best variant
        best = idoc[-1]
        filename = best.findall('./filename')[0].text
        duration = float(best.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3844
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        url_match = re.search(r'file: "(.*?)",', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to find video url')
        video_url = url_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find video title')
        # Strip the site branding from the og:title
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
3891
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek (ardmediathek.de / daserste.de)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter if present
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams usually means an age-restricted ("fsk") video that is
            # only served in the evening
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
3930
class TumblrIE(InfoExtractor):
    """Information extractor for videos hosted on tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # BUG FIX: message typo ("founded" -> "found")
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail; guard against a missing
        # match instead of crashing with AttributeError
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        m_thumb = re.search(re_thumb, webpage)
        thumb = m_thumb.group('thumb').replace('\\', '') if m_thumb is not None else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
3964
class BandcampIE(InfoExtractor):
    """Information extractor for free track downloads on bandcamp.com."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # BUG FIX: message typo ("founded" -> "found")
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from `id` to avoid shadowing the builtin
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
4010
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        url_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = url_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4041         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Metadata is published as an MRSS feed consumed by the player
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        url_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = url_match.group(1)

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4070
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>[\d]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Rebuild the canonical webpage URL from the numeric id
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Direct mp4 URL embedded in the player configuration
        mobj = re.search(r'\'file\': "(http://mobile-media\.howcast\.com/\d+\.mp4)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = mobj.group(1)

        # The og:title meta content may be quoted with either " or '
        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1) or mobj.group(2)

        # Description is optional; warn rather than fail
        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if mobj is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = mobj.group(1) or mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
        }]
4106
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        HowcastIE(),
        # GenericIE acts as the catch-all, so keep it last
        GenericIE()
    ]
4167
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention
    class_name = '%sIE' % ie_name
    return globals()[class_name]