Retry: disable the YouTube rate limit to unlock full bandwidth
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run (flipped lazily by initialize()).
    _ready = False
    # FileDownloader instance, injected via set_downloader().
    _downloader = None
    # Subclasses set this to False when the extractor is known-broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the display name by stripping the conventional 'IE'
        # class-name suffix; subclasses may override with a class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle

        note=False suppresses all progress output; note=None prints the
        default "Downloading webpage" message.
        Raises ExtractorError (carrying the current traceback) on any
        network/HTTP failure.  NOTE: requires the file-level `import sys`.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour an explicit charset in the Content-Type header; fall back
        # to UTF-8 with replacement for undecodable bytes.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps the dump printable regardless of page encoding
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
190
191
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Fetches the watch page and the get_video_info endpoint, decodes the
    url_encoded_fmt_stream_map and returns one info dictionary per
    requested/available format.
    """

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display dimensions (listed as HEIGHTxWIDTH)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; let the playlist IE claim them.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} or, on error, a (message, None) tuple."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download one subtitle track.

        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Download the single requested (or default) subtitle track.

        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print itag, extension and dimensions for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden form tokens that must be echoed back on login.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form reappearing in the response means the login failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the bare video ID (group 2 of _VALID_URL) or raise."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants, first response carrying
        # a token wins.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_map = {}
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    # Not every stream entry carries a separate 'sig' field
                    # (some URLs already embed their signature); appending
                    # unconditionally raised KeyError for those videos.
                    if 'sig' in url_data:
                        url += '&signature=' + url_data['sig'][0]
                    # Ask the server to skip throttling so the full
                    # bandwidth is available.
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
677
678
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 captures the video id, group 2 the slugified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter pages used by _real_initialize to disable age filtering.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the age-confirmation form.

        This primes the HTTP session so that age-restricted videos are
        served on subsequent requests.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age: 'filters': '0' disables the family filter.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the video information for a metacafe.com watch URL.

        Returns a single-element result list, or delegates to the YouTube
        extractor for mirrored "yt-" ids.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube: ids of the form "yt-<id>" are
        # hosted there, so hand them off to that extractor.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Direct media URL present in the page.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available; it must be appended as the
            # __gda__ query parameter for the media URL to be usable.
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse the flashvars parameter for a mediaData JSON
            # fragment containing the media URL and access key.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            # Un-escape the JSON-escaped slashes.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode('utf-8') assumes the page is a byte string
        # (Python 2); under Python 3 this fails on str -- confirm.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
774
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the info dictionary for a Dailymotion video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the path component up to the first '_' or '?'.
        video_id = mobj.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos are served too.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The player configuration lives in a JS "flashvars" variable.
        flashvars_mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(flashvars_mobj.group(1))

        # Pick the best available quality, from highest to lowest.
        max_quality = None
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self.to_screen(u'Using %s' % quality_key)
                break
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')

        url_mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_mobj is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(url_mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_mobj.group('title'))

        # Uploader: try the regular owner span first, then the official
        # author span; warn (but do not fail) when neither matches.
        video_uploader = None
        owner_mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_mobj is not None:
            video_uploader = owner_mobj.group(1)
        else:
            official_mobj = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_mobj is not None:
                video_uploader = official_mobj.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The upload date appears as DD-MM-YYYY; convert to YYYYMMDD.
        video_upload_date = None
        date_mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_mobj is not None:
            video_upload_date = date_mobj.group(3) + date_mobj.group(2) + date_mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
849
850
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract video info from a photobucket.com URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Preferred path: the page embeds a JSON blob with full media metadata.
        json_mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if json_mobj is not None:
            info = json.loads(json_mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # Fallback: scrape the video_src link and the <title> tag.
        src_mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if src_mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(src_mobj.group(1))

        title_mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if title_mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_mobj.group(1).decode('utf-8')
        video_uploader = title_mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
912
913
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video URL.

        Non-/watch/ URLs are first rewritten to their canonical /watch/
        form and re-extracted (new_video=False guards the recursion).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video uploader')
        # BUG FIX: group(1) is the '(people|profile)' path alternation, so the
        # old code always reported the literal string 'people'/'profile' as
        # the uploader. The uploader name is the link text in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width; both are required parameters of
        # the playlist request below.
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1031
1032
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract the info dictionary for a vimeo.com video URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player page.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUG FIX: was a bare "except:", which also swallows
        # KeyboardInterrupt/SystemExit. Only the split (IndexError) and
        # json.loads (ValueError) can fail here.
        except (IndexError, ValueError):
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (ISO date in a meta tag -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1134
1135
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in "index-<n>.html".
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map match groups to an info dict.

        matchTuples is a list of (group_index, key, error_message); each
        group must have matched or an ExtractorError with that message is
        raised.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the RTMP url/path/player for a live-stream page.

        NOTE(review): video_url is computed at the end but never returned,
        and _real_extract discards the call's result -- live-stream
        extraction appears unfinished; confirm intended behavior.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract the info dict for an Arte+7 (catch-up) video page.

        Follows a three-step chain: the flash movie param points to a
        videoref XML, which points to a language-specific <video> XML,
        which finally holds id/name/date and the HD url.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # NOTE(review): .decode('utf-8') assumes a byte-string title
            # (Python 2); under Python 3 this would fail on str -- confirm.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams go through the (unfinished) live path and currently
        # yield no result; everything else uses the Plus7 extractor.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1255
1256
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn the user, since this extractor is only a heuristic of last
        # resort (warning suppressed while running the test suite).
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Heuristically extract a direct media URL from an arbitrary page."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # Derive the extension and the id from the media file name.
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1388
1389
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles the pseudo-URLs ytsearch:<terms>, ytsearchN:<terms> and
    ytsearchall:<terms>, resolving them to watch-page results via the
    GData API.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search pseudo-URL and return the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split only on the first ':' -- the search terms themselves may
        # legitimately contain colons (unbounded split would raise
        # ValueError on unpacking here).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # Was missing the return, silently discarding all results.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves pages of 50; `limit` shrinks to the real total
        # once the first response reports it.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the final 50-entry page.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1456
1457
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles the pseudo-URLs gvsearch:<terms>, gvsearchN:<terms> and
    gvsearchall:<terms>.
    """
    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Parse the search pseudo-URL and return a playlist of results."""
        mobj = re.match(self._VALID_URL, query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        # Results come in pages of 10; keep fetching until we have at
        # least n entries or the "next page" link disappears.
        for pagenum in itertools.count(1):
            result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # >= (was >) avoids fetching one needless extra page when n is
            # a multiple of 10.
            if (pagenum * 10 >= n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # Pages hold 10 results each, so we may have overshot;
                # trim to the number actually requested (matches the
                # truncation YoutubeSearchIE performs).
                res['entries'] = res['entries'][:n]
                return res
1506
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles the pseudo-URLs yvsearch:<terms>, yvsearchN:<terms> and
    yvsearchall:<terms>.  Queues downloads directly instead of returning
    info dicts (legacy behavior).
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search pseudo-URL and queue the requested downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split only on the first ':' -- the search terms themselves may
        # legitimately contain colons (unbounded split would raise
        # ValueError on unpacking here).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link: queue whatever was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1585
1586
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex (must be compiled with re.VERBOSE, see suitable()).
    # Group 1 captures the id from full playlist/course/watch URLs;
    # group 2 captures naked PL/EC/UU playlist ids.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData playlist feed; takes (playlist_id, max-results, start-index).
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    # Page size of each API request.
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the re.VERBOSE flag,
        # which the default implementation does not pass.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Fetch all playlist entries via the GData API and return them
        as a single playlist result, sorted by playlist position."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            # start-index is 1-based.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, watch-URL) pairs so order can be restored
            # after all pages are fetched; entries without 'content'
            # (e.g. deleted videos) are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then drop the position.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1652
1653
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from the page, in order, deduplicated."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        """Collect every video of a channel and return it as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)

        # The first page is plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        video_ids = self.extract_videos_from_page(page)

        # Any further pages come from the JSON-based channel_ajax endpoint,
        # fetched until the "load more" widget disappears.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                   for video_id in video_ids]
        return [self.playlist_result(entries, channel_id)]
1711
1712
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Return every upload of a user as a single playlist result."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries,
        # so walk the pages until one of them comes back short.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect this page's video ids, skipping duplicates.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one, so we can
            # stop without issuing another query.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title=username)]
1769
1770
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return all videos of a blip.tv user as a playlist.

        Raises ExtractorError when the URL is invalid or the numeric
        user id cannot be located on the user page.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Without the numeric user id the episode-list URL cannot be
            # built; fail with a clear message instead of an AttributeError.
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1829
1830
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles link to the direct file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        form_match = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if form_match is None or form_match.group(1) is None:
            # Try to figure out reason of the error.
            attention_match = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if attention_match is not None and attention_match.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', attention_match.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = form_match.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        title_match = re.search(r'<b title="(.*?)">', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = title_match.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
1878
1879
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials.

        Best-effort: login failures only emit warnings, since public
        videos remain accessible anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials; proceed anonymously.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form is only present in the response when login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract title, mp4 URL, duration and thumbnail for one video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON flashvars are wedged between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer HD, fall back to SD.  Use .get() for both so a missing
        # 'sd_src' key reaches the explicit error below instead of
        # raising a bare KeyError.
        video_url = video_data.get('hd_src') or video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1976
1977
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase alphanumeric extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info, either from a direct-download response or
        from blip.tv's JSON API (skin=json)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; resolve it and recurse with the canonical /a/a-<id> URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # Ask the server for the JSON representation of the page.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different (JSON-capable) content to iTunes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The response is the media itself; derive id/ext from the URL.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the open response to the downloader so the body
                    # already received is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # Reuses the urlh opened above (still unread in this branch).
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is sometimes wrapped in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp example: "04-21-12 10:30AM" -> YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    # Must match the request header above or the media URL
                    # may not resolve.
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2071
2072
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        """Extract the flv URL and title for a myvideo.de watch page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group(1)

        # Fetch the canonical watch page for this id
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The media server base URL is embedded in the thumbnail link
        media_match = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                                webpage)
        if media_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = '%s/%s.flv' % (media_match.group(1), video_id)

        title_match = re.search('<title>([^<]+)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2111
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate identifiers, used for --list-formats output together
    # with the extension/dimension tables below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate id.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate id.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode pattern and must
        # be matched with the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print each format id with its extension and display size."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return one info dict per part of the requested episode or clip."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Shortcuts like :tds or :colbert mean "newest full episode".
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "newest episode": we must follow
            # the site's redirect (handled below) to find the concrete one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Re-parse the URL the server redirected us to.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # The mgid-style media URI is embedded either in a Flash <param>
        # or in a "var url = ..." script assignment.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        # The MRSS index contains one <item> per part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            # Collect (bitrate, rtmp-url) pairs for this part.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp(e) stream URL into the equivalent plain-HTTP
            # URL on the mtvnmobile CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2278
2279
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract video info from an escapistmagazine.com video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        show_name = match.group('showname')
        video_id = match.group('episode')

        self.report_extraction(show_name)
        page = self._download_webpage(url, show_name)

        # Metadata lives in the page's <meta> tags; the player URL carries
        # the configuration URL as a quoted "config" query argument.
        desc_m = re.search('<meta name="description" content="([^"]*)"', page)
        description = unescapeHTML(desc_m.group(1))
        img_m = re.search('<meta property="og:image" content="([^"]*)"', page)
        thumb_url = unescapeHTML(img_m.group(1))
        player_m = re.search('<meta property="og:video" content="([^"]*)"', page)
        player_url = unescapeHTML(player_m.group(1))
        config_m = re.search('config=(.*)$', player_url)
        config_url = compat_urllib_parse.unquote(config_m.group(1))

        config_text = self._download_webpage(config_url, show_name,
                                             u'Downloading configuration',
                                             u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # The second playlist entry holds the actual media URL.
        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2333
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract video info by chaining the metadata XML and the f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        # Pull title/description/thumbnail plus the manifest location from
        # the metadata XML.
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        # The f4m (Adobe HDS) manifest supplies the real media id; note that
        # video_id is deliberately re-bound here to the manifest's <id> text.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Assemble the fragment URL from the manifest host and media id.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2395
2396
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract video URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is percent-encoded in the flv_url page parameter
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title is taken from <title>, stopping before the site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        # The whole matched thumbnail URL (group 0) is kept
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2446
2447
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a soundcloud track URL and return its mp3 stream info."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and song slug are both encoded in the URL path
        uploader = match.group(1)
        slug_title = match.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL into the track's API representation
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        track_info = json.loads(info_json)
        video_id = track_info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions for the resolved track id
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)

        return [{
            'id':       track_info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': track_info['user']['username'],
            'upload_date': unified_strdate(track_info['created_at']),
            'title':    track_info['title'],
            'ext':      u'mp3',
            'description': track_info['description'],
        }]
2504
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a soundcloud set URL and return info for every track."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and set slug are both encoded in the URL path
        uploader = match.group(1)
        slug_title = match.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL into the set's API representation
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        entries = []
        for track in info['tracks']:
            track_id = track['id']

            # One stream-definitions request per track in the set
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, track_id, u'Downloading track info JSON')

            self.report_extraction(track_id)
            streams = json.loads(stream_json)

            entries.append({
                'id':       track_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return entries
2567
2568
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the rtmpe media URL and metadata from an infoq page."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The RTMP path is base64-encoded in the page's jsclassref variable
        ref_match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if ref_match is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(ref_match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        title_match = re.search(r'contentTitle = "(.*?)";', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        # Description is optional; fall back to a placeholder
        desc_match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if desc_match is not None:
            video_description = desc_match.group(1)
        else:
            video_description = u'No description available.'

        # Derive id and extension from the URL's final path component
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2615
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format name -> either {bitrate: [urls]} or [urls];
        'best' (or an unknown bitrate) selects the highest available one.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if all are dead."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract the first reachable audio URL for a mixcloud cloudcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE: regex groups are already text strings; the previous
        # .decode('utf-8') calls raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Initialize both before the search so an empty 'formats' dict can
        # never leave them unbound (previously a NameError risk).
        format_param = None
        file_url = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # check_urls returns None when every candidate URL is dead; fail
        # cleanly instead of crashing on file_url.split below.
        if file_url is None:
            raise ExtractorError(u'Unable to extract media URL')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': format_param if format_param is not None else u'NA',
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2720
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a single video, a course playlist, or the whole site,
        depending on which query components the URL carries."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            # Extension is whatever follows the last dot in the file name.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Recurse into every video page linked from the course page.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Recurse into every course page linked from the home page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2821
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the video URL and metadata from an mtv.com video page.

        Reads the mtv_vt (song), mtv_an (performer) and mtvn_uri meta tags
        plus the default playlist id from the page, then downloads the
        mediaGen XML and picks the last (highest quality) rendition.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns a decoded unicode string, so the
        # former .decode('iso-8859-1') calls were redundant on Python 2 and
        # raised AttributeError on Python 3; they have been removed.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message fixed: it previously read 'Unable to mtvn_uri'.
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Guard: renditions[-1] below would otherwise raise a bare IndexError.
            raise ExtractorError(u'Unable to extract media renditions')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2890
2891
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a session id: current time in ms plus two random numbers."""
        return "%d%d%d" % (int(time.time() * 1000),
                           random.randint(1000, 1998),
                           random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the key alphabet from the given seed."""
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        # Simple LCG-style PRNG; each step picks (and removes) one character.
        while pool:
            state = (state * 211 + 30031) % 65536
            pick = int(math.floor(state / 65536 * len(pool)))
            shuffled.append(pool.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated file id via the mixed alphabet."""
        alphabet = self._get_file_ID_mix_string(seed)
        return ''.join(alphabet[int(tok)] for tok in fileId.split('*') if tok)

    def _real_extract(self, url):
        """Resolve a youku video id into one info dict per flv/mp4 segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            if requested is None or requested == 'best':
                # Prefer HD when the server offers it; container stays flv.
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Columns 8-9 of the decoded file id carry the segment number, so
        # fileid[8:10] is replaced with the hex index for every segment.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
2984
2985
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Scrape flv URL, title and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        def _extract_field(pattern, errnote):
            # First capture group of the pattern, or raise with errnote.
            match = re.search(pattern, webpage)
            if match is None:
                raise ExtractorError(errnote)
            return match.group(1)

        video_url = compat_urllib_parse.unquote(
            _extract_field(self.VIDEO_URL_RE, u'Unable to extract video url'))
        video_title = _extract_field(self.VIDEO_TITLE_RE, u'Unable to extract video title')
        video_thumbnail = _extract_field(self.VIDEO_THUMB_RE, u'Unable to extract video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3029
3030
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Log the post URL being downloaded."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Log the entry's upload date."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Log the entry's uploader."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Log the entry's title."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Log the video page being extracted."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Scrape a Google+ post page, follow it to the video page and pick
        the highest-resolution googlevideo link."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)
        video_extension = 'flv'

        # Step 1: the post page carries the metadata and the video-page link.
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Upload date, normalized to YYYYMMDD for filename use.
        upload_date = None
        date_match = re.search('title="Timestamp">(.*?)</a>', webpage)
        if date_match:
            upload_date = datetime.datetime.strptime(date_match.group(1), "%Y-%m-%d").strftime('%Y%m%d')
        self.report_date(upload_date)

        uploader = None
        uploader_match = re.search(r'rel\="author".*?>(.*?)</a>', webpage)
        if uploader_match:
            uploader = uploader_match.group(1)
        self.report_uploader(uploader)

        # The first line of the meta description doubles as the title.
        video_title = u'NA'
        title_match = re.search(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage)
        if title_match:
            video_title = title_match.group(1)
        self.report_title(video_title)

        # Step 2: simulate clicking the image box to reach the video page.
        page_match = re.search('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', webpage)
        if page_match is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = page_match.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Collect (resolution, url) pairs for every available size.
        found = re.findall('\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"', webpage)
        if len(found) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # The highest resolution sorts last; keep only that tuple's URL.
        video_url = sorted(found)[-1][-1]
        # Unescape \u0026-style sequences embedded in the URL.
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3140
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the Turner CDN mp4 URL for an nba.com video and scrape
        title, date and description from the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The video itself is served from a predictable CDN location.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First capture group of the match, HTML-unescaped, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        # Bug fix: the key was misspelled 'uploader_date', so the scraped date
        # was silently ignored; the documented field (see the field list at
        # the top of this file) is 'upload_date' in YYYYMMDD form, hence the
        # unified_strdate normalization.
        raw_date = _findProp(r'<b>Date:</b> (.*?)</div>')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'upload_date': unified_strdate(raw_date) if raw_date else None,
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3175
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Matches a channel page, an archived broadcast (/b/<id>) or a
    # chapter (/c/<id>) on either justin.tv or twitch.tv.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of clips the justin.tv API returns per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns (total clips in the response, info dicts for the clips that
        actually have a video_file_url).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API returns a dict with an 'error' field
            # instead of a list of clips.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; stripping dashes yields
                # the YYYYMMDD upload date.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: channel archive (paged), chapter (resolved
        to its parent broadcast), or single archived video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through every archived broadcast below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: raise only if no <archive> element matches archive_id;
            # on break, 'a' keeps the matching element for use below.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A page shorter than the limit means the archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3308
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract the mp4 URL, title and description from a funnyordie.com page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        source_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not source_match:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source_match.group('url'))

        # Prefer the player headline; fall back to the page <title>.
        title_match = (re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
                       or re.search(r'<title>(?P<title>[^<]+?)</title>', webpage))
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3346
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Return True if this IE can handle the URL (verbose-mode regex)."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Collect every trailer listed on a Steam store page as a playlist."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Request through the age gate using a fixed birth date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        name_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        videos = []
        # The three iterators advance in lockstep: one movie entry, one
        # title span and one thumbnail per trailer.
        for movie, name, thumb in zip(re.finditer(movie_re, webpage),
                                      re.finditer(name_re, webpage),
                                      re.finditer(thumb_re, webpage)):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail')
            })
        return [self.playlist_result(videos, gameID, game_title)]
3391
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN URL for a recorded stream and scrape title/uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3413
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the flash-variable video URL plus title and thumbnail."""
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        src_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if src_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = src_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            # WSHH candy pages have no image_src tag; they also carry the
            # real title in a dedicated span, so replace it when present.
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
3462
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Parse the embedded gon.show JSON for the stream URL and metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        meta_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not meta_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(meta_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps variant; the extension comes from the URL path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
                'id': video_id,
                'url': video_url,
                'ext': video_ext,
                'title': data['title'],
                'description': data.get('teaser_text'),
                'location': data.get('country_of_origin'),
                'uploader': host.get('name'),
                'uploader_id': host.get('slug'),
                'thumbnail': image.get('large_url_2x'),
                'duration': data.get('duration'),
        }]
3497
3498
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' matches req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The site requires an age-verification cookie to serve the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional: warn instead of failing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional: warn instead of failing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes resolution and bitrate, e.g. 480p_370k_8004515
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this previously tested the stale `result` variable
            # (always non-None here), so an unavailable requested format
            # silently returned [None] instead of raising.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3613
3614
3615
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the id and the (slug) title come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the failure here concerns the upload date, not the title
            raise ExtractorError(u'Unable to extract video date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3654
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('videoid')

        # The watch page only contains a reference to the embedded player.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        # The embed page carries the actual media URL inside a flashvars call.
        webpage = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3699
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JS assignment.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # The play API tracks state per session, so make up a session id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        # Walk the mix one track at a time until the API reports the last one.
        for index in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
        return entries
3743
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Title comes from the OpenGraph metadata, uploader from the bio block.
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)

        return [{
                'id': video_id,
                # Media and thumbnail URLs follow a fixed CDN pattern.
                'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
                'ext': 'mp4',
                'title': unescapeHTML(title_match.group('title')),
                'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
                'uploader': clean_html(uploader_match.group('uploader'))
        }]
3767
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the default matcher
        # (which does not pass re.VERBOSE) must be overridden here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction depending on the URL kind."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        # Pair each <li> talk entry with its title anchor; this assumes both
        # appear in the same order on the page -- TODO confirm against markup.
        # Each entry is delegated back to the TED extractor via url_result,
        # so the actual media URL is resolved later by _talk_info.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is an inline JavaScript object; the numeric id and the
        # mediaSlug are pulled out of it with a verbose-mode regex.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3846
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name `ext`
            # (NameError); fall back to the file extension derived above.
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional in the metadata document.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3900
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Per-video XML document lists the available files.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last <type> entry of the document.
        best = idoc[-1]
        filename = best.findall('./filename')[0].text
        duration = float(best.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3933
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = url_match.group('video_id')
        webpage = self._download_webpage(url, video_id)

        # The player configuration contains the direct file URL.
        file_match = re.search(r'file: "(.*?)",', webpage)
        if not file_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = file_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        # Strip the site branding prefix from the OpenGraph title.
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        by_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(by_match.group(1)) if by_match else None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
3980
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer an explicit documentId query parameter over the URL path.
        url_match = re.match(self._VALID_URL, url)
        doc_id_match = re.search(r'documentId=([0-9]+)', url)
        if doc_id_match:
            video_id = doc_id_match.group(1)
        else:
            video_id = url_match.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [s.groupdict() for s in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s['media_type']) == 0]
        stream = max(default_streams, key=lambda s: int(s['quality']))

        # Two delivery modes exist: an RTMP stream or a plain HTTP download.
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info['url'] = stream['rtmp_url']
            info['play_path'] = stream['video_url']
        else:
            assert stream['video_url'].endswith('.mp4')
            info['url'] = stream['video_url']
        return [info]
4019
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL (the input may carry a slug).
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded as an escaped (\x22-quoted) string.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # BUG FIX: corrected the message typo ("founded" -> "found").
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4053
class BandcampIE(InfoExtractor):
    """Information extractor for free track downloads on bandcamp.com."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # BUG FIX: corrected the message typo ("founded" -> "found").
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from `id` to avoid shadowing the builtin.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
4099
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The direct media URL sits in the HTML5 <source> element.
        url_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      url_match.group(1),
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # All metadata is served through the player's MRSS notice document.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        url_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      url_match.group('mp4url'),
            'ext':      'mp4',
            'title':    title_match.group('titre'),
        }]
4159
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Order matters: more specific extractors must precede more generic ones,
    # and GenericIE is the catch-all at the end.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        GenericIE,
    )
    return [ie_class() for ie_class in ie_classes]
4219
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention at module level.
    class_name = ie_name + 'IE'
    return globals()[class_name]