Merge remote-tracking branch 'origin/master'
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run for this instance (see initialize()).
    _ready = False
    # The FileDownloader this IE reports progress/warnings to; set via set_downloader().
    _downloader = None
    # Subclasses set this to False when the extractor is known to be broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Subclasses must define _VALID_URL; no match means "not mine".
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing "IE" suffix
        # (e.g. "YoutubeIE" -> "Youtube"). Subclasses may shadow this with
        # a plain class attribute instead.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note semantics: None -> default "Downloading webpage" message,
        # False -> stay silent, anything else -> printed verbatim.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): `sys`, `compat_*` and ExtractorError are expected to
            # arrive via "from .utils import *" — confirm utils exports them.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Pick the charset from the Content-Type header when present,
        # otherwise assume UTF-8.
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                # url_or_request may be a Request object or a plain string URL.
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # Base64 keeps binary-ish responses printable on any terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' so a bad/incorrectly declared charset never aborts extraction.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        # id/title are only attached when truthy, so callers can omit them.
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
190
191
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string (used by _print_formats / 'format').
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Defer playlist URLs to YoutubePlaylistIE, which would otherwise
        # also match the loose _VALID_URL pattern below.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        # Returns either a {lang_code: track_name} dict on success, or a
        # (error_message, None) tuple on failure — callers check with
        # isinstance(..., tuple).
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Scrape name="..."/lang_code="..." attribute pairs out of the XML track list.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        # Print-only helper for --list-subs; returns nothing.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        # error_message is None on success; sub_lang/sub are None on failure.
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit --sub-lang, then English, then whatever
        # the track list yields first (dict order — effectively arbitrary here).
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        # Like _extract_subtitle but fetches every available language.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        # Console output for --list-formats: "itag : ext [dimensions]".
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Sets the site language, then (optionally) logs in and confirms age.
        # All failures before login are soft: a warning is emitted and
        # extraction proceeds anonymously.
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the anti-forgery tokens (GALX, dsh) from the login form;
        # either may legitimately stay None if the page layout changed.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is served back, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            # Unlike the steps above, a failed age confirmation is fatal.
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the 11-ish character video ID embedded in *url*.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # group(1) is the optional scheme/host/path prefix; group(2) is the ID.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: retry get_video_info with different 'el' variants
        # until one of them returns a 'token' (different variants work for
        # embedded/vevo/age-restricted videos).
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then let unified_strdate parse it.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                # First element of the error tuple is the message (None on success).
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            # --list-subs only prints; no info dicts are produced.
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # The stream map is a comma-separated list of URL-encoded dicts,
            # one per available format (itag).
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream dict carries a 'sig' key; a
            # stream without one raises KeyError here — confirm against current
            # stream maps (encrypted signatures use 's' instead).
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at format_limit (lists are ordered best-first).
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected (format, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # e.g. "22 - 720x1280"; falls back to the extension for RTMP
            # streams where format_param is None.
            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
674
675
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter endpoints: GET the disclaimer page, then POST the
    # "over 18" form once per session to unlock age-gated videos.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Disable the family filter for this session.

        Fetches the disclaimer page and then posts the age-confirmation
        form so that age-gated videos are reachable in _real_extract().
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract media URL, title and uploader for a Metacafe watch page.

        Returns a single-element list containing the info dictionary
        expected by the FileDownloader.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube: ids of the form "yt-<id>" are
        # mirrors of YouTube videos, so delegate to the YouTube extractor.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # The last three characters of the media URL are taken as the
            # file extension ("flv", "mp4", ...).
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available; it is an access token that must
            # be appended to the media URL as the __gda__ query parameter.
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse the "flashvars" form value and the embedded
            # "mediaData" blob it carries.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            # Undo the JSON escaping of forward slashes.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): the .decode('utf-8') calls below assume Python 2
        # byte strings; on already-decoded text (Python 3) they would
        # fail -- confirm before porting.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
771
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best-quality media URL, title, uploader and upload
        date for a Dailymotion video page; returns a one-element list with
        the info dictionary."""
        # The video id is the URL path component, stripped of the title
        # suffix ("_...") and any query string ("?...").
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that age-gated
        # videos remain reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # The player configuration lives in a "flashvars" JS variable.
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe the known quality keys from best to worst and keep the
        # first one present in the flashvars blob.
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((k for k in quality_keys if k in flashvars), None)
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')
        self.to_screen(u'Using %s' % max_quality)

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # Uploader: try the regular owner <span> first, then fall back to
        # the official-user markup; warn (do not fail) if neither matches.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The upload date is rendered as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
846
847
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract media URL, uploader and title for a Photobucket video.

        Tries the JSON blob embedded in the page's javascript first and
        falls back to scraping the old-style HTML markup.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        # Extension comes straight from the URL ("flv" or "mp4").
        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                # creationDate is a unix timestamp; render it as YYYYMMDD.
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # The <title> tag carries both the video title and the uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): the .decode('utf-8') calls below assume Python 2
        # byte strings; they would fail on already-decoded text -- confirm
        # before porting.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
909
910
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo! Video URL.

        Valid but non-extractable URLs are first rewritten to their
        canonical English '/watch/' form, on which the method recurses
        once with new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video uploader')
        # BUG FIX: group(1) is the '(people|profile)' path fragment of the
        # link; the uploader name is the anchor text captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width; both are required parameters of
        # the playlist request below.
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1028
1029
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Vimeo page.

        Parses the embedded player config JSON and picks the best
        available quality/codec combination.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize: force https and resolve direct player links to the
        # canonical video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's javascript.
        # BUG FIX: the bare `except:` here also swallowed KeyboardInterrupt
        # and programming errors; only IndexError (marker missing in the
        # split) and ValueError (json.loads on malformed data) can occur.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty bucket in preference order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1131
1132
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are served from index-N.html pages.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, reporting progress."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and collect groups into a dict.

        matchTuples is a list of (group_index, key, error_message); a
        missing group raises ExtractorError with that message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve a live index page to its rtmp stream and return an
        info dictionary for it."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # BUG FIX: video_url used to be computed and then discarded (the
        # method implicitly returned None, so live URLs produced nothing).
        # Return an info dict in the same shape as extractPlus7Stream().
        return {
            'id':           info.get('path'),
            'url':          video_url,
            'uploader':     u'arte.tv',
            'upload_date':  None,
            'title':        info.get('path'),
            'ext':          u'flv',
            'format':       u'NA',
            'player_url':   info.get('player'),
        }

    def extractPlus7Stream(self, url):
        """Follow the "+7" catch-up service's chain of XML descriptors and
        return the info dictionary for the HD stream."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # BUG FIX: the live branch used to call extractLiveStream() for its
        # side effects only and then `return` None, which broke the caller.
        # Both branches now produce an info dict and return it as a list.
        if re.search(self._LIVE_URL, video_id) is not None:
            info = self.extractLiveStream(url)
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1252
1253
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that we are falling back, except when running the test suite.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Last-resort extraction: scrape the page for a JW-Player-style
        media URL and derive id/title/uploader heuristically."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUG FIX: this branch previously raised 'Unable to extract
            # title', copy-pasted from the title check above.
            raise ExtractorError(u'Unable to extract uploader')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1385
1386
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearchN:query)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix ('', 'all', or a count) and fetch results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the return was missing here, so 'ytsearchall:' silently
            # discarded the results and yielded nothing
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)
            # The try block is deliberately narrowed to int(prefix): the old
            # code also caught ValueErrors raised from _get_n_results itself
            # and silently retried with a single result.
            if n <= 0:
                raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData jsonc API (50 ids per page) and returns a
        list of url_result entries pointing at the Youtube extractor.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API says exists
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1453
1454
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Parse the gvsearch prefix and delegate to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix: just the first hit; 'all': the capped maximum;
        # otherwise the prefix is the requested result count.
        if prefix == '':
            return self._download_n_results(query, 1)
        if prefix == 'all':
            return self._download_n_results(query, self._max_google_results)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        return self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for page in itertools.count(1):
            result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), page*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(page))

            # Every search-hit anchor becomes a generic 'url' entry
            playlist['entries'].extend(
                {'_type': 'url', 'url': hit.group(1)}
                for hit in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage))

            have_enough = page * 10 > n
            if have_enough or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return playlist
1503
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix ('', 'all', or a count) and trigger downloads.

        Note: this extractor hands pages straight to the downloader and
        returns None, unlike the playlist-returning search extractors.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            # The try block is narrowed to int(prefix): the old code also
            # caught ValueErrors escaping _download_n_results and silently
            # fell back to downloading a single result.
            if n <= 0:
                raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        def flush_downloads(ids):
            # Hand every collected watch page over to the downloader.
            for video_id in ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

            # Extract video identifiers, de-duplicated in encounter order
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        flush_downloads(video_ids)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages: download what we found
                flush_downloads(video_ids)
                return

            pagenum = pagenum + 1
1582
1583
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Page through the GData playlist feed and return one playlist result."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Playlist id sits in whichever alternative of the pattern matched
        playlist_id = mobj.group(1) or mobj.group(2)
        videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            entries = response['feed'].get('entry')
            if not entries:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so the final list can be ordered
            videos.extend((entry['yt$position']['$t'], entry['content']['src'])
                          for entry in entries
                          if 'content' in entry)

            if len(entries) < self._MAX_RESULTS:
                break

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1649
1650
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from a channel page, in order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        """Collect every video id of a channel and return a playlist result."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)

        # The first page is plain HTML
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Subsequent pages come from the JSON channel_ajax endpoint
        has_more = self._MORE_PAGES_INDICATOR in page
        while has_more:
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            page = json.loads(page)

            video_ids.extend(self.extract_videos_from_page(page['content_html']))
            has_more = self._MORE_PAGES_INDICATOR in page['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
1708
1709
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Collect every upload of a user via the paged GData feed."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries,
        # so keep paging until a page comes back short.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the distinct video ids on this page, in order
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in ids_in_page:
                    ids_in_page.append(vid)

            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one: no need to
            # query again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1766
1767
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist of every video posted by a blip.tv user.

        Resolves the username to a numeric users_id via the user page, then
        pages through the mobile full-episode-list endpoint.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # BUG FIX: the match was previously used unchecked, raising a raw
            # AttributeError when the page had no data-users-id attribute
            raise ExtractorError(u'Unable to extract blip.tv user id')
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1826
1827
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the real download URL and title for a depositfiles link.

        Posts the 'Free download' form and scrapes the fileshare URL from
        the response; on failure, surfaces the site's restriction message
        when one is present.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # FIX: raw string - '\s' in a plain literal is an invalid
                # escape sequence (deprecated, later an error)
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): the .decode() calls below assume the matched groups
        # are byte strings (Python 2 urlopen); under Python 3 str.decode
        # would fail - confirm intended interpreter before changing.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1875
1876
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in using --username/--password or .netrc credentials.

        Login is best-effort: missing credentials make this a no-op, and any
        failure only emits a warning so public videos still extract.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # BUG FIX: corrected 'exceded' -> 'exceeded' in the warning text
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract id, title, stream URL (HD preferred over SD), duration
        and thumbnail from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player config JSON sits between these two literal JS fragments
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON carrying the stream descriptors
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1973
1974
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase alphanumeric extension of a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract a blip.tv video.

        Flow: (1) /play/ embed URLs are resolved to a canonical /a/a-<id>
        URL via the redirect and handled recursively; (2) the page is
        re-requested with skin=json — if the server answers with a video/*
        Content-Type it is a direct download and the open urlhandle is
        passed through; (3) otherwise the JSON metadata is parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # Embed player URL: follow the redirect, pull the file id out of
            # the URL fragment, and recurse on the canonical video URL
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Append the JSON-skin parameters with the correct separator
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data to the iTunes user agent
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode here assumes a Python 2 byte
                # string; under Python 3 this would raise — confirm.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader so the
                    # file is not requested twice
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh from the try block above is still open here
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the metadata in a 'Post' envelope
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp looks like '08-21-12 04:51PM' -> normalize to YYYYMMDD
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    # The download must present the same UA the JSON request used
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2068
2069
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        # Validate the URL and pull out the numeric video id.
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        # Fetch the watch page for this video.
        webpage = self._download_webpage(
            'http://www.myvideo.de/watch/%s' % video_id, video_id)

        self.report_extraction(video_id)

        # The thumbnail <link> carries the media server base URL; the FLV
        # itself lives on that server as <base>/<id>.flv.
        media_match = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if media_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = '%s/%s.flv' % (media_match.group(1), video_id)

        # Page title doubles as the video title.
        title_match = re.search('<title>([^<]+)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    title_match.group(1),
            'ext':      u'flv',
        }]
2108
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates (kbps) known to be offered by the mediagen feed.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate; used only for --list-formats output.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame dimensions per bitrate; used only for --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so a plain re.match on it
        # (as a generic suitable() would do) won't work; match with the flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Print one "bitrate : extension [dimensions]" line per known format
        # (supports the --list-formats option).
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per part (act) of the episode or clip."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # ":tds"-style abbreviations mean "newest full episode": rewrite to
        # the show's full-episodes page and rematch so later groups exist.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Single clip URL; the title group name differs per show.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Full-episode URL; an empty 'episode' group means "newest",
            # which is resolved via the redirect handling below.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The full-episodes page redirects to the latest episode; take
            # the post-redirect URL from the handle and rematch it.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # The mtvnservices URI identifying the media is embedded either in a
        # <param name="movie"> value or a "var url = ..." JS assignment.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Fetch the MRSS index feed, which lists the episode's parts.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> element per part (act) of the episode.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            # The guid looks like "...:<show>.com:<id>"; split out both.
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # The mediagen config lists the available renditions (bitrates).
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            # Collect (bitrate, rtmp_url) pairs for every rendition.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into the equivalent progressive-download
            # HTTP URL on the llnwd.net mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2275
2276
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        show_name = url_match.group('showname')
        video_id = url_match.group('episode')

        self.report_extraction(show_name)
        page = self._download_webpage(url, show_name)

        # Pull description, thumbnail and player URL out of the page's
        # <meta> tags (HTML-unescaping each value).
        description = unescapeHTML(
            re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumbnail = unescapeHTML(
            re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(
            re.search('<meta property="og:video" content="([^"]*)"', page).group(1))

        # The player URL embeds a percent-encoded config URL after "config=".
        config_url = compat_urllib_parse.unquote(
            re.search('config=(.*)$', player_url).group(1))

        config_json = self._download_webpage(config_url, show_name,
                                             u'Downloading configuration',
                                             u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # The second playlist entry carries the actual media URL.
        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumbnail,
            'description': description,
            'player_url': player_url,
        }]
2330
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)

        # Stage 1: fetch the metadata XML (title, description, thumbnail
        # and the URL of the f4m manifest).
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Stage 2: fetch the f4m manifest to learn the media node id.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        # Build the first-fragment URL on the same host as the manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
2392
2393
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The Flash player receives the (percent-encoded) FLV location in a
        # "flv_url" parameter embedded in the page.
        url_mobj = re.search(r'flv_url=(.+?)&', webpage)
        if url_mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(url_mobj.group(1))

        # The page title carries the video title before the site suffix.
        title_mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_mobj is None:
            raise ExtractorError(u'Unable to extract video title')

        # Thumbnail: the whole matched URL (group 0) is the image address.
        thumb_mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_mobj is None:
            raise ExtractorError(u'Unable to extract video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': title_mobj.group(1),
            'ext': 'flv',
            'thumbnail': thumb_mobj.group(0),
            'description': None,
        }]
2443
2444
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader name and the song's URL slug come straight
        # from the URL itself.
        uploader = match.group(1)
        slug_title = match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical track URL into its JSON metadata.
        track_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + track_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Ask the CDN for the available streams of this track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')
        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2501
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader name and set slug both come from the URL itself.
        uploader = match.group(1)
        slug_title = match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical set URL into its JSON metadata.
        set_url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + set_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            # Report every error the API returned, then give up on the set.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            video_id = track['id']

            # Ask the CDN for the available streams of each track.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2564
2565
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The page stores a base64-encoded, percent-quoted RTMP path in the
        # "jsclassref" JS variable; decode it to build the stream URL.
        match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = match.group(1)

        # Description is optional; fall back to a placeholder.
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = match.group(1) if match is not None else u'No description available.'

        # Derive id and extension from the stream URL's final path segment.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2612
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps each format name either to a dict of
        bitrate -> url list, or directly to a url list when no bitrate
        information is available (the TypeError branch below).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead mirror; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print format/bitrate/extension lines for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE: the regex groups are already text strings; the previous
        # .decode('utf-8') calls on them raised AttributeError on Python 3
        # (str has no decode method) and have been removed.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2717
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on URL shape: a specific video, a course page
        (playlist of its videos), or the site root (playlist of courses).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            # Extension is whatever follows the last dot in the file URL.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title from the page heading; fall back to the id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect links to the course's video pages (deduplicated,
            # original order preserved) and extract each one recursively.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect links to all course pages and extract each one
            # recursively (which in turn recurses into the videos).
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2818
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Note: _download_webpage returns an already-decoded unicode string,
        # so the match groups must not be .decode()d again (str has no
        # .decode() on Python 3, and unicode.decode() on Python 2 would
        # round-trip through ASCII and break on non-ASCII titles).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        # The mediaGen service returns an XML document listing the available
        # renditions (qualities) of the video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'Unable to extract any rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # type is e.g. "video/mp4"; keep only the subtype as extension.
            _,_,ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # KeyError: missing attribute; AttributeError: missing <src> node.
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
2887
2888
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves each video as several segments; one info dict is
    returned per segment, so callers receive a multi-part download.
    The segment file ids are obfuscated with a seed-based character
    permutation that this class reimplements.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Build a pseudo-unique session id the way the official player
        # does: current time in milliseconds plus two random numbers.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character alphabet with a linear
        # congruential generator driven by `seed`.  The server derives
        # the same permutation, so indices sent by the server can be
        # mapped back to characters locally.  NOTE: the exact statement
        # order matters — `source` shrinks as characters are consumed.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # `fileId` is a '*'-separated list of indices into the shuffled
        # alphabet; translate each index back into its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user-requested format onto Youku's format names:
            # 'best' prefers HD ('hd2') when available, 'worst' picks mp4,
            # and anything else falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
2981
2982
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Download the page once; all fields below are scraped from it.
        webpage = self._download_webpage(url, video_id)

        def _extract(pattern, field):
            # Return the first group of `pattern`, or abort with an error
            # naming the missing field.
            match = re.search(pattern, webpage)
            if match is None:
                raise ExtractorError(u'Unable to extract %s' % field)
            return match.group(1)

        video_url = compat_urllib_parse.unquote(_extract(self.VIDEO_URL_RE, u'video url'))
        video_title = _extract(self.VIDEO_TITLE_RE, u'video title')
        video_thumbnail = _extract(self.VIDEO_THUMB_RE, u'video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3026
3027
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report that the post entry page is being downloaded."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the upload date found for the entry."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader found for the entry."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title found for the entry."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            # NOTE(review): assumes the timestamp text is exactly
            # "YYYY-MM-DD"; anything else raises ValueError here.
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        # (each element is a (resolution, url) tuple, so tuple sort order
        # effectively sorts by resolution first)
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3137
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The download URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the first (HTML-unescaped) group of `rexp` in the
            # page, or `default` when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: the key was misspelled 'uploader_date'; the
            # documented optional field consumed downstream is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3172
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one page of the archive API and build info dicts.

        Returns a tuple (number of clips in the API response, list of
        info dicts).  The count can exceed the list length because clips
        without a video_file_url are skipped; callers use the count for
        pagination.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On error the API returns a JSON object rather than a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # Keep only the date part of start_time, as YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: results are paginated, see the loop below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # A chapter is a section of an archived broadcast; we locate
            # the containing archive and download that whole file.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: `a` deliberately leaks out of the loop and holds
            # the matching <archive> element after the break.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        # Page through the API; a short page (count != limit) means we
        # reached the end.  Non-paged requests run the loop exactly once.
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3305
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The page embeds an HTML5 <video> tag; the second <source> holds
        # the actual file URL.
        video_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if video_match is None:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(video_match.group('url'))

        # Prefer the player headline; fall back to the document <title>.
        title_match = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if title_match is None:
            title_match = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if title_match is None:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3343
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailers (one playlist per game)."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains whitespace and comments, so re.VERBOSE is required.
        match = re.match(cls._VALID_URL, url, re.VERBOSE)
        return match is not None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = mobj.group('gameID')
        # Pass the age gate with a fixed birth date so all trailers are shown.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % game_id
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, game_id)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        videos = []
        # The three patterns match in the same document order, so zipping
        # the iterators pairs each movie with its title and thumbnail.
        for movie_m, title_m, thumb_m in zip(re.finditer(movie_re, webpage),
                                             re.finditer(title_re, webpage),
                                             re.finditer(thumb_re, webpage)):
            video_id = movie_m.group('videoID')
            video_url = movie_m.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail')
                  })
        return [self.playlist_result(videos, game_id, game_title)]
3388
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The CDN URL is derived directly from the numeric video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"',webpage).group('title')
        uploader = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage).group('uploader')
        return [{
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': title,
                'uploader': uploader
                  }]
3410
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The flash player receives the stream via so.addVariable("file", ...).
        source_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if source_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = source_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            thumbnail = None
            candy_match = re.search(r"candytitles.*>(.*)</span>", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
3459
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as JSON assigned to gon.show.
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(metadata_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream from Akamai.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        return [{
                'id': video_id,
                'url': video_url,
                'ext': video_ext,
                'title': data['title'],
                'description': data.get('teaser_text'),
                'location': data.get('country_of_origin'),
                'uploader': data.get('host', {}).get('name'),
                'uploader_id': data.get('host', {}).get('slug'),
                'thumbnail': data.get('image', {}).get('large_url_2x'),
                'duration': data.get('duration'),
        }]
3494
3495
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The cookie bypasses the site's age-verification interstitial
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal: warn and continue without it)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (non-fatal as well)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = clean_html(result.group('uploader').strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 4th path component looks like "<size>_<bitrate>_<id>" (see
            # the example above); keep the first two pieces as the format name.
            format_parts = path.split('/')[4].split('_')[:2]
            size = format_parts[0]
            bitrate = format_parts[1]
            format = "-".join(format_parts)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # Bug fix: this previously tested the stale `result` match object
            # from the page parsing above, so an unavailable format silently
            # returned [None] instead of raising.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3610
3611
3612
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both id and title are encoded in the URL itself
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL (percent-encoded inside the player setup)
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: the message wrongly claimed the *title* could not be
            # extracted; this branch fails on the upload date.
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3651
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Fetch the watch page
        webpage = self._download_webpage(url, video_id)

        # The title lives in the page's <title> tag
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Locate the embed page, which carries the actual stream information
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL via so.addVariable(...)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3696
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk an 8tracks mix, fetching track info one API call at a time."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JSON assignment
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id; the play API keeps per-session position state
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the final track of the set
            if api_data['set']['at_last_track']:
                break
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
        return entries
3740
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Derive CDN video/thumbnail URLs from the id, scrape the rest."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media URLs follow a fixed CDN pattern keyed on the video id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))

        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3764
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk or a whole playlist
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each playlist entry is a <li id="talk_..."> with data-* attributes
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Pair up id/slug matches with title/url matches in document order;
        # NOTE(review): assumes both patterns match in the same order — verify
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Delegate each talk back to this extractor via a url_result
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # id and mediaSlug live inside the inline talkDetails javascript object
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        # The direct download URL is derived from the media slug
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3843
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this branch referenced the undefined name `ext`
            # (NameError at runtime); fall back to the file extension,
            # which is the documented default for 'format'.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata XML
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3897
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Scrape the title, then read stream data from the per-video XML."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The flash player is configured through a per-video XML document
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the document's last media entry (presumably the best variant —
        # TODO confirm against the XML layout)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3930
class LiveLeakIE(InfoExtractor):

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        # The player config carries the direct file URL
        url_match = re.search(r'file: "(.*?)",', webpage)
        if not url_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = url_match.group(1)

        # Title comes from the OpenGraph tags, minus the site prefix
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
3977
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de / mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page title heading
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # Javascript call registering one stream: (media_type, quality, rtmp_url, video_url, ...)
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer an explicit documentId query parameter when present
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # NOTE(review): asserts vanish under `python -O`; consider raising
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            # rtmpdump needs the server URL and the play path separately
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4016
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Normalize to the canonical post URL before downloading
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The embedded player markup is escaped (\x22 = double quote)
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video founded")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title_match = re.search(r'<title>(?P<title>.*?)</title>', webpage, re.DOTALL)
        title = unescapeHTML(title_match.group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4050
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp track downloads."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs founded')

        download_link = m_download.group(1)
        # Idiom fix: renamed the local from `id` (shadowed the builtin)
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
4096
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The HTML5 <source> tag carries the direct media URL
        source_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if source_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = source_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
4127
4128
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        # Specific YouTube extractors must precede the generic YoutubeIE
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        # Set extractor before the plain Soundcloud one, same URL prefix
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        # GenericIE is the catch-all fallback and must stay last
        GenericIE()
    ]
4187
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention
    class_name = '%sIE' % ie_name
    return globals()[class_name]