GoogleSearchIE: change query urls to http://www.google.com/search
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces metadata about the
    video (or videos) it points to: the real media URL, the title, the
    uploader and so on.  The result is a list of dictionaries which is
    handed to the FileDownloader; the FileDownloader may then download
    the media to the file system, among other outcomes.

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses redefine _real_initialize() and _real_extract() and define
    a _VALID_URL regexp; they should also be added to the list of
    extractors.  _real_extract() must return a *list* of information
    dictionaries as described above.  Broken extractors should set
    _WORKING to False to warn users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True if this extractor can handle the given URL."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Return the value of the _WORKING flag."""
        return cls._WORKING

    def initialize(self):
        """Run _real_initialize() at most once (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return info dicts extracted from url."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader used for output and parameters."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # The extractor name is the class name minus the trailing "IE".
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses the status message entirely
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            message = errnote if errnote is not None else u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (message, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        url_handle = self._request_webpage(url_or_request, video_id, note, errnote)
        # Decode using the charset declared in Content-Type, else UTF-8.
        charset_match = re.match(
            r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
            url_handle.headers.get('Content-Type', ''))
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        raw_page = url_handle.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain string, not a Request object
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            self._downloader.to_screen(base64.b64encode(raw_page).decode('ascii'))
        return (raw_page.decode(encoding, 'replace'), url_handle)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        content, _ = self._download_webpage_handle(url_or_request, video_id, note, errnote)
        return content

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Helpers for #608: they set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
190
191
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, reordered so that the WebM codes (43-46) are preferred
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag (format code) -> container/file extension; anything missing defaults to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string shown by --list-formats and in the 'format' field
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Let the playlist extractor claim playlist URLs first.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        # On success returns a dict {lang_code: track_name}; on failure
        # returns a (error_message, None) tuple instead, so callers check
        # isinstance(..., tuple) to detect the error case.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        # Print the available subtitle languages (used by --list-subs).
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit --sub-lang, then English, then
        # whichever language comes first in the listing.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        # Download every available subtitle track; returns a list of
        # (error_message, sub_lang, sub) tuples, one per language.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        # Dump "itag : extension [dimensions]" lines for --list-formats.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Sets the interface language and, when credentials are supplied,
        # logs in to YouTube and confirms the age gate.
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Pull the hidden GALX and dsh values out of the login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the YouTube video ID for a URL matching _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # group(1) is the optional URL prefix; group(2) is the video ID itself
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Undo the JavaScript backslash-escaping in the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one response
        # contains a 'token' entry.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (optional: only a warning if missing)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before parsing the date string.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            # Listing mode: print languages and stop, extracting nothing.
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> stream URL map from the comma-separated entries.
            url_map = {}
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    # The 'sig' value must be re-attached as the 'signature' parameter.
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected (itag, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
677
678
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Retrieve the disclaimer page and confirm age.

        Metacafe hides family-filtered videos until the filter form has
        been POSTed once; doing it here makes all later extractions work.
        Raises ExtractorError on network failure.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the video URL, title and uploader from a metacafe page.

        Returns a one-element list with the info dictionary, or delegates
        to the Youtube extractor for 'yt-' prefixed ids.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob, which carries the media URL
            # and signing key as a query string / JSON mix.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUGFIX: the webpage (and everything derived from it) is already a
        # text string; calling .decode('utf-8') on it raises AttributeError
        # on Python 3 and double-decodes on Python 2. Sibling extractors in
        # this file (Dailymotion, Vimeo, Generic) return these fields
        # undecoded, so do the same here.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
774
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best-quality video URL plus metadata from a watch page."""
        page_match = re.match(self._VALID_URL, url)
        if page_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the first path component, stripped of slug and query parts.
        video_id = page_match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Disable the family filter so age-restricted pages are served.
        page_request = compat_urllib_request.Request(url)
        page_request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(page_request, video_id)

        self.report_extraction(video_id)
        fv_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if fv_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(fv_match.group(1))

        # Pick the highest quality variant present in the flashvars blob.
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((k for k in quality_keys if k in flashvars), None)
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')
        self.to_screen(u'Using %s' % max_quality)

        url_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(url_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_match.group('title'))

        # Uploader: regular owner markup first, official-user markup second.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date is shown as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
849
850
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract video info from a photobucket page.

        First tries the JSON blob embedded in the page javascript; falls
        back to scraping the old-style <link rel="video_src"> markup.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUGFIX: values scraped from the downloaded page are already text;
        # .decode('utf-8') raises AttributeError on Python 3 and
        # double-decodes on Python 2. The JSON branch above already returns
        # these fields undecoded, so the fallback now matches it.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
912
913
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video info, via the REST API or the YQL endpoint.

        Pages that expose YUI's Media CONTENT_ID use the newer YQL query;
        otherwise the legacy cosmos.bcst.yahoo.com REST service is scraped.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # Legacy path: query the cosmos REST service for metadata.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUGFIX: check for a failed match *before* touching the groups;
            # previously m_rest.group(...) ran first and a miss raised
            # AttributeError instead of the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
981
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from the embedded player config JSON.

        Quality is chosen hd > sd > other, and codec h264 > vp8 > vp6
        within a quality tier.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: a bare "except:" also swallowed KeyboardInterrupt and
        # SystemExit, making the downloader impossible to interrupt here.
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1083
1084
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles the "+7" catch-up pages; live pages are recognized but
    currently produce no result (see extractLiveStream).
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in e.g. "index-1234.html"
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw response body.

        Raises ExtractorError on network failure or an invalid URL.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and build a dict from the match.

        matchTuples is a list of (group_index, key, error_message); each
        listed group must have matched, otherwise ExtractorError is
        raised with that tuple's message. A complete regex miss raises
        'Invalid URL'.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-page javascript to locate the RTMP stream.

        NOTE(review): video_url is computed on the last line but never
        returned, so callers get None — live support appears unfinished;
        confirm before relying on this path.
        """
        # Language code is the 4th-from-last URL path component.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve a "+7" catch-up page to its video info dict.

        Follows a chain of three pages: the player's videorefFileUrl, the
        per-language <video ref=...> pointer, then the XML carrying the
        HD url, title and date.
        """
        # Language code is the 3rd-from-last URL path component here.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live or "+7" extractor based on the URL.

        NOTE(review): the live branch returns None (no info list), since
        extractLiveStream yields nothing — see note there.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1204
1205
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Follows URL-shortener style redirects via HEAD requests, then scrapes
    the page for a JW-Player/SWFObject style "file=..." media URL.
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download, warning that this is the fallback IE."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Returns False when the URL resolves to itself. Uses HEAD requests
        throughout to avoid downloading bodies while chasing redirects.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after redirects means there was no redirect to follow.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Follow redirects, then scrape the page for a direct media URL."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1337
1338
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    # Hard cap on results, matching the GData API's own limit.
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): query arrives UTF-8 encoded from _real_extract but is
        # decoded with the locale's preferred encoding here -- confirm.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse an ytsearch query ('ytsearch[N|all]:terms') and fetch results.

        Raises ExtractorError when the query does not match _VALID_URL or
        requests a non-positive number of results.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Fix: split on the first ':' only, so search terms containing a
        # colon no longer raise "too many values to unpack".
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # Fix: the result list was computed but never returned, so
            # 'ytsearchall:' queries silently produced nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # 'limit' shrinks to the API's reported total when fewer results
        # exist than were requested.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based; each API page holds up to 50 items.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested count; trim the excess.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1405
1406
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    # Present in the result HTML while a "next page" link exists.
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Parse a gvsearch query ('gvsearch[N|all]:terms') and fetch results.

        Raises ExtractorError when a non-positive result count is requested.
        """
        # suitable() has already vetted the query against _VALID_URL.
        mobj = re.match(self._VALID_URL, query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        # NOTE(review): itertools.count(1) combined with start=pagenum*10
        # skips the first ten results -- confirm whether paging should
        # start at 0.
        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            # Fix: removed a stray debug print(result_url) left in this loop.
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # Stop once enough results were seen or no next-page link remains.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
1456
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'

    # Hard cap on how many results may be requested.
    _max_yahoo_results = 1000
    IE_NAME = u'screen.yahoo:search'

    def _real_extract(self, query):
        """Parse a yvsearch query ('yvsearch[N|all]:terms') and fetch results.

        Raises ExtractorError when the query does not match _VALID_URL or
        requests a non-positive number of results.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Fix: split on the first ':' only, so search terms containing a
        # colon no longer raise "too many values to unpack".
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_yahoo_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # Fix: 'i' previously leaked out of the for loop below; when a
            # page returned an empty result list it was never bound and the
            # page-limit check raised NameError.
            i = 0
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when enough entries were collected or the metadata says
            # this was the last page.
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1515
1516
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose-mode pattern: matches playlist page URLs (course, view_play_list,
    # my_playlists, artist, playlist or watch with a p/a/list parameter, or a
    # /p/ path) as well as bare playlist ids prefixed with PL, EC or UU.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    # Maximum number of entries the GData API returns per request.
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written in verbose mode.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Fetch every page of the playlist feed and return a playlist result.

        Videos are collected as (position, url) pairs and sorted by their
        playlist position before being emitted.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        # Either capture group may have matched, depending on the URL form.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []  # (playlist position, video URL) pairs

        while True:
            # start-index is 1-based in the GData API.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Entries without 'content' (e.g. unavailable videos) are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1582
1583
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the watch-link video ids found in page, de-duplicated,
        in first-seen order."""
        ids_in_page = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in ids_in_page:
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist result covering every video of a channel."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML.
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Any further pages come from the JSON-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                # The load-more widget disappears once the last page is reached.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
1641
1642
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Collect every upload of a user via the GData feed and return them
        as a playlist result."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData feed caps each response (currently at 50 entries), so
        # request consecutive pages until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Gather this page's ids, dropping duplicates within the page.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)
            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one, so there is
            # no need to issue another query.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1699
1700
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Collect every episode posted by a blip.tv user and return them as
        a playlist result."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # The numeric user id required by the Ajax endpoint is embedded in
        # the user's page.
        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)


        # The Ajax call returns a limited number of episodes per request
        # (currently 12), so walk the pages until one comes back short.
        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Gather this page's ids, dropping duplicates within the page.
            ids_in_page = []
            for match in re.finditer(r'href="/([^"]+)"', page):
                if match.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(match.group(1)))
            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one, so there is
            # no need to issue another query.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1759
1760
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Fetch a depositfiles page with the 'Free download' form submitted
        and extract the direct file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # POST the 'Free download' form to reach the page with the real link.
        form_data = compat_urllib_parse.urlencode({'gateway_result': '1'})
        request = compat_urllib_request.Request(url, form_data)
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None or mobj.group(1) is None:
            # No download form: try to surface the site's own explanation.
            restriction = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if restriction is not None and restriction.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', restriction.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        title_mobj = re.search(r'<b title="(.*?)">', webpage)
        if title_mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = title_mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1808
1809
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Credentials come from the downloader's username/password parameters
        or, with 'usenetrc', from the 'facebook' machine entry in ~/.netrc.
        Login failures only emit a warning; extraction proceeds without a
        session.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; skip the login step entirely.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means we are still logged out.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL (HD when available, else SD), title,
        duration and thumbnail from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flash player's parameters are embedded as JSON between these
        # two literal javascript fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON nested inside the outer JSON object.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source; fall back to SD when HD is absent.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1906
1907
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        Handles three cases: /play/ embed URLs (resolved via their redirect
        and re-extracted once), responses that are the media file itself
        (direct download), and the regular JSON API response.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a player page whose fragment carries the
        # real file URL; rebuild a canonical page URL from it and recurse.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the same URL for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the client.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the open handle to the downloader so the already
                    # started response can be reused.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2001
2002
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        """Extract the flv media URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Fetch the watch page for this video id.
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media base URL is derived from the thumbnail link.
        media_mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if media_mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = media_mobj.group(1) + ('/%s.flv' % video_id)

        title_mobj = re.search('<title>([^<]+)</title>', webpage)
        if title_mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2041
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE: written with re.VERBOSE, hence the overridden suitable() below.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates the site is known to serve.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension, used by _print_formats.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> "WxH" display string, used by _print_formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print one "bitrate : ext [WxH]" line per available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return one info dict per act ("part") of the episode.

        Handles the three URL shapes accepted by _VALID_URL:
        ":shortname" abbreviations, full-episode pages, and clip pages.
        Raises ExtractorError on any unrecoverable failure.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Abbreviations like ":tds" map to the show's full-episodes index,
        # which is then re-matched so the groups below are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode group means "newest episode"; the site
            # redirects to the concrete episode page (handled below).
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Recover the concrete episode URL from the HTTP redirect.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # The player is referenced either as a <param name="movie"> value or
        # a "var url = ..." JS assignment; group 2 is the mgid-style URI.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media reference without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index feed lists every act of the episode as an <item>.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The <guid> looks like "...:<show>.com:...:<shortid>".
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # A per-act config XML lists one <rendition> per bitrate.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The feed only carries RTMP URLs; rewrite them onto the
            # known HTTP mirror so they can be downloaded directly.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2208
2209
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract a single video from an Escapist Magazine video page.

        Reads the description/thumbnail/player <meta> tags, follows the
        player's "config=" parameter to a JSON-ish configuration file and
        takes the stream URL from its playlist.

        Raises ExtractorError when the URL is invalid, when any required
        metadata tag is missing, or when the configuration is malformed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Each of these tags is required; previously a missing tag caused
        # a bare AttributeError on .group() instead of a clean error.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            raise ExtractorError(u'Unable to extract description')
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract configuration URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an advertisement; entry 1 is the actual video --
        # assumption carried over from the original code, confirm if the
        # site layout changes.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2263
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the f4m manifest, and
        assemble the fragment URL from the two documents."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        # Title, description, thumbnail and manifest URL all live in the
        # <video> node of the metadata document.
        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        # The f4m manifest supplies the node id and the (namespaced) id
        # used to build the final fragment URL.
        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        parsed = compat_urllib_parse_urlparse(manifest_url)
        url = parsed.scheme + '://' + parsed.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2325
2326
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Return a single-entry list describing an xvideos.com video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The FLV URL is percent-encoded in the page source.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The page <title> has the form "<name> - XVID...".
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        # The whole matched URL (group 0) is the thumbnail address.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2376
2377
class SoundcloudIE(InfoExtractor):
    """Information extractor for a single soundcloud.com track.

    The track is resolved through the public SoundCloud API, and the
    128 kbps MP3 HTTP stream URL is taken from its stream definitions.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader name and the track slug are part of the URL.
        uploader = match.group(1)
        slug_title = match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical page URL into a track id via the API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # The stream-definition endpoint maps stream names to URLs.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        media_url = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      media_url,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2434
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com playlists ("sets").

    The set is resolved through the public SoundCloud API; one entry is
    produced per track, each carrying its 128 kbps MP3 HTTP stream URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the set id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader name and set slug are both encoded in the URL.
        uploader = match.group(1)
        slug_title = match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical set URL into its track list via the API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports failures as a list of error objects.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2497
2498
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMPE stream for an InfoQ presentation page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The stream id is base64-encoded inside a JS variable.
        match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # The title lives in a JS assignment, not the <title> tag.
        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = match.group(1)

        # Description is optional; fall back to a placeholder.
        video_description = u'No description available.'
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if match is not None:
            video_description = match.group(1)

        # Derive id and extension from the last path component.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2545
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a dict mapping bitrate -> url list, or a
        plain url list when no bitrate information is available (the
        TypeError path below).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none responds."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve the cloudcast through the JSON API and return the first
        reachable URL for the requested (or best available) format.

        Raises ExtractorError on invalid URLs, download failures, or an
        unavailable requested format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url.  re.match on a text URL
        # already yields text strings; the previous .decode('utf-8') calls
        # crashed on Python 3, where str has no decode method.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe the formats in dict order until one URL answers.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            # conditional expression instead of the old "and/or" idiom,
            # which also mis-handled a falsy format_param
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2650
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video, a course page, or
        the site root.  Course and root pages recurse via self.extract()
        on every contained link and concatenate the per-link results.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Each video has a sibling XML file with its title and file name.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title; fall back to the course id if not found.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link on the course page becomes a reference
            # entry that is re-extracted through self.extract().
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Every CoursePage link on the root page becomes a reference
            # entry, which recurses into the course branch above.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2751
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a music video: scrape the page meta tags, then query the
        mediaGen service for the rendition list and pick the best one."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns decoded text; the old code called
        # .decode('iso-8859-1') on the matched groups, which raises
        # AttributeError on Python 3 (str has no decode method).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        # The mediaGen service returns an XML document with the renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality (last listed rendition).
        rendition = renditions[-1]

        try:
            # type is e.g. 'video/mp4'; keep only the subtype as extension.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2820
2821
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves each video as several segments; one info dict per
    segment ('<id>_partNN') is returned.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: millisecond timestamp followed
        by two random numbers, mimicking the site's Flash player."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the fixed character alphabet with a
        linear congruential generator seeded by the server-provided seed.

        Returns the shuffled alphabet as a list of single characters.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # One LCG step per iteration; each step picks (and removes)
            # one character from the remaining alphabet.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated file id: each '*'-separated number is an
        index into the seed-shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user's format request onto Youku stream names:
            # 'best' prefers hd2 when available, 'worst' is mp4, and any
            # other request falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the zero-padded hex segment number into the file id,
            # then build the per-segment flv URL with its key.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
2914
2915
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Scrape the direct flv URL, title and thumbnail from the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        def first_group(pattern, errmsg):
            # Every field below is mandatory; abort with a clear message
            # when the page layout does not match.
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(errmsg)
            return found.group(1)

        video_url = compat_urllib_parse.unquote(
            first_group(self.VIDEO_URL_RE, u'Unable to extract video url'))
        video_title = first_group(self.VIDEO_TITLE_RE, u'Unable to extract video title')
        video_thumbnail = first_group(self.VIDEO_THUMB_RE, u'Unable to extract video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2959
2960
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report that the post entry page is being downloaded."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the upload date found in the entry."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader found in the entry."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title found in the entry."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction from the video page."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract a video from a Google+ post: scrape the post metadata,
        follow the embedded photo/video page, and pick the largest size."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)
        video_extension = 'flv'

        # Step 1: fetch the post page and scrape its metadata.
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Upload date, reformatted to YYYYMMDD for filename use.
        upload_date = None
        date_match = re.search('title="Timestamp">(.*?)</a>', webpage)
        if date_match:
            parsed = datetime.datetime.strptime(date_match.group(1), "%Y-%m-%d")
            upload_date = parsed.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Uploader name.
        uploader = None
        uploader_match = re.search(r'rel\="author".*?>(.*?)</a>', webpage)
        if uploader_match:
            uploader = uploader_match.group(1)
        self.report_uploader(uploader)

        # Title: first line of the post description, u'NA' when absent.
        video_title = u'NA'
        title_match = re.search(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage)
        if title_match:
            video_title = title_match.group(1)
        self.report_title(video_title)

        # Step 2: simulate clicking the image box to reach the video page.
        page_match = re.search('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', webpage)
        if page_match is None:
            raise ExtractorError(u'Unable to extract video page URL')
        video_page = page_match.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Collect the video links of all available sizes.
        sized_links = re.findall('\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"', webpage)
        if not sized_links:
            raise ExtractorError(u'Unable to extract video links')

        # The highest resolution sorts last; keep only the URL component.
        video_url = sorted(sized_links)[-1][-1]
        # Unescape \u0026-style hex sequences.
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3070
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Extract an NBA.com video; the file URL is derived from the page
        path on a predictable CDN layout."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (HTML-unescaped) capture group of rexp in the page,
            # or default when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was misspelled 'uploader_date', which no consumer
            # reads (the documented field is 'upload_date').
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3105
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a bare channel, a /b/ broadcast, or a /c/ chapter.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100  # API page size used for channel archives
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns (total_items_in_page, infos); clips without a
        video_file_url are counted but skipped.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On error the API returns a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; dropping the dashes
                # yields the YYYYMMDD form used for upload_date.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: the archive listing must be paginated.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter: resolve the enclosing archive, then download the
            # whole broadcast file (start/end trimming is not implemented).
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: raise only when no archive entry matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single broadcast: one unpaged API request.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        # Page through the API; a short page signals the last one.
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3238
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the mp4 source, title and description from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        source = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not source:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source.group('url'))

        # Prefer the player headline; fall back to the page <title>.
        title_match = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not title_match:
            title_match = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not title_match:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3276
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so match with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist with every trailer on a game's store page."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = match.group('gameID')
        # Hit the age-check listing directly, claiming a 1970 birth date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # Movie definitions, display names and thumbnails appear in the
        # same order, so the three iterators can be zipped together.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        videos = []
        for movie, name, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3321
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract a recorded ustream.tv video."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Recorded videos are served as flv straight from the ustream CDN.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3343
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract a video from worldstarhiphop.com / worldstarcandy.com."""
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage_src = self._download_webpage(url, video_id)

        # The file URL is set through a flashvars assignment in the page.
        source_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if source_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = source_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
3392
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract a show from rbmaradio.com via its inline JSON metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as JSON in an inline <script>.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        try:
            data = json.loads(m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream; the extension comes from the URL path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3427
3428
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' field equals
        req_format, or None when no such entry exists."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Extract every available format of a video, then apply the user's
        format selection ('best' default, 'worst', '-1'/'all', or exact)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Content is gated behind an age-check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal when missing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (non-fatal when missing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes size and bitrate, e.g.
            # '480p_370k_8004515' -> size '480p', bitrate '370k'.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: this used to test the stale 'result' variable (the
            # download-list match, always non-None here), so an unavailable
            # requested format silently returned [None] instead of raising.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3543
3544
3545
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the flv URL and upload date for a pornotube video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The title comes straight from the URL path
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the upload date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the message previously (and wrongly) claimed the
            # *title* could not be extracted.
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3584
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Resolve the embed page for a youjizz video and extract the flv URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        # CONSISTENCY FIX: dropped the redundant "ERROR: " prefix from the
        # ExtractorError messages below; no other extractor in this file
        # prefixes its messages, and ExtractorError already marks errors.
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the player setup code
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
3629
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play API and return one info dict per track of the mix."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JS assignment
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id is required by the play API
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
        return entries
3673
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the info dict for a keek video; the media URLs derive directly from the id."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)

        return [{
                'id': video_id,
                'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
                'ext': 'mp4',
                'title': unescapeHTML(title_match.group('title')),
                'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
                'uploader': clean_html(uploader_match.group('uploader'))
        }]
3697
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE comments, so matching must
        # pass that flag explicitly here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which _VALID_URL alternative matched:
        # a single talk vs. a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each playlist entry carries a numeric id and a media slug
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # The talk URL/name pairs live in separate markup; matched in parallel below
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Delegate each talk back to this extractor via a url_result entry
        # (assumes both finditer streams stay in the same order — TODO confirm)
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object embeds the numeric id and the media slug
        # needed to build the download URL
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # NOTE(review): thumb_match/info_match are used without a None check;
        # a markup change would raise AttributeError rather than ExtractorError.
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3776
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, backed by its XML metadata endpoint."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch previously read `format = ext`, referencing
            # an undefined name and raising NameError; fall back to the file
            # extension instead.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata document
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3830
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract a Spiegel video: title from the HTML page, stream data from an XML feed."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last entry of the XML document (original behavior)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3863
class LiveLeakIE(InfoExtractor):

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract a LiveLeak video; description and uploader are optional fields."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        url_match = re.search(r'file: "(.*?)",', webpage)
        if not url_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = url_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        # Strip the site branding from the og:title value
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
3910
class ARDIE(InfoExtractor):
    """Information extractor for ARD Mediathek (ardmediathek.de / mediathek.daserste.de)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page headline used as the video title
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # One addMediaStream(...) JS call per available stream variant
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # a numeric documentId query parameter takes precedence over the path id
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # no streams at all is expected to coincide with an age-gate marker
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
3949
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in Tumblr posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Canonicalize to the post URL before downloading
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # BUG FIX: message previously read "No video founded"
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        # ROBUSTNESS FIX: the thumbnail is optional; previously a missing
        # poster crashed with AttributeError on .group of None.
        m_thumb = re.search(re_thumb, webpage)
        thumb = m_thumb.group('thumb').replace('\\', '') if m_thumb is not None else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
3983
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # BUG FIX: message previously read "No free songs founded"
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from `id` to avoid shadowing the builtin
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
4029
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Extract the mp4 URL and title from a redtube video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        url_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = url_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4060         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Extract an Ina.fr video via its MRSS notice document."""
        video_id = re.match(self._VALID_URL, url).group('id')
        # The media metadata is served as an MRSS feed keyed by the video id
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        url_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      url_match.group('mp4url'),
            'ext':      'mp4',
            'title':    title_match.group('titre'),
        }]
4089
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        # GenericIE stays last: since order decides which extractor handles a
        # URL, every more specific extractor must get a chance first.
        GenericIE()
    ]
4149
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention at module level
    return globals()['%sIE' % ie_name]