TumblrIE
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run for this instance
    _ready = False
    # Downloader object used for screen output and option lookup
    _downloader = None
    # Set to False in subclasses that are known to be broken
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() is run at most once per instance.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Default extractor name: the class name minus its last two characters."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note is printed, prefixed with video_id, before the request is made;
        None selects a default message and False suppresses the output.
        errnote is the prefix of the ExtractorError raised when the request
        fails; None selects a default prefix.
        """
        if note is None:
            note = u'Downloading video webpage'
        if note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    @staticmethod
    def _charset_from_content_type(content_type):
        """Return the charset named in a Content-Type value, or 'utf-8'.

        Only the charset itself is returned: parameters following it and
        optional quoting are dropped so the value can be used as a codec name.
        """
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if not m:
            return 'utf-8'
        charset = m.group(1).split(';')[0].strip().strip('"\'')
        return charset or 'utf-8'

    @staticmethod
    def _decode_webpage(webpage_bytes, encoding):
        """Decode webpage_bytes with encoding, falling back to UTF-8.

        Servers occasionally announce charsets Python has no codec for;
        decoding then falls back to UTF-8 instead of raising LookupError.
        """
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            return webpage_bytes.decode('utf-8', 'replace')

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the data of the page as a string.

        The charset is taken from the Content-Type response header and
        defaults to UTF-8. When the downloader option
        'dump_intermediate_pages' is set, the raw page is also printed
        base64-encoded. note and errnote are passed to _request_webpage().
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        encoding = self._charset_from_content_type(content_type)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain URL string was passed rather than a Request object
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return self._decode_webpage(webpage_bytes, encoding)

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video: marks video_info (in place) with '_type' 'video'."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist; 'id' and 'title' are only set when given."""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
181
182
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Formats missing from this table are treated as 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so let the playlist IE claim them
        if YoutubePlaylistIE.suitable(url):
            return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download the subtitles in one language and format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles (sub_lang_list maps language code to name)."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the given format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Fetch the list of subtitle tracks of a video.

        Returns a dict mapping language code to track name on success, or
        a tuple (error_message, None) when the list cannot be downloaded or
        is empty. Callers tell the two apart with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for a video."""
        sub_lang_list = self._get_available_subtitles(video_id)
        if isinstance(sub_lang_list, tuple):
            # An error tuple, not a dict: there is nothing to list
            self._downloader.report_warning(sub_lang_list[0])
            return
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]

        The language is the 'subtitleslang' option when set, otherwise
        English when available, otherwise the first language listed.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list of (error_message, sub_lang, sub) tuples, one per language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print each format code with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, when credentials are given, log in and confirm age.

        Credentials come from the 'username'/'password' options or, with
        'usenetrc', from the .netrc entry for _NETRC_MACHINE. Each failing
        step is reported through the downloader and ends the initialization.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden form fields that have to be echoed back with the login request
        galx = None
        dsh = None
        match = re.search(r'<input.+?name="GALX".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            galx = match.group(1)

        match = re.search(r'<input.+?name="dsh".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode. Fields without a value (hidden input not found on
        # the login page, or no password given) are left out, since None
        # cannot be encoded.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8'))
                          for k, v in login_form_strs.items() if v is not None)
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Being served the login form again means the credentials were rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        # POST bodies have to be bytes on Python 3, same as for the login form
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID found in url, or None (after reporting) if url is invalid."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract the information of the video at url.

        Returns a list with one information dictionary per selected
        format. Returns None when an error was reported through the
        downloader, and after only listing formats or subtitles.
        Raises ExtractorError when no usable format is found.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)
        if video_id is None:
            # The invalid URL has already been reported by _extract_id()
            return

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Remove the backslash escaping of the URL
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info, trying several 'el' values until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            # don't panic if we can't find it
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators and whitespace before trying the known layouts
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            for expression in ('%d %B %Y', '%B %d %Y', '%b %d %Y'):
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # Not this layout, try the next one
                    continue
                break
            # When no layout matches, the normalized text is kept unchanged

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            # Listing only: nothing is extracted for download
            self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Map format code -> download URL
            url_map = {}
            for ud in url_data:
                format_url = ud['url'][0]
                # Only append a signature when the stream entry carries one
                if 'sig' in ud:
                    format_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = format_url

            format_limit = self._downloader.params.get('format_limit', None)
            if self._downloader.params.get('prefer_free_formats', False):
                available_formats = self._available_formats_prefer_free
            else:
                available_formats = self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Formats are ordered best first: drop everything better than the limit
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
673
674
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def _decode_if_bytes(value):
        """Return value as text, decoding it as UTF-8 only if it is a byte string."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_initialize(self):
        """Fetch the family filter disclaimer, then post the age confirmation form.

        A network failure in either step is reported through the downloader
        and ends the initialization early.
        """
        network_errors = (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error)

        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            compat_urllib_request.urlopen(request).read()
        except network_errors as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        # urlopen() on Python 3 only accepts bytes as POST data; the output of
        # urlencode() is percent-encoded and therefore plain ASCII.
        post_data = compat_urllib_parse.urlencode(disclaimer_form).encode('ascii')
        request = compat_urllib_request.Request(self._FILTER_POST, post_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except network_errors as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video information for a metacafe.com watch URL.

        Returns a one-element list: either a url_result for ids of the form
        ``yt-<id>`` or an info dictionary.  When a required piece of
        information cannot be found the error is reported through the
        downloader and None is returned.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # The extension is taken to be the last three characters of the URL
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the JSON-like "mediaData" entry of the flashvars
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        # Only byte strings are decoded: calling .decode() on a value that is
        # already text raises AttributeError on Python 3.
        to_text = self._decode_if_bytes
        return [{
            'id':       to_text(video_id),
            'url':      to_text(video_url),
            'uploader': to_text(video_uploader),
            'upload_date':  None,
            'title':    to_text(video_title),
            'ext':      to_text(video_extension),
        }]
785
786
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        """Extract the video information for a Dailymotion video URL.

        Returns a one-element list holding the info dictionary, or None
        after reporting an error when the URL, the flashvars, the video URL
        or the title cannot be extracted.  A missing uploader only produces
        a warning.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is whatever precedes the first '_' and the first '?'
        slug = url_match.group(1)
        video_id = slug.split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Download the page with the family filter switched off via cookie
        page_request = compat_urllib_request.Request(url)
        page_request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(page_request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Take the first of these keys that occurs in the flashvars text
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self.to_screen(u'Using %s' % max_quality)

        url_match = re.search(r'"%s":"(.+?)"' % max_quality, flashvars)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        unquoted_url = compat_urllib_parse.unquote(url_match.group(1))
        video_url = unquoted_url.replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            # Looking for an official user
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            # Reverse the three captured fields so the four-digit one comes first
            first, second, third = date_match.groups()
            video_upload_date = third + second + first

        info = {
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }
        return [info]
870
871
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url):
        """Extract the video information for a photobucket.com URL.

        Returns a one-element list holding the info dictionary, or None
        after reporting an error when the page cannot be downloaded or a
        required field cannot be found.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        # The id comes from the caller-supplied URL; decode it only when it
        # is a byte string (calling .decode() on text fails on Python 3).
        if isinstance(video_id, bytes):
            video_id = video_id.decode('utf-8')

        video_extension = u'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        # Decode once up front: the text patterns below cannot be applied to
        # bytes on Python 3, and every extracted value is then already text.
        webpage = webpage_bytes.decode('utf-8', 'replace')

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
931
932
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _fetch_page(self, request, video_id=None):
        """Open request and return the response body decoded as UTF-8 text.

        The download is announced only when video_id is given.  On a network
        error the problem is reported through the downloader and None is
        returned.
        """
        try:
            if video_id is not None:
                self.report_download_webpage(video_id)
            page_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return None
        # Decode once: the text patterns used by the caller cannot be applied
        # to bytes on Python 3.
        return page_bytes.decode('utf-8', 'replace')

    def _real_extract(self, url, new_video=True):
        """Extract the video information for a Yahoo! Video URL.

        URLs that match _VALID_URL but not _VPAGE_URL are first rewritten to
        a '/watch/' URL and extracted recursively.  new_video is not read by
        this method; the recursive call passes False.

        Returns a one-element list holding the info dictionary, or None
        after reporting an error when a download fails or a required field
        cannot be found.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = u'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            webpage = self._fetch_page(compat_urllib_request.Request(url))
            if webpage is None:
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        webpage = self._fetch_page(compat_urllib_request.Request(url), video_id)
        if webpage is None:
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # Group 1 is the 'people'/'profile' path segment; the link text that
        # holds the uploader name is group 2.
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = u'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        playlist = self._fetch_page(request, video_id)
        if playlist is None:
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', playlist)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        # The id comes from the caller-supplied URL; decode it only when it
        # is a byte string (calling .decode() on text fails on Python 3).
        if isinstance(video_id, bytes):
            video_id = video_id.decode('utf-8')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1070
1071
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video information for a Vimeo URL.

        new_video is not read by this method.

        Returns a one-element list holding the info dictionary, or None
        after reporting an error when the URL is invalid, the page cannot be
        downloaded, the embedded config cannot be parsed or no known codec
        is listed.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Direct play_redirect_hls links are replaced by the regular video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is missing from the page.
            # ValueError: the extracted text is not valid JSON.
            # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
            self._downloader.report_error(u'unable to extract info section')
            return

        video_config = config["video"]

        # Extract title
        video_title = video_config["title"]

        # Extract uploader and uploader_id
        video_uploader = video_config["owner"]["name"]
        video_uploader_id = video_config["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = video_config["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        available = video_config["files"]
        for codec_name, codec_extension in codecs:
            if codec_name not in available:
                continue
            if 'hd' in available[codec_name]:
                files['hd'].append((codec_name, codec_extension, 'hd'))
            elif 'sd' in available[codec_name]:
                files['sd'].append((codec_name, codec_extension, 'sd'))
            else:
                files['other'].append((codec_name, codec_extension, available[codec_name][0]))

        # Pick the first candidate of the best non-empty quality class
        for quality in ('hd', 'sd', 'other'):
            if files[quality]:
                video_codec, video_extension, video_quality = files[quality][0]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1186
1187
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the raw response body.

        Returns None after reporting an error when the download fails or
        the URL is rejected with a ValueError.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download url, search it with regex and collect the requested groups.

        matchTuples is a sequence of (group_index, key, error_message)
        tuples.  Returns a dictionary mapping every key to the text of its
        group, or None when the download failed, the regex did not match or
        one of the requested groups is empty (the problem has been reported
        in each case).
        """
        page = self.fetch_webpage(url)
        if page is None:
            # fetch_webpage() already reported the failure
            return
        if isinstance(page, bytes):
            # Text patterns cannot be applied to bytes on Python 3
            page = page.decode('utf-8', 'replace')

        mobj = re.search(regex, page, regexFlags)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        info = {}
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the stream URL of a live page.

        NOTE(review): the computed URL is neither returned nor stored, so
        this method always returns None.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r".*?)'.*?" +
                r"(http://.*?\.swf).*?" +
                r"(rtmp://.*?)'",
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of reference documents of a regular video page.

        Returns the info dictionary of the video, or None when one of the
        lookups failed (the problem has already been reported).
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # grep_webpage() hands back text, so no further decoding is needed
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Extract the video information for an arte.tv URL.

        Live pages (last URL component matching _LIVE_URL) yield None.
        Other pages yield a one-element list holding the info dictionary,
        or None when the extraction failed.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return

        info = self.extractPlus7Stream(url)
        if info is None:
            # The failure has already been reported; do not return [None]
            return

        return [info]
1318
1319
1320 class GenericIE(InfoExtractor):
1321     """Generic last-resort information extractor."""
1322
1323     _VALID_URL = r'.*'
1324     IE_NAME = u'generic'
1325
1326     def __init__(self, downloader=None):
1327         InfoExtractor.__init__(self, downloader)
1328
1329     def report_download_webpage(self, video_id):
1330         """Report webpage download."""
1331         if not self._downloader.params.get('test', False):
1332             self._downloader.report_warning(u'Falling back on generic information extractor.')
1333         self.to_screen(u'%s: Downloading webpage' % video_id)
1334
1335     def report_following_redirect(self, new_url):
1336         """Report information extraction."""
1337         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1338
1339     def _test_redirect(self, url):
1340         """Check if it is a redirect, like url shorteners, in case return the new url."""
1341         class HeadRequest(compat_urllib_request.Request):
1342             def get_method(self):
1343                 return "HEAD"
1344
1345         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1346             """
1347             Subclass the HTTPRedirectHandler to make it use our
1348             HeadRequest also on the redirected URL
1349             """
1350             def redirect_request(self, req, fp, code, msg, headers, newurl):
1351                 if code in (301, 302, 303, 307):
1352                     newurl = newurl.replace(' ', '%20')
1353                     newheaders = dict((k,v) for k,v in req.headers.items()
1354                                       if k.lower() not in ("content-length", "content-type"))
1355                     return HeadRequest(newurl,
1356                                        headers=newheaders,
1357                                        origin_req_host=req.get_origin_req_host(),
1358                                        unverifiable=True)
1359                 else:
1360                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1361
1362         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1363             """
1364             Fallback to GET if HEAD is not allowed (405 HTTP error)
1365             """
1366             def http_error_405(self, req, fp, code, msg, headers):
1367                 fp.read()
1368                 fp.close()
1369
1370                 newheaders = dict((k,v) for k,v in req.headers.items()
1371                                   if k.lower() not in ("content-length", "content-type"))
1372                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1373                                                  headers=newheaders,
1374                                                  origin_req_host=req.get_origin_req_host(),
1375                                                  unverifiable=True))
1376
1377         # Build our opener
1378         opener = compat_urllib_request.OpenerDirector()
1379         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1380                         HTTPMethodFallback, HEADRedirectHandler,
1381                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1382             opener.add_handler(handler())
1383
1384         response = opener.open(HeadRequest(url))
1385         new_url = response.geturl()
1386
1387         if url == new_url:
1388             return False
1389
1390         self.report_following_redirect(new_url)
1391         return new_url
1392
1393     def _real_extract(self, url):
1394         new_url = self._test_redirect(url)
1395         if new_url: return [self.url_result(new_url)]
1396
1397         video_id = url.split('/')[-1]
1398         try:
1399             webpage = self._download_webpage(url, video_id)
1400         except ValueError as err:
1401             # since this is the last-resort InfoExtractor, if
1402             # this error is thrown, it'll be thrown here
1403             self._downloader.report_error(u'Invalid URL: %s' % url)
1404             return
1405
1406         self.report_extraction(video_id)
1407         # Start with something easy: JW Player in SWFObject
1408         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1409         if mobj is None:
1410             # Broaden the search a little bit
1411             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1412         if mobj is None:
1413             # Broaden the search a little bit: JWPlayer JS loader
1414             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1415         if mobj is None:
1416             self._downloader.report_error(u'Invalid URL: %s' % url)
1417             return
1418
1419         # It's possible that one of the regexes
1420         # matched, but returned an empty group:
1421         if mobj.group(1) is None:
1422             self._downloader.report_error(u'Invalid URL: %s' % url)
1423             return
1424
1425         video_url = compat_urllib_parse.unquote(mobj.group(1))
1426         video_id = os.path.basename(video_url)
1427
1428         # here's a fun little line of code for you:
1429         video_extension = os.path.splitext(video_id)[1][1:]
1430         video_id = os.path.splitext(video_id)[0]
1431
1432         # it's tempting to parse this further, but you would
1433         # have to take into account all the variations like
1434         #   Video Title - Site Name
1435         #   Site Name | Video Title
1436         #   Video Title - Tagline | Site Name
1437         # and so on and so forth; it's just not practical
1438         mobj = re.search(r'<title>(.*)</title>', webpage)
1439         if mobj is None:
1440             self._downloader.report_error(u'unable to extract title')
1441             return
1442         video_title = mobj.group(1)
1443
1444         # video uploader is domain name
1445         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1446         if mobj is None:
1447             self._downloader.report_error(u'unable to extract title')
1448             return
1449         video_uploader = mobj.group(1)
1450
1451         return [{
1452             'id':       video_id,
1453             'url':      video_url,
1454             'uploader': video_uploader,
1455             'upload_date':  None,
1456             'title':    video_title,
1457             'ext':      video_extension,
1458         }]
1459
1460
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles pseudo URLs of the form "ytsearch:terms" (first result),
    "ytsearchN:terms" (first N results) and "ytsearchall:terms" (up to
    _max_youtube_results results).
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number.

        query is the UTF-8 encoded byte string built by _real_extract().
        """
        # Decode with the codec used for encoding; the locale's preferred
        # encoding may be unable to decode UTF-8 bytes
        query = query.decode('utf-8')
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search pseudo URL and return the matching url_results.

        Returns None after reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms may contain ':'
        prefix, search_terms = query.split(':', 1)
        prefix = prefix[8:]
        query = search_terms.encode('utf-8')

        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._max_youtube_results)

        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, search_terms))
            return
        if n > self._max_youtube_results:
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        query is a UTF-8 encoded byte string. Returns a list of at most n
        url_results, or None after reporting an error.
        """

        video_ids = []
        pagenum = 0
        limit = n

        # The API is queried in pages of 50 results
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return

            try:
                api_response = json.loads(data)['data']
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'items' not in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have delivered more ids than requested
        video_ids = video_ids[:n]
        return [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                for video_id in video_ids]
1534
1535
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles pseudo URLs of the form "gvsearch:terms" (first result),
    "gvsearchN:terms" (first N results) and "gvsearchall:terms" (up to
    _max_google_results results). Results are handed straight to the
    downloader instead of being returned.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number.

        query is the UTF-8 encoded byte string built by _real_extract().
        """
        # Decode with the codec used for encoding; the locale's preferred
        # encoding may be unable to decode UTF-8 bytes
        query = query.decode('utf-8')
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search pseudo URL and download the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms may contain ':'
        prefix, search_terms = query.split(':', 1)
        prefix = prefix[8:]
        query = search_terms.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, search_terms))
            return
        if n > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)

    def _download_video_ids(self, video_ids):
        """Pass the watch URL of every collected id to the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query.

        query is a UTF-8 encoded byte string. Result pages are fetched
        until n distinct ids were found or no further page is advertised.
        """

        video_ids = []
        pagenum = 0

        while True:
            # pagenum is zero-based (it drives the start offset); report
            # it one-based
            self.report_download_page(query, pagenum + 1)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode so the str patterns below can be applied to the
                # page on Python 3 as well
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._download_video_ids(video_ids)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # Last page: download whatever was found
                self._download_video_ids(video_ids)
                return

            pagenum = pagenum + 1
1616
1617
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles pseudo URLs of the form "yvsearch:terms" (first result),
    "yvsearchN:terms" (first N results) and "yvsearchall:terms" (up to
    _max_yahoo_results results). Results are handed straight to the
    downloader instead of being returned.
    """

    # This extractor is flagged as not working
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number.

        query is the UTF-8 encoded byte string built by _real_extract().
        """
        # Decode with the codec used for encoding; the locale's preferred
        # encoding may be unable to decode UTF-8 bytes
        query = query.decode('utf-8')
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search pseudo URL and download the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms may contain ':'
        prefix, search_terms = query.split(':', 1)
        prefix = prefix[8:]
        query = search_terms.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, search_terms))
            return
        if n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)

    def _download_video_ids(self, video_ids):
        """Pass the watch URL of every collected id to the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query.

        query is a UTF-8 encoded byte string. Result pages are fetched
        until n distinct ids were found or no further page is advertised.
        """

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode so the str patterns below can be applied to the
                # page on Python 3 as well
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._download_video_ids(video_ids)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # Last page: download whatever was found
                self._download_video_ids(video_ids)
                return

            pagenum = pagenum + 1
1702
1703
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex: group 1 is the playlist id taken from a youtube.com
    # URL, group 2 a bare playlist id
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect the videos of a playlist through the gdata API.

        Returns a one-element list holding the playlist_result, or None
        after reporting an error through the downloader.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # Stays None when the very first page carries no entries; without
        # this the name would be unbound at the final playlist_result call
        playlist_title = None
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            try:
                page = compat_urllib_request.urlopen(api_url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            feed = response['feed']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = feed['title']['$t']

            # Entries without 'content' are skipped
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in feed['entry']
                        if 'content' in entry ]

            if len(feed['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Order by playlist position, then keep only the video URLs
        video_urls = [video[1] for video in sorted(videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in video_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1786
1787
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Announce that the given channel page is about to be fetched."""
        message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(message)

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, each one listed once,
        in order of first appearance."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate in found:
                continue
            found.append(candidate)
        return found

    def _real_extract(self, url):
        """Gather every video of a channel into one playlist_result.

        The first page is regular HTML; any further pages come from the
        JSON based channel_ajax endpoint. Returns a one-element list, or
        None after a reported error.
        """
        channel_match = re.match(self._VALID_URL, url)
        if channel_match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = channel_match.group(1)
        page_number = 1

        # First page: plain HTML listing
        self.report_download_page(channel_id, page_number)
        first_request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, page_number))
        try:
            html = compat_urllib_request.urlopen(first_request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids = list(self.extract_videos_from_page(html))

        # Remaining pages: keep asking channel_ajax while a "load more"
        # widget is advertised
        more_pages = self._MORE_PAGES_INDICATOR in html
        while more_pages:
            page_number += 1

            self.report_download_page(channel_id, page_number)
            ajax_request = compat_urllib_request.Request(self._MORE_PAGES_URL % (page_number, channel_id))
            try:
                raw = compat_urllib_request.urlopen(ajax_request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            payload = json.loads(raw)
            video_ids.extend(self.extract_videos_from_page(payload['content_html']))
            more_pages = self._MORE_PAGES_INDICATOR in payload['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = []
        for video_id in video_ids:
            watch_url = 'http://www.youtube.com/watch?v=%s' % video_id
            entries.append(self.url_result(watch_url, 'Youtube'))
        return [self.playlist_result(entries, channel_id)]
1860
1861
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        # A page holds at most _GDATA_PAGE_SIZE ids starting at
        # start_index, so the last index asked for is one less than
        # start_index + _GDATA_PAGE_SIZE
        end_index = start_index + self._GDATA_PAGE_SIZE - 1
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, end_index))

    def _real_extract(self, url):
        """Collect all uploads of a user into one playlist_result.

        Returns a one-element list, or None after reporting an error
        through the downloader.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, keeping the first occurrence of
            # each one
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(video_url, 'Youtube') for video_url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1932
1933
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Collect all videos of a blip.tv user into one playlist_result.

        Returns a one-element list, or None after reporting an error
        through the downloader.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # The episode list is keyed by the numeric users id embedded in the
        # user page; report a missing id instead of failing on None
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract blip.tv users id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            page_url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Compare the unescaped form, which is what the list stores
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(video_url, 'BlipTV') for video_url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2012
2013
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % file_id)

    def _real_extract(self, url):
        """Resolve the real download URL of a depositfiles.com file.

        Returns a one-element list holding the info dictionary, or None
        after reporting an error through the downloader.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POST data has to be bytes on Python 3
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('ascii')
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            # Decode once so the str patterns below work on Python 3 and
            # every extracted field is already text
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        # The id comes from the caller's URL: only decode it when it is a
        # byte string (possible on Python 2)
        if isinstance(file_id, bytes):
            file_id = file_id.decode('utf-8')

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2068
2069
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in during initialization (credentials come from the
    downloader parameters or from a .netrc entry for the 'facebook'
    machine) and extracts the video URL from the player parameters
    embedded in the video page.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Does nothing without a downloader or without credentials. Login
        problems (.netrc parsing, network errors, rejected credentials)
        are reported as warnings instead of being raised.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # urlencode() yields percent-encoded ASCII text; the request body has
        # to be bytes on Python 3, and encoding it is a no-op on Python 2.
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            # Decode the response so the text pattern below can be applied to
            # it on Python 3 as well (bytes cannot be searched with a str regex).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # Still being served the login form means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video described by url.

        Returns a one-element list holding the info dictionary, or None
        after reporting an invalid URL. Raises ExtractorError when the
        embedded player data, the video URL or the title cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are a JSON list of pairs located between
        # these two JavaScript snippets.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is itself URL-quoted JSON
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source and fall back to SD; .get() keeps a missing
        # key from raising KeyError before the explicit check below.
        video_url = video_data.get('hd_src') or video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2167
2168
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that the URL points directly at a video file."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract the video described by url.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported through the downloader. Raises
        ExtractorError when the info page cannot be downloaded.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # Follow the redirect, read the file id from the 'file' entry of
            # the final URL's fragment, and restart on the derived URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Request the JSON description of the page
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # Only byte strings need decoding; a text URL (always the
                # case on Python 3) has no usable .decode().
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand over the already opened response
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The description may be wrapped in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2265
2266
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self,url):
        """Extract the video described by url.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # The attribute is '_downloader'; the former '_download' raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media URL is derived from the thumbnail link: its directory
        # part plus '/<video_id>.flv'.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2311
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert,
    # urls for full episodes (below /full-episodes/),
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # The pattern is written for re.VERBOSE and must be matched with that flag.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        """Report download of the configuration of one media item."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show index."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the given format ids with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Extract every part of the episode or clip described by url.

        Returns a list with one info dictionary per item of the index
        feed, or None after an error has been reported or after the
        available formats have been listed. Raises ExtractorError when
        an RTMP URL cannot be mapped to an HTTP URL.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand the ':shortname' abbreviations to the full-episodes page
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No episode in the URL: use whatever the page redirects to
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # upload_date is documented as YYYYMMDD, so normalise the feed's
            # pubDate (presumably RFC 822 style, as usual for RSS - TODO
            # confirm); keep the raw value if it cannot be parsed.
            parsedDate = email.utils.parsedate(officialDate) if officialDate else None
            if parsedDate is not None:
                uploadDate = u'%04d%02d%02d' % parsedDate[:3]
            else:
                uploadDate = officialDate

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (bitrate, rtmp url) pairs in document order
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the last rendition (assumed to be the
            # highest bitrate)
            video_format, rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    video_format, rtmp_video_url = f, v
                    break

            # Map the RTMP URL onto the HTTP base URL below
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': uploadDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2503
2504
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video described by url.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported through the downloader. A
        missing description or image is not an error; those fields are
        then None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the declared charset, defaulting to UTF-8; .get() keeps a
            # response without Content-Type from raising.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers.get('Content-Type', ''))
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Description and thumbnail are optional metadata
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # The player URL is required: its 'config=' part holds the location
        # of the configuration listing the actual video URL.
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers.get('Content-Type', ''))
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        try:
            playlist = config['playlist']
            # The video URL is taken from the second playlist entry
            videoUrl = playlist[1]['url']
        except (KeyError, IndexError, TypeError):
            self._downloader.report_error(u'unable to extract video URL from configuration')
            return

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2575
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report download of the XML manifest."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract the video described by url.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        # Step 1: metadata XML (description, title, thumbnail, manifest URL)
        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Step 2: f4m manifest naming the media node and the stream id
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # KeyError: the media node exists but has no 'url' attribute
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the URL of the first fragment on the manifest's host
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2642
2643
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the video described by url.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported through the downloader. The
        media URL, the title and the thumbnail are all required.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Media URL: URL-quoted value of the 'flv_url' parameter
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title: page title up to the ' - XVID...' suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: the whole matched URL is used, not just the file name
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2697
2698
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       The track is first resolved to its metadata (including the numeric
       id) through the resolve endpoint of the API; the stream definitions
       for that id then provide the URL of the 128 kbit/s MP3 stream.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Extract the track described by url.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported through the downloader.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The uploader and the slug of the song title are both in the url
        uploader, slug_title = url_match.group(1), url_match.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical track URL to its metadata
        url = 'http://soundcloud.com/' + full_title
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_response = compat_urllib_request.urlopen(compat_urllib_request.Request(resolv_url))
            info_json = resolve_response.read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions for the resolved id
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_response = compat_urllib_request.urlopen(compat_urllib_request.Request(streams_url))
            stream_json = streams_response.read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        mediaURL = json.loads(stream_json)['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2767
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    The set URL is resolved to its JSON description through the
    api.soundcloud.com resolve endpoint.  For every track listed in that
    description the stream definitions are requested from api.sndcdn.com and
    their 'http_mp3_128_url' entry is used as the media URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'
    # API key sent with every request to the SoundCloud endpoints
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the set URL is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _fetch_api_json(self, api_url, what):
        """Download api_url and return the decoded JSON document.

        On a network error the problem is reported through the downloader
        (naming `what` as the thing that could not be downloaded) and None
        is returned.
        """
        request = compat_urllib_request.Request(api_url)
        try:
            json_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download %s: %s' % (what, compat_str(err)))
            return None
        return json.loads(json_bytes.decode('utf-8'))

    def _real_extract(self, url):
        """Return one info dictionary per track of the set at `url`.

        Returns None (after reporting the error) if the URL is invalid, a
        download fails or the API answers with an error list.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader and the set slug are part of the URL
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        set_name = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(set_name)

        # Normalize the URL before handing it to the resolve endpoint
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + self._CLIENT_ID
        info = self._fetch_api_json(resolv_url, u'video webpage')
        if info is None:
            return

        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        videos = []
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction(set_name)

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=' + self._CLIENT_ID
            streams = self._fetch_api_json(streams_url, u'stream definitions')
            if streams is None:
                return

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                # NOTE(review): passed through as delivered by the API;
                # confirm whether it needs converting to YYYYMMDD.
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2844
2845
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        Returns None (after reporting the error) if the URL is invalid or
        the video URL, title or file extension cannot be determined.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page carries the stream path base64-encoded
        # (and URL-quoted) in the jsclassref attribute
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split on the LAST dot only, so that file names which contain
        # additional dots no longer raise ValueError
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            self._downloader.report_error(u'unable to extract video extension')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2895
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a mapping from bitrate to a list of URLs or,
        when no bitrate information is available, the list of URLs itself.
        If `bitrate` is None, 'best' or unknown, the highest one is used.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list (None if no URL can be opened)"""
        for url in url_list:
            try:
                # Only reachability matters: close the probe connection
                # right away instead of leaking it
                compat_urllib_request.urlopen(url).close()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            return url

        return None

    def _print_formats(self, formats):
        """Print every available format, with its bitrates when known."""
        print('Available formats:')
        for fmt in formats:
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        Returns None if the URL is invalid, the JSON description cannot be
        downloaded, the requested format is unavailable, no working media
        URL is found, or only the format list was requested.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1)
        cloudcast = mobj.group(2)
        file_id = uploader + "-" + cloudcast

        # construct API request from the matched groups, so that it does not
        # depend on the URL ending with a slash
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + uploader + '/' + cloudcast + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode first: the response is read as bytes)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # take the first format that offers a reachable URL
            for format_param in formats:
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            self._downloader.report_error(u'unable to find a working media URL')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3006
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % objid)

    def _real_extract(self, url):
        """Extract a single video, a whole course or the whole site.

        Which of the three is meant depends on the 'course' and 'video'
        groups matched in the URL.  Course and root pages are handled by
        calling self.extract() on every linked video/course page and
        concatenating the results.

        Raises ExtractorError if the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                # the video file name is relative to the directory of the XML
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE: title and description are collected here, but only the
            # results for the linked videos are returned.
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            # Use the same helper as the course branch: the page has to be
            # text (not bytes) for the str regular expression below
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = self._download_webpage(rootURL, info['id'],
                                        errnote='Unable to download course info page')

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3114
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        Song name, performer, mtvn_uri and content id are read from the
        video page; they are used to request the mediaGen XML, from which
        the last listed rendition is taken.  Returns None (after reporting
        the error) whenever one of these steps fails.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The page is searched as text, so the matched groups must not be
        # decoded a second time
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            self._downloader.report_error(u'unable to extract video renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        src = rendition.find('./src')
        if src is None:
            self._downloader.report_error(u'Invalid rendition field.')
            return
        video_url = src.text

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        except KeyError:
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
3190
3191
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com

    A video is delivered in several segments; one info dictionary is
    returned per segment.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % file_id)

    def _gen_sid(self):
        """Build a session id from the current time and two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the character table shuffled deterministically by `seed`.

        The result is a list of single characters.
        """
        mixed = []
        # The backslash is one of the characters of the table
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        while source:
            # linear congruential step; the scaled seed selects which of the
            # remaining characters is moved to the output next
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated list of indices `fileId`.

        Every index is looked up in the character table shuffled by `seed`.
        """
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Return a list with one info dictionary per segment of the video.

        Returns None (after reporting the error) if the URL is invalid, the
        playlist JSON cannot be downloaded or it lacks the expected fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            video_format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if video_format is None or video_format == 'best':
                if 'hd2' in supported_format:
                    video_format = 'hd2'
                else:
                    video_format = 'flv'
                ext = u'flv'
            elif video_format == 'worst':
                video_format = 'mp4'
                ext = u'mp4'
            else:
                video_format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][video_format]
            keys = [s['k'] for s in config['data'][0]['segs'][video_format]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError):
            # IndexError: the 'data' list of the answer is empty
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # The two characters at (0-based) positions 8 and 9 of fileid hold
        # the segment number, so fileid[8:10] is replaced for every segment
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3297
3298
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report webpage download"""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        Returns None (after reporting the error) if the URL is invalid, the
        page cannot be downloaded, or the video URL, title or thumbnail
        cannot be found in it.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # _VALID_URL also accepts URLs without a scheme, which urlopen()
        # rejects
        if not re.match(r'https?://', url):
            url = 'http://' + url

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3357
3358
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the date of the entry"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader of the entry"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title of the video"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        The post page provides date, uploader, title and the address of the
        video page; the video page lists the links to the video itself.
        Returns None (after reporting the error) if the URL is invalid, a
        page cannot be downloaded, or the video page URL or the video links
        cannot be found.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)
        # The scheme is optional in _VALID_URL, but required for the request
        if not post_url.startswith('https://'):
            post_url = 'https://' + post_url

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Simulate clicking the image box to launch video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes on video page; every match is a
        # (number, url) tuple
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        matches = re.findall(pattern, webpage)
        if len(matches) == 0:
            self._downloader.report_error(u'unable to extract video links')
            return

        # Sort in resolution.  The numbers have to be compared as integers:
        # compared as strings, '720' would rank above '1080'.
        links = sorted(matches, key=lambda link: (int(link[0]), link[1]))

        # Choose the last of the sort, i.e. highest resolution, and only
        # keep the url. The resolution part in the tuple has no use anymore
        video_url = links[-1][-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3482
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos"""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        The media URL is derived from the path of the page; title, date and
        description are looked up in the downloaded page.  Returns None
        (after reporting the error) if the URL is invalid.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The path behind ".../video", without a trailing /index.html
        index_suffix = '/index.html'
        video_path = match.group(1)
        if video_path.endswith(index_suffix):
            video_path = video_path[:-len(index_suffix)]

        webpage = self._download_webpage(url, video_path)

        def find_prop(pattern, fallback=None):
            # First group of the first match, HTML-unescaped; the fallback
            # is returned as-is when the page does not match
            found = re.search(pattern, webpage)
            if not found:
                return fallback
            return unescapeHTML(found.group(1))

        # The last path component doubles as id and as fallback title
        short_id = video_path.rpartition('/')[2]
        og_title = find_prop(r'<meta property="og:title" content="(.*?)"', short_id)

        return [{
            'id': short_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_path + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': og_title.replace('NBA.com: ', ''),
            'uploader_date': find_prop(r'<b>Date:</b> (.*?)</div>'),
            'description': find_prop(r'<div class="description">(.*?)</h1>'),
        }]
3518
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page and turn its clips into info dictionaries.

        Returns a tuple (number of clips in the response, list of info
        dictionaries for the clips that have a video URL). Returns None
        after reporting an error when the download fails or when the
        response is not a JSON list.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if not video_url:
                # Clips without a file URL are counted but not returned
                continue
            video_extension = os.path.splitext(video_url)[1][1:]
            # 'start_time' begins with YYYY-MM-DD; drop the dashes
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            info.append({
                'id': video_id,
                'url': video_url,
                'title': video_title,
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                'upload_date': video_date,
                'ext': video_extension,
            })
        return (len(response), info)

    def _real_extract(self, url):
        """Return the info dictionaries for a channel archive or a broadcast.

        A URL with only a channel name is fetched page by page from the
        channel archive; a URL with a "/b/<id>" part is fetched with a
        single request. Returns None after reporting an error when url
        does not match _VALID_URL. If a page cannot be retrieved, paging
        stops and the entries collected so far are returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page = self._parse_page(page_url)
            if page is None:
                # _parse_page already reported the error; unpacking None
                # here would raise a TypeError instead.
                break
            page_count, page_info = page
            info.extend(page_info)
            # A short page means there is nothing left to fetch
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3601
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Build the info dictionary for the video page at url.

        Returns a one-element list with the info dictionary, or None after
        reporting an error when the URL is invalid or when the video URL
        or the title cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The regex captures the src of the second <source> inside <video>
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Without this return m.group() below would fail on None
            return
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player headline, fall back to the page <title>
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3640
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video and app pages."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so it needs re.VERBOSE
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Return a one-element list holding a playlist of the game's videos.

        The videos are read from the age-check video page of the game;
        movie entries, titles and thumbnails are paired up in page order.
        """
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = url_match.group('gameID')
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        header_match = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage)
        game_title = header_match.group('game_title')

        movie_matches = re.finditer(
            r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},",
            webpage)
        name_matches = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_matches = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)

        videos = []
        for movie, name, thumb in zip(movie_matches, name_matches, thumb_matches):
            movie_id = movie.group('videoID')
            movie_url = movie.group('videoURL')
            if not movie_url:
                self._downloader.report_error(u'Cannot find video url for %s' % movie_id)
            videos.append({
                'id': movie_id,
                'url': movie_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3684
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the info dictionary for the recorded video at url.

        Returns a one-element list with the info dictionary. Raises
        ExtractorError when url does not match _VALID_URL or when the
        page has no title; a missing uploader is stored as None.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            # title is a mandatory field, so there is nothing to return
            raise ExtractorError(u'Unable to extract title')
        title = m.group('title')

        # uploader is optional: do not fail the extraction when it is absent
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = m.group('uploader') if m is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }
        return [info]
3706
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Build the info dictionary for the video page at url.

        Returns a one-element list with the info dictionary, or None after
        reporting an error when no video URL is found in the page.
        """
        page_bytes = compat_urllib_request.urlopen(url).read()
        page = page_bytes.decode('utf-8')

        src_match = re.search(r"""(http://hw-videos.*(?:mp4|flv))""", page)
        video_id = re.match(self._VALID_URL, url).group('id')

        if src_match is None:
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            return
        video_url = src_match.group()
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"""<title>(.*)</title>""", page)
        if title_match is None:
            title = 'World Start Hip Hop - %s' % time.ctime()
        else:
            title = title_match.group(1)

        # Pages without a thumbnail link get their title from the
        # "candytitles" element instead, when it is present.
        thumb_match = re.search(r"""rel="image_src" href="(.*)" />""", page)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3762
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Build the info dictionary from the JSON embedded in the show page.

        Returns a one-element list with the info dictionary. Raises
        ExtractorError when the metadata script is missing or its
        content is not valid JSON.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        gon_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if gon_match is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(gon_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension is whatever follows the last dot of the URL path
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3797
3798
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s'  % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' is req_format, or None."""
        for candidate in formats:
            if candidate["format"] == req_format:
                return candidate
        return None

    def _real_extract(self, url):
        """Return the info dictionaries selected by the 'format' parameter.

        Every link of the page's download list becomes one info
        dictionary. Depending on the downloader parameters the result is
        the first entry (no format or 'best'), the last one ('worst'),
        all of them ('-1' or 'all') or the entry with the requested
        format name. Returns None after listing the formats
        ('listformats') or after reporting an invalid URL or an
        unavailable format. Raises ExtractorError when the title or the
        download list cannot be extracted.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = clean_html(result.group('uploader').strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The fifth path component starts with "<size>_<bitrate>_"
            format_parts = path.split('/')[4].split('_')[:2]
            size = format_parts[0]
            bitrate = format_parts[1]
            format_name = "-".join(format_parts)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format_name,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            # Test the looked-up format itself: the regex match object
            # checked here before is never None at this point, which let
            # [None] be returned for an unknown format.
            if selected is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [selected]
3915
3916
3917
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Build the info dictionary for the video page at url.

        The title is taken from the URL itself. Returns a one-element
        list with the info dictionary, or None after reporting an error
        when the URL is invalid or when the video URL or the upload date
        cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the upload date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # This used to be reported as a title failure
            self._downloader.report_error(u'unable to extract upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3959
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Build the info dictionary via the video page and its embed page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error when url does not match _VALID_URL. Raises
        ExtractorError when the title, the embed page link or the video
        URL cannot be extracted.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # First page: the regular video page, used for the title and for
        # locating the embed page.
        page = self._download_webpage(url, url_match.group('videoid'))

        title_match = re.search(r'<title>(?P<title>.*)</title>', page)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        # From here on the numeric id of the embed page is the video id
        video_id = embed_match.group('videoid')

        # Second page: the embed page, which carries the media URL
        embed_page = self._download_webpage(embed_page_url, video_id)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', embed_page)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
4005
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dictionary per track of the mix at url.

        The tracks are requested one after another from the play/next
        API until the response flags the last track. Raises
        ExtractorError when the URL is invalid or when the page carries
        no mix information.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if mix_match is None:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random number serves as the play session identifier
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']

        tracks = []
        api_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        for track_number in itertools.count(1):
            api_json = self._download_webpage(
                api_url, playlist_id,
                note=u'Downloading song information %s/%s' % (track_number, track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            # The following track is requested relative to the current one
            api_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return tracks
4049
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the info dictionary for the keek at url.

        The video and thumbnail URLs are derived from the id in the URL;
        title and uploader are read from the page. Returns a one-element
        list with the info dictionary. Raises ExtractorError when url
        does not match _VALID_URL or when the page has no title; a
        missing uploader is stored as None.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if m is None:
            # title is a mandatory field, so there is nothing to return
            raise ExtractorError(u'Unable to extract title')
        title = unescapeHTML(m.group('title'))

        # uploader is optional: do not fail the extraction when it is absent
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader')) if m is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }
        return [info]
4073
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so it needs re.VERBOSE
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Return a one-element list with the talk info or the playlist result."""
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if url_match.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = url_match.group('playlist_id')
        name = url_match.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        link_template = 'http://download.ted.com/talks/%s.mp4'
        return link_template % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'

        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        video_matches = re.finditer(video_RE, webpage, re.VERBOSE)
        name_matches = re.finditer(video_name_RE, webpage)
        playlist_title = re.search(playlist_RE, webpage).group('playlist_title')

        # Each talk link becomes a URL result; pairing with the <li>
        # matches limits the entries to the shorter of the two sequences.
        playlist_entries = [
            self.url_result('http://www.ted.com%s' % name_match.group('talk_url'), 'TED')
            for _, name_match in zip(video_matches, name_matches)
        ]
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)

        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'

        title = re.search(title_RE, webpage).group('title')
        thumb_match = re.search(thumb_RE, webpage)
        details = re.search(info_RE, webpage, re.VERBOSE)
        # The id from the page's talkDetails replaces the video_id argument
        talk_id = details.group('videoID')
        talk_url = self._talk_video_link(details.group('mediaSlug'))
        return {
            'id': talk_id,
            'url': talk_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail'),
        }
4151
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Build the info dictionary from the metadata XML of the video.

        Returns a one-element list with the info dictionary, or None after
        reporting an error when the metadata has no download URL or no
        title. When the metadata has no format id, the format defaults
        to the file extension.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fall back to the extension; the name "ext" used here before
            # was never defined and raised a NameError.
            video_format = extension
        else:
            video_format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4207
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Build the info dictionary from the video page and its XML file.

        Returns a one-element list with the info dictionary. Raises
        ExtractorError when the page has no title element.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_code = self._download_webpage(
            u'http://video2.spiegel.de/flash/' + video_id + u'.xml',
            video_id,
            note=u'Downloading XML',
            errnote=u'Failed to download XML')

        # Only the last child of the XML root is used
        variant = xml.etree.ElementTree.fromstring(xml_code)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4240
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme is optional; both http and https are accepted
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Build the info dictionary for the video page at url.

        Returns a one-element list with the info dictionary, or None after
        reporting an error when the URL is invalid or when the video URL
        or the title cannot be found in the page. Description and
        uploader are None when they are not found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            # Without this return m.group() below would fail on None
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4289
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de / mediathek.daserste.de."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # Matches the mediaCollection.addMediaStream(type, quality, "rtmp", "url", "...")
    # calls embedded in the page.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Returns None after reporting an error when the page carries no
        streams but contains the "fsk" marker.

        Raises ExtractorError when the title or a usable stream cannot be
        found, or when the selected stream does not have the expected form.
        """
        # determine video id from url: a numeric documentId wins over the
        # last path component
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_match = re.search(self._TITLE, html)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        title = title_match.group('title')

        streams = [s.groupdict() for s in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # Explicit check instead of assert, which is stripped under -O.
            if '"fsk"' not in html:
                raise ExtractorError(u'Cannot find any media streams')
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        if not default_streams:
            raise ExtractorError(u'Cannot find a stream of the default media type')
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'Unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'Unexpected video URL: %s' % stream['video_url'])
            info["url"] = stream["video_url"]
        return [info]
4329
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on tumblr.com blogs."""

    # Dots are escaped, the blog name cannot span a '/', the id must have at
    # least one digit, and the id may end the URL or be followed by '/slug'.
    _VALID_URL = r'http://(?P<blog_name>[^/]+?)\.tumblr\.com/(?:post|video)/(?P<id>\d+)(?:$|/)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Returns an empty list when the post page contains no video source.
        The thumbnail is optional and is None when no poster is found.

        Raises ExtractorError when the page title cannot be parsed.
        """
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the canonical post page, whatever URL form was given.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The page carries the markup with quotes written as a literal \x22.
        # The blog name is escaped so it is matched literally.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        m_thumb = re.search(re_thumb, webpage)
        if m_thumb is None:
            thumb = None
        else:
            thumb = m_thumb.group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        m_title = re.search(re_title, webpage)
        if m_title is None:
            raise ExtractorError(u'Cannot find title')
        title = unescapeHTML(m_title.group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4363
4364
def gen_extractors():
    """Return a list holding one new instance of every supported extractor.

    The order matters: the first extractor that matches a URL is the one
    that handles it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        GenericIE,
    )
    # Instantiate in declaration order so matching priority is preserved.
    return [extractor_class() for extractor_class in extractor_classes]
4421
def get_info_extractor(ie_name):
    """Return the module-level object named ie_name + 'IE'.

    The name is looked up in this module's globals, so a KeyError is raised
    when no such name exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]