Updated README
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; _ready is shadowed per-instance in __init__ so
    # initialization state is never shared between extractor instances.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Relies on the subclass defining a _VALID_URL class attribute;
        # calling this on a subclass without one raises AttributeError.
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: _real_initialize runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default name: the class name minus its trailing "IE" suffix
        # (e.g. FooIE -> "Foo"). Subclasses may shadow this property with a
        # plain class attribute instead (see YoutubeIE below).
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        # note=False suppresses the status line entirely (used by callers
        # that report progress themselves).
        if note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): `sys` is not imported at the top of this file; this
            # line appears to depend on `from .utils import *` re-exporting it —
            # confirm utils does so, otherwise this error path raises NameError.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the charset out of e.g. "text/html; charset=ISO-8859-1";
        # fall back to UTF-8 when the server does not declare one.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                # Request objects expose get_full_url(); plain URL strings do not.
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 so binary/odd bytes survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going even on mis-declared charsets.
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608.
    # They set the correct value of the '_type' key so the FileDownloader
    # knows how to process the result.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
181
182
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Format itags listed in order of quality (best first). The second list
    # prefers free (WebM) formats at equal quality for --prefer-free-formats.
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string used by --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the base-class IE_NAME property with a fixed name.
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match the (very permissive) video regex,
        # so explicitly defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to list available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a single subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} on success, or an
        (error_message, None) tuple on failure.

        NOTE(review): the dual return type (dict vs tuple) is how callers
        below detect errors, via isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Scrape (name, lang_code) pairs out of the XML track list.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages (used by --list-subs)."""
        sub_lang_list = self._get_available_subtitles(video_id)
        # NOTE(review): on error this receives the (message, None) tuple and
        # report_video_subtitles_available would crash on .keys() — presumably
        # unreachable in practice; confirm.
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): # error case: no subtitles available (see _get_available_subtitles)
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit --sub-lang > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of
        (error_message, sub_lang, sub) tuples (see _request_subtitle)."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): # error case: no subtitles available (see _get_available_subtitles)
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one 'itag : extension [dimensions]' line per format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language (forces English pages so the regexes below match)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the anti-forgery tokens the login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, the login failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-ish character video ID from a YouTube URL, or report
        an error and return None when the URL does not match _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # Group 2 is the bare video ID capture in _VALID_URL.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL (needed for rtmpdump)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try a few date layouts.
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except also swallows KeyboardInterrupt/
                    # SystemExit; after a successful parse the remaining formats
                    # fail harmlessly on the already-normalized YYYYMMDD string.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # The stream map is a comma-separated list of url-encoded dicts,
            # one per available format.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): 'sig' is not checked by the filter above — a stream
            # entry without it would raise KeyError here; confirm it is always
            # present in get_video_info responses.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Only consider formats at or below the requested quality cap.
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
677
678
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and confirm age.

        Metacafe gates mature content behind a family filter; posting the
        disclaimer form with 'filters': '0' disables it for this session.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube: ids of the form 'yt-XXXX' are
        # YouTube embeds, delegated to the YouTube extractor.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information.
        # NOTE: _download_webpage returns an already-decoded unicode string,
        # so regex groups taken from it below must NOT be .decode()d again
        # (doing so broke non-ASCII titles on py2 and raises on py3).
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token appended as __gda__) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob, which carries a JSON-ish
            # mediaData entry with the media URL and its access key.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # Unescape the JSON-style backslash-escaped slashes.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
789
790
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        """Extract the media URL, title, uploader and date from a video page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the URL path component up to the first '_' or '?'.
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Disable the family filter so age-restricted pages are served.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Probe the flashvars for the best available quality, highest first.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self.to_screen(u'Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # Unescape the JSON-style backslash-escaped slashes.
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner markup first, then official users.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            # lookin for official user
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; convert to the YYYYMMDD convention.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
874
875
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a photobucket page."""
        # The video id is the flv filename in the 'current' query parameter.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = m.group(1)
        video_extension = 'flv'

        # Fetch the raw page bytes; the regexes below operate on them directly.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The direct media URL lives in the video_src <link> tag.
        self.report_extraction(video_id)
        m = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # Title and uploader are both carried by the <title> tag.
        m = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = m.group(1).decode('utf-8')
        video_uploader = m.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
935
936
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Currently disabled (_WORKING = False). Non-'/watch/' URLs are first
    rewritten into the extractable English-language '/watch/' form via a
    single recursive call; the real extraction then scrapes the page's
    <meta> tags and finally fetches a playlist XML for the media URL.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # NOTE: raw bytes; the regexes below run on the undecoded page.
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            # The page embeds the canonical ids as ("id", "...") / ("vid", "...")
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once: the rewritten URL matches _VPAGE_URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # NOTE(review): group(1) is the 'people|profile' alternation, not the
        # uploader name in group(2) — looks like a latent bug; confirm upstream.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML: APP host + FULLPATH form the URL
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1074
1075
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a vimeo.com page via its embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize: force https and turn HLS redirect links into plain watch URLs.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON delimited by ' = {config:' ... ',assets:'.
        # IndexError: markers missing from the page; ValueError: invalid JSON
        # (json.JSONDecodeError subclasses ValueError). A bare except here
        # previously swallowed KeyboardInterrupt/SystemExit too.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id (last path component of the owner URL)
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description from the page markup (not in the config)
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1190
1191
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both the Plus7 catch-up pages and (partially) the live-stream
    pages; see the TODO in extractLiveStream.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its raw content, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex and map the requested groups to a dict.

        matchTuples is a list of (group_index, key, error_message); each
        group is stored under key, and a missing group reports error_message
        and aborts (returns None).
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None and re.search
        # raises TypeError — confirm whether callers rely on that propagating.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream player chain to locate the rtmp URL.

        TODO: video_url is computed but never returned, so _real_extract
        cannot actually download live streams; this has always been dead
        code and is kept for reference.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Raw strings below: the previous plain strings contained the invalid
        # escape '\.', which is a DeprecationWarning (future SyntaxError).
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r".*?)'.*?" +
                r'(http://.*?\.swf).*?' +
                r"(rtmp://.*?)'",
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve a Plus7 page through its videoref XML chain to an info dict."""
        video_lang = url.split('/')[-3]
        # Step 1: the player embed points at a videoref XML index.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the <video> entry matching the page language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the per-video XML carries id, title, date and the hd URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages end in index-N.html; extraction for them is incomplete
        # (see extractLiveStream) so nothing is returned in that branch.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1322
1323
1324 class GenericIE(InfoExtractor):
1325     """Generic last-resort information extractor."""
1326
1327     _VALID_URL = r'.*'
1328     IE_NAME = u'generic'
1329
1330     def __init__(self, downloader=None):
1331         InfoExtractor.__init__(self, downloader)
1332
1333     def report_download_webpage(self, video_id):
1334         """Report webpage download."""
1335         if not self._downloader.params.get('test', False):
1336             self._downloader.report_warning(u'Falling back on generic information extractor.')
1337         self.to_screen(u'%s: Downloading webpage' % video_id)
1338
1339     def report_following_redirect(self, new_url):
1340         """Report information extraction."""
1341         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1342
1343     def _test_redirect(self, url):
1344         """Check if it is a redirect, like url shorteners, in case return the new url."""
1345         class HeadRequest(compat_urllib_request.Request):
1346             def get_method(self):
1347                 return "HEAD"
1348
1349         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1350             """
1351             Subclass the HTTPRedirectHandler to make it use our
1352             HeadRequest also on the redirected URL
1353             """
1354             def redirect_request(self, req, fp, code, msg, headers, newurl):
1355                 if code in (301, 302, 303, 307):
1356                     newurl = newurl.replace(' ', '%20')
1357                     newheaders = dict((k,v) for k,v in req.headers.items()
1358                                       if k.lower() not in ("content-length", "content-type"))
1359                     return HeadRequest(newurl,
1360                                        headers=newheaders,
1361                                        origin_req_host=req.get_origin_req_host(),
1362                                        unverifiable=True)
1363                 else:
1364                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1365
1366         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1367             """
1368             Fallback to GET if HEAD is not allowed (405 HTTP error)
1369             """
1370             def http_error_405(self, req, fp, code, msg, headers):
1371                 fp.read()
1372                 fp.close()
1373
1374                 newheaders = dict((k,v) for k,v in req.headers.items()
1375                                   if k.lower() not in ("content-length", "content-type"))
1376                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1377                                                  headers=newheaders,
1378                                                  origin_req_host=req.get_origin_req_host(),
1379                                                  unverifiable=True))
1380
1381         # Build our opener
1382         opener = compat_urllib_request.OpenerDirector()
1383         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1384                         HTTPMethodFallback, HEADRedirectHandler,
1385                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1386             opener.add_handler(handler())
1387
1388         response = opener.open(HeadRequest(url))
1389         new_url = response.geturl()
1390
1391         if url == new_url:
1392             return False
1393
1394         self.report_following_redirect(new_url)
1395         return new_url
1396
1397     def _real_extract(self, url):
1398         new_url = self._test_redirect(url)
1399         if new_url: return [self.url_result(new_url)]
1400
1401         video_id = url.split('/')[-1]
1402         try:
1403             webpage = self._download_webpage(url, video_id)
1404         except ValueError as err:
1405             # since this is the last-resort InfoExtractor, if
1406             # this error is thrown, it'll be thrown here
1407             self._downloader.report_error(u'Invalid URL: %s' % url)
1408             return
1409
1410         self.report_extraction(video_id)
1411         # Start with something easy: JW Player in SWFObject
1412         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1413         if mobj is None:
1414             # Broaden the search a little bit
1415             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1416         if mobj is None:
1417             # Broaden the search a little bit: JWPlayer JS loader
1418             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1419         if mobj is None:
1420             self._downloader.report_error(u'Invalid URL: %s' % url)
1421             return
1422
1423         # It's possible that one of the regexes
1424         # matched, but returned an empty group:
1425         if mobj.group(1) is None:
1426             self._downloader.report_error(u'Invalid URL: %s' % url)
1427             return
1428
1429         video_url = compat_urllib_parse.unquote(mobj.group(1))
1430         video_id = os.path.basename(video_url)
1431
1432         # here's a fun little line of code for you:
1433         video_extension = os.path.splitext(video_id)[1][1:]
1434         video_id = os.path.splitext(video_id)[0]
1435
1436         # it's tempting to parse this further, but you would
1437         # have to take into account all the variations like
1438         #   Video Title - Site Name
1439         #   Site Name | Video Title
1440         #   Video Title - Tagline | Site Name
1441         # and so on and so forth; it's just not practical
1442         mobj = re.search(r'<title>(.*)</title>', webpage)
1443         if mobj is None:
1444             self._downloader.report_error(u'unable to extract title')
1445             return
1446         video_title = mobj.group(1)
1447
1448         # video uploader is domain name
1449         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1450         if mobj is None:
1451             self._downloader.report_error(u'unable to extract title')
1452             return
1453         video_uploader = mobj.group(1)
1454
1455         return [{
1456             'id':       video_id,
1457             'url':      video_url,
1458             'uploader': video_uploader,
1459             'upload_date':  None,
1460             'title':    video_title,
1461             'ext':      video_extension,
1462         }]
1463
1464
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles 'ytsearch:term', 'ytsearchN:term' and 'ytsearchall:term' by
    paging through the GData search API (50 results per request) and
    returning the matching watch-page URLs as url results.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # BUG FIX: split on the first ':' only — a plain split(':') raised
        # ValueError (too many values to unpack) for search terms that
        # themselves contain a colon, e.g. 'ytsearch5:foo: bar'.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the results were previously computed but silently
            # discarded (missing return), so 'ytsearchall:' yielded nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The feed may hold fewer items than requested; stop paging
            # once the reported total is reached.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1538
1539
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles 'gvsearch:term', 'gvsearchN:term' and 'gvsearchall:term' by
    scraping the search result pages and feeding each found videoplay URL
    to the downloader directly.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # BUG FIX: split on the first ':' only — a plain split(':') raised
        # ValueError when the search term itself contained a colon.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated in first-seen order
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link means the result set is exhausted
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1620
1621
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles 'yvsearch:term', 'yvsearchN:term' and 'yvsearchall:term' by
    scraping the search result pages and feeding each found watch URL to
    the downloader directly.
    """

    # Marked broken upstream; kept for reference.
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # BUG FIX: split on the first ':' only — a plain split(':') raised
        # ValueError when the search term itself contained a colon.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated in first-seen order
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link means the result set is exhausted
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1706
1707
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through the GData playlist feed and returns a playlist result
    with the contained watch URLs in playlist order.
    """

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the inherited
        # suitable() (which matches without that flag) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # BUG FIX: initialize up front; previously an empty playlist (or a
        # first page without 'entry') broke out of the loop before the
        # assignment below and hit a NameError at the final return.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            # Keep (position, url) pairs so the list can be sorted into
            # playlist order below; entries without 'content' are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1790
1791
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is fetched as plain HTML; subsequent pages use the
    # JSON channel_ajax endpoint below.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Presence of this marker in a page means more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the unique video ids found in an HTML fragment, in first-seen order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        """Collect all video ids of a channel and return them as a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # Page 1 is served as plain HTML.
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(url)
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)

                # The ajax response wraps the video list HTML in 'content_html'.
                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # 'load_more_widget_html' holds the paging widget; when the
                # load-more marker disappears from it, we are on the last page.
                if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
1864
1865
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user and return them as a playlist result."""
        # Pull the username out of the URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData uploads feed caps each response at _GDATA_PAGE_SIZE
        # entries, so request consecutive pages until one comes back short
        # — a short page means the feed is exhausted.
        video_ids = []
        for page_idx in itertools.count(0):
            start_index = page_idx * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, de-duplicated in first-seen order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)

            video_ids.extend(page_ids)

            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1936
1937
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric users_id from the user's page, then pages
    through the mobile episode-list API (12 entries per page) and
    returns the episode URLs as a playlist result.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # BUG FIX: a page without data-users-id previously crashed with an
        # uncaught AttributeError on mobj.group(1) (the surrounding except
        # clause only caught network errors); report a proper error instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract users_id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the other error handlers
                # in this file (was plain str here).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated in first-seen order.
            # BUG FIX: compare the unescaped id — the old code checked the
            # raw match against a list of unescaped ids, so ids containing
            # HTML entities were never de-duplicated.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2016
2017
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % file_id)

    def _real_extract(self, url):
        """Simulate pressing 'Free download' and extract the real file URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        # NOTE(review): `webpage` is raw bytes (no .decode() above); the str
        # regex patterns and the .decode('utf-8') calls below assume
        # Python 2 str semantics — confirm before running under Python 3.
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice and
                # surface it to the user.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2072
2073
2074 class FacebookIE(InfoExtractor):
2075     """Information Extractor for Facebook"""
2076
2077     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2078     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2079     _NETRC_MACHINE = 'facebook'
2080     IE_NAME = u'facebook'
2081
2082     def report_login(self):
2083         """Report attempt to log in."""
2084         self.to_screen(u'Logging in')
2085
2086     def _real_initialize(self):
2087         if self._downloader is None:
2088             return
2089
2090         useremail = None
2091         password = None
2092         downloader_params = self._downloader.params
2093
2094         # Attempt to use provided username and password or .netrc data
2095         if downloader_params.get('username', None) is not None:
2096             useremail = downloader_params['username']
2097             password = downloader_params['password']
2098         elif downloader_params.get('usenetrc', False):
2099             try:
2100                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2101                 if info is not None:
2102                     useremail = info[0]
2103                     password = info[2]
2104                 else:
2105                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2106             except (IOError, netrc.NetrcParseError) as err:
2107                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2108                 return
2109
2110         if useremail is None:
2111             return
2112
2113         # Log in
2114         login_form = {
2115             'email': useremail,
2116             'pass': password,
2117             'login': 'Log+In'
2118             }
2119         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2120         try:
2121             self.report_login()
2122             login_results = compat_urllib_request.urlopen(request).read()
2123             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2124                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2125                 return
2126         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2127             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2128             return
2129
2130     def _real_extract(self, url):
2131         mobj = re.match(self._VALID_URL, url)
2132         if mobj is None:
2133             self._downloader.report_error(u'invalid URL: %s' % url)
2134             return
2135         video_id = mobj.group('ID')
2136
2137         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2138         webpage = self._download_webpage(url, video_id)
2139
2140         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2141         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2142         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2143         if not m:
2144             raise ExtractorError(u'Cannot parse data')
2145         data = dict(json.loads(m.group(1)))
2146         params_raw = compat_urllib_parse.unquote(data['params'])
2147         params = json.loads(params_raw)
2148         video_data = params['video_data'][0]
2149         video_url = video_data.get('hd_src')
2150         if not video_url:
2151             video_url = video_data['sd_src']
2152         if not video_url:
2153             raise ExtractorError(u'Cannot find video URL')
2154         video_duration = int(video_data['video_duration'])
2155         thumbnail = video_data['thumbnail_src']
2156
2157         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2158         if not m:
2159             raise ExtractorError(u'Cannot find title in webpage')
2160         video_title = unescapeHTML(m.group(1))
2161
2162         info = {
2163             'id': video_id,
2164             'title': video_title,
2165             'url': video_url,
2166             'ext': 'mp4',
2167             'duration': video_duration,
2168             'thumbnail': thumbnail,
2169         }
2170         return [info]
2171
2172
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pattern used to pull the filename extension out of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        Handles three cases: '/play/' embed URLs (resolved to the real
        page via redirect, then re-extracted), direct media responses
        (Content-Type video/*), and regular pages queried through the
        JSON skin of the site API.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # '/play/' URLs are embeds; the redirect target's fragment carries
        # the real file reference, from which the id is recovered.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Append the JSON-skin API parameters with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode on the path component assumes a
                # Python 2 byte string; on Python 3 str this would raise —
                # confirm how the compat layer is expected to handle it.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader so the
                    # response body is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # NOTE: urlh was bound in the try block above; it is only
                # reached here when that request succeeded.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert the site's 'MM-DD-YY HH:MM(am/pm)' stamp to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2269
2270
2271 class MyVideoIE(InfoExtractor):
2272     """Information Extractor for myvideo.de."""
2273
2274     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2275     IE_NAME = u'myvideo'
2276
2277     def __init__(self, downloader=None):
2278         InfoExtractor.__init__(self, downloader)
2279
2280     def _real_extract(self,url):
2281         mobj = re.match(self._VALID_URL, url)
2282         if mobj is None:
2283             self._download.report_error(u'invalid URL: %s' % url)
2284             return
2285
2286         video_id = mobj.group(1)
2287
2288         # Get video webpage
2289         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2290         webpage = self._download_webpage(webpage_url, video_id)
2291
2292         self.report_extraction(video_id)
2293         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2294                  webpage)
2295         if mobj is None:
2296             self._downloader.report_error(u'unable to extract media URL')
2297             return
2298         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2299
2300         mobj = re.search('<title>([^<]+)</title>', webpage)
2301         if mobj is None:
2302             self._downloader.report_error(u'unable to extract title')
2303             return
2304
2305         video_title = mobj.group(1)
2306
2307         return [{
2308             'id':       video_id,
2309             'url':      video_url,
2310             'uploader': None,
2311             'upload_date':  None,
2312             'title':    video_title,
2313             'ext':      u'flv',
2314         }]
2315
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates in ascending order; the last one is the default pick.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container per bitrate (all mp4 at present).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate, used only by --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in verbose mode and must
        # be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media player configuration."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrate/extension/resolution table."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part of an episode or clip.

        Flow: resolve shortname/newest-episode URLs (following the site's
        redirect), find the mtvnservices feed URI in the page, download the
        MRSS index, then for each item fetch its player configuration,
        select a bitrate and rewrite the RTMP URL to a plain HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # ':tds' style abbreviations map to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Empty episode part means "download the newest episode": the
            # site redirects the bare full-episodes URL to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-match against the redirect target to learn the episode name.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # MRSS index listing every part (item) of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid looks like '...:<show>.com:...:<shortMediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the player config.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL into the equivalent plain-HTTP one
            # served from the llnwd.net origin.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2507
2508
2509 class EscapistIE(InfoExtractor):
2510     """Information extractor for The Escapist """
2511
2512     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2513     IE_NAME = u'escapist'
2514
2515     def report_config_download(self, showName):
2516         self.to_screen(u'%s: Downloading configuration' % showName)
2517
2518     def _real_extract(self, url):
2519         mobj = re.match(self._VALID_URL, url)
2520         if mobj is None:
2521             self._downloader.report_error(u'invalid URL: %s' % url)
2522             return
2523         showName = mobj.group('showname')
2524         videoId = mobj.group('episode')
2525
2526         self.report_extraction(showName)
2527         try:
2528             webPage = compat_urllib_request.urlopen(url)
2529             webPageBytes = webPage.read()
2530             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2531             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2532         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2533             self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
2534             return
2535
2536         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2537         description = unescapeHTML(descMatch.group(1))
2538         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2539         imgUrl = unescapeHTML(imgMatch.group(1))
2540         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2541         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2542         configUrlMatch = re.search('config=(.*)$', playerUrl)
2543         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2544
2545         self.report_config_download(showName)
2546         try:
2547             configJSON = compat_urllib_request.urlopen(configUrl)
2548             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2549             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2550         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2551             self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2552             return
2553
2554         # Technically, it's JavaScript, not JSON
2555         configJSON = configJSON.replace("'", '"')
2556
2557         try:
2558             config = json.loads(configJSON)
2559         except (ValueError,) as err:
2560             self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2561             return
2562
2563         playlist = config['playlist']
2564         videoUrl = playlist[1]['url']
2565
2566         info = {
2567             'id': videoId,
2568             'url': videoUrl,
2569             'uploader': showName,
2570             'upload_date': None,
2571             'title': showName,
2572             'ext': 'mp4',
2573             'thumbnail': imgUrl,
2574             'description': description,
2575             'player_url': playerUrl,
2576         }
2577
2578         return [info]
2579
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Extractor is currently marked as broken.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract video information via the site's two-step XML API.

        First fetches the metadata XML (title, description, thumbnail,
        manifest URL), then the Adobe HDS (f4m) manifest from which the
        fragment URL is assembled.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Step 1: metadata XML describing the video.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            # Any missing element means the metadata layout changed.
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Step 2: the HDS manifest for the actual media.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Assemble the first fragment URL from the manifest host and ids.
        # NOTE(review): video_id[:-2] strips the last two characters of the
        # manifest id — presumably a suffix; confirm against the f4m
        # manifests actually served.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2646
2647
2648 class XVideosIE(InfoExtractor):
2649     """Information extractor for xvideos.com"""
2650
2651     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2652     IE_NAME = u'xvideos'
2653
2654     def _real_extract(self, url):
2655         mobj = re.match(self._VALID_URL, url)
2656         if mobj is None:
2657             self._downloader.report_error(u'invalid URL: %s' % url)
2658             return
2659         video_id = mobj.group(1)
2660
2661         webpage = self._download_webpage(url, video_id)
2662
2663         self.report_extraction(video_id)
2664
2665
2666         # Extract video URL
2667         mobj = re.search(r'flv_url=(.+?)&', webpage)
2668         if mobj is None:
2669             self._downloader.report_error(u'unable to extract video url')
2670             return
2671         video_url = compat_urllib_parse.unquote(mobj.group(1))
2672
2673
2674         # Extract title
2675         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2676         if mobj is None:
2677             self._downloader.report_error(u'unable to extract video title')
2678             return
2679         video_title = mobj.group(1)
2680
2681
2682         # Extract video thumbnail
2683         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2684         if mobj is None:
2685             self._downloader.report_error(u'unable to extract video thumbnail')
2686             return
2687         video_thumbnail = mobj.group(0)
2688
2689         info = {
2690             'id': video_id,
2691             'url': video_url,
2692             'uploader': None,
2693             'upload_date': None,
2694             'title': video_title,
2695             'ext': 'flv',
2696             'thumbnail': video_thumbnail,
2697             'description': None,
2698         }
2699
2700         return [info]
2701
2702
2703 class SoundcloudIE(InfoExtractor):
2704     """Information extractor for soundcloud.com
2705        To access the media, the uid of the song and a stream token
2706        must be extracted from the page source and the script must make
2707        a request to media.soundcloud.com/crossdomain.xml. Then
2708        the media can be grabbed by requesting from an url composed
2709        of the stream token and uid
2710      """
2711
2712     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2713     IE_NAME = u'soundcloud'
2714
2715     def __init__(self, downloader=None):
2716         InfoExtractor.__init__(self, downloader)
2717
2718     def report_resolve(self, video_id):
2719         """Report information extraction."""
2720         self.to_screen(u'%s: Resolving id' % video_id)
2721
2722     def _real_extract(self, url):
2723         mobj = re.match(self._VALID_URL, url)
2724         if mobj is None:
2725             self._downloader.report_error(u'invalid URL: %s' % url)
2726             return
2727
2728         # extract uploader (which is in the url)
2729         uploader = mobj.group(1)
2730         # extract simple title (uploader + slug of song title)
2731         slug_title =  mobj.group(2)
2732         simple_title = uploader + u'-' + slug_title
2733
2734         self.report_resolve('%s/%s' % (uploader, slug_title))
2735
2736         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2737         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2738         request = compat_urllib_request.Request(resolv_url)
2739         try:
2740             info_json_bytes = compat_urllib_request.urlopen(request).read()
2741             info_json = info_json_bytes.decode('utf-8')
2742         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2743             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2744             return
2745
2746         info = json.loads(info_json)
2747         video_id = info['id']
2748         self.report_extraction('%s/%s' % (uploader, slug_title))
2749
2750         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2751         request = compat_urllib_request.Request(streams_url)
2752         try:
2753             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2754             stream_json = stream_json_bytes.decode('utf-8')
2755         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2756             self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2757             return
2758
2759         streams = json.loads(stream_json)
2760         mediaURL = streams['http_mp3_128_url']
2761
2762         return [{
2763             'id':       info['id'],
2764             'url':      mediaURL,
2765             'uploader': info['user']['username'],
2766             'upload_date':  info['created_at'],
2767             'title':    info['title'],
2768             'ext':      u'mp3',
2769             'description': info['description'],
2770         }]
2771
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set page through the Soundcloud API and return one
        info dict per track in the set."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # The resolve endpoint turns the public page URL into API metadata.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Ask the streams endpoint for the direct media URL of this track.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2848
2849
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP stream URL, title and description from an
        InfoQ presentation page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds the base64-encoded stream path
        # in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; keep a placeholder when absent)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # rsplit: the filename may itself contain dots; only the last one
        # separates the extension (a plain split would raise ValueError on
        # the two-value unpack).
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2899
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry maps bitrates to url lists, pick the requested
        bitrate (or the highest available); when the entry is a plain url
        list there is no bitrate info and it is returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # dead link: try the next candidate
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud JSON API for a cloudcast and return its info."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url; the match groups are already
        # text strings (the old .decode('utf-8') calls crashed on Python 3)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON; urlopen returns bytes, so decode explicitly
        # (json.loads only accepts bytes from Python 3.6 on)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
        }]
3010
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading webpage' % objid)

    def _real_extract(self, url):
        """Dispatch on the URL shape: a specific video, a course page, or
        the site root. Course and root pages expand recursively through
        self.extract() on each referenced page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Raw string: '\?' in a plain literal is an invalid escape
            # sequence on modern Python (the regex itself is unchanged).
            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # decode: urlopen yields bytes, but re.findall below uses a
                # text pattern (mixing the two is a TypeError on Python 3)
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3118
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract song metadata from the page, then fetch the mediaGen XML
        to pick a rendition URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns decoded text; the old
        # .decode('iso-8859-1') calls on these groups crashed on Python 3.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3194
3195
class YoukuIE(InfoExtractor):
    # Extractor for v.youku.com. The site obfuscates its file ids with a
    # seeded alphabet shuffle that must be replicated exactly (see
    # _get_file_ID_mix_string).
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % file_id)

    def _gen_sid(self):
        """Build a pseudo-random session id: millisecond timestamp followed
        by two random numbers (mirrors the site's player)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the id alphabet from the given seed.

        The returned list is the permutation table used by _get_file_id to
        decode the obfuscated file id; the arithmetic must match the site's
        JavaScript exactly, so do not reorder or simplify these steps.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step, then pick (and remove) one character
            # so each source char appears exactly once in the output.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated obfuscated file id via the seed table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        # Fetch the playlist JSON, decode the file id, then emit one info
        # dict per video segment.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested format onto Youku's stream names.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3301
3302
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url):
        """Download the video page and pull out the flv URL, the title and
        the thumbnail URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page contents
        try:
            page = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # The direct media URL is percent-encoded inside the player config.
        url_match = re.search(self.VIDEO_URL_RE, page)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, page)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return

        thumb_match = re.search(self.VIDEO_THUMB_RE, page)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': title_match.group(1),
            'ext': 'flv',
            'thumbnail': thumb_match.group(1),
            'description': None,
        }]
3361
3362
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video behind a Google+ post: scrape the post page
        for metadata, then the photo/video page for the stream URLs."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        # (raw string: '\:' in a plain literal is an invalid escape sequence
        # on modern Python; the regex is unchanged)
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # was missing: without it the next line crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # was missing: without it links[-1] below raised IndexError
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3486
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Derive the CDN video URL from the page path and scrape the
        page's meta tags for title, date and description."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The media URL can be built directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (HTML-unescaped) capture of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # key was misspelled 'uploader_date', which nothing consumes;
            # 'upload_date' is the documented field name
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3522
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, list_of_infos).

        BUG FIX: on download or API errors this used to return None, which
        made the tuple unpacking in _real_extract raise a TypeError; it now
        always returns a (count, list) pair so the caller can stop cleanly.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # On errors the API answers with a JSON object, not a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-ish; strip dashes from YYYY-MM-DD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract either one broadcast or a whole (paged) channel archive."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Only the channel name matched: iterate its archive pages.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page (or a single-broadcast query) means we are done.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3605
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract video URL, title and description from a Funny or Die page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUG FIX: previously fell through to m.group('url') on None and
            # crashed with an AttributeError after reporting; bail out here.
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> if the player heading is absent.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # BUG FIX: same fall-through crash as above; return instead.
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3644
class SteamIE(InfoExtractor):
    """Information extractor for Steam store game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so match with re.VERBOSE here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist of all trailers found on a game's video page."""
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = url_match.group('gameID')
        # Request the age-gated video page with a canned birth date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # The three regexes walk the page in parallel: the JS movie table,
        # the visible titles and the thumbnail images, zipped together.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        movie_iter = re.finditer(urlRE, webpage)
        title_iter = re.finditer(namesRE, webpage)
        thumb_iter = re.finditer(thumbsRE, webpage)
        videos = []
        for movie, name, thumb in zip(movie_iter, title_iter, thumb_iter):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3688
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract the CDN flv URL plus title/uploader for a recording."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The flv lives at a predictable CDN path derived from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3710
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the direct media URL, title and thumbnail from the page."""
        src_url_re = r"""(http://hw-videos.*(?:mp4|flv))"""

        # NOTE(review): fetches the page directly instead of going through
        # self._download_webpage like the other extractors here.
        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        video_id = re.match(self._VALID_URL, url).group('id')

        src_match = re.search(src_url_re, webpage_src)
        if src_match is None:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return
        video_url = src_match.group()
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"""<title>(.*)</title>""", webpage_src)
        if title_match is not None:
            title = title_match.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        thumb_match = re.search(r"""rel="image_src" href="(.*)" />""", webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)
            thumbnail = None

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3766
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract show metadata from the embedded gon.show JSON blob."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        meta_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if meta_match is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(meta_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Ask the stream server for the 256 kbit/s rendition.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3801
3802
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Extract all download formats and select per the user's request."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site is age-gated; claim the check already passed via cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; warn but continue without it)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional as well)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size+bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this previously tested the stale 'result' match object
            # from the regex searches above, so a failed format lookup
            # silently returned [None]; test the actual lookup result.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3919
3920
3921
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the flv URL and upload date; the title comes from the URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the message wrongly said 'unable to extract video
            # title' (copy-paste); this branch is about the upload date.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3963
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the watch page to its embed page and pull the flv URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The watch page only embeds the real player page.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        # The numeric embed id replaces the slug from the original URL.
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
4009
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the play/next API until the mix reports its last track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id keeps the play API happy.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4053
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build CDN URLs from the id and scrape title/uploader from the page."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail live at predictable CDN locations.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title = unescapeHTML(
            re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage).group('title'))
        uploader = clean_html(
            re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage).group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4077
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    Handles two URL shapes via one verbose regex: a single talk
    (type_talk) and a playlist of talks (type_playlist).
    """
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden so that the verbose _VALID_URL is compiled with
        # re.VERBOSE; presumably the base implementation does not — confirm.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL matched:
        # a single talk is extracted directly, a playlist is expanded.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # One <li> per talk; relies on the attribute order in the markup
        # (data-id, then data-playlist_item_id, then data-mediaslug).
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # The two finditer streams are consumed in lockstep below; this
        # assumes talk entries and talk titles appear in the same order.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each talk is handed back to this extractor as a plain talk URL.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and the media
        # slug needed to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4155
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its metadata XML API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the per-video metadata XML and extract URL/title/format."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this fallback referenced the undefined name 'ext'
            # (NameError); the file extension is held in 'extension'.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4211
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Read the per-video XML manifest and pick its final rendition."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document lists the available renditions.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child element describes the rendition we download.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4244
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract video URL, title, description and uploader from the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: previously fell through to m.group('title') on None
            # and crashed with an AttributeError after reporting; bail out.
            self._downloader.report_error(u'Cannot find video title')
            return
        # Strip the site branding from the og:title.
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4293
class ARDIE(InfoExtractor):
    """Information extractor for ARD Mediathek / daserste.de pages."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when the URL has one.
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1) if numid else mobj.group('video_id')

        # Title and the stream list are both scraped from the page HTML.
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [match.groupdict() for match in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams at all: assumed to be an age-restricted ("fsk") video.
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # Keep the default media type (0) and take the highest quality of it.
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # Two delivery modes exist: an RTMP stream or a plain HTTP download.
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # rtmpdump needs both the base URL and the mp4: play path.
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4333
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on Tumblr blogs."""

    # Fix: literal dots are escaped so the pattern cannot match look-alike
    # hosts (the old pattern's bare '.' matched any character).
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Canonicalize to the /post/ URL; it embeds the same player markup.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL is embedded in \x22-escaped javascript. re.escape the
        # blog name so any regex metacharacters in it are matched literally.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail; guard against pages
        # without one instead of crashing on None.group().
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        m_thumb = re.search(re_thumb, webpage)
        thumb = m_thumb.group('thumb').replace('\\', '') if m_thumb else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        m_title = re.search(re_title, webpage)
        if m_title is None:
            # Raise a clean extractor error rather than AttributeError.
            raise ExtractorError(u'Cannot find title')
        title = unescapeHTML(m_title.group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4367
4368
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Instantiate lazily from the ordered class tuple; more specific
    # extractors come first, GenericIE is the catch-all fallback.
    return [ie_class() for ie_class in (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        GenericIE,
    )]
4425
def get_info_extractor(ie_name):
    """Return the info extractor class named ie_name + 'IE' from this module."""
    return globals()['%sIE' % ie_name]