_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..utils import (
  16     compat_http_client,
  17     compat_urllib_error,
  18     compat_urllib_parse_urlparse,
  19     compat_urlparse,
  20     compat_str,
  21
  22     clean_html,
  23     compiled_regex_type,
  24     ExtractorError,
  25     float_or_none,
  26     int_or_none,
  27     RegexNotFoundError,
  28     sanitize_filename,
  29     unescapeHTML,
  30 )
# Sentinel meaning "the caller passed no default at all"; a dedicated object
# is used so that None remains usable as a real default value.
_NO_DEFAULT = object()
  32
  33
  34 class InfoExtractor(object):
  35     """Information Extractor class.
  36
  37     Information extractors are the classes that, given a URL, extract
  38     information about the video (or videos) the URL refers to. This
  39     information includes the real video URL, the video title, author and
  40     others. The information is stored in a dictionary which is then
  41     passed to the FileDownloader. The FileDownloader processes this
  42     information possibly downloading the video to the file system, among
  43     other possible outcomes.
  44
  45     The dictionaries must include the following fields:
  46
  47     id:             Video identifier.
  48     title:          Video title, unescaped.
  49
  50     Additionally, it must contain either a formats entry or a url one:
  51
  52     formats:        A list of dictionaries for each format available, ordered
  53                     from worst to best quality.
  54
  55                     Potential fields:
  56                     * url        Mandatory. The URL of the video file
  57                     * ext        Will be calculated from url if missing
  58                     * format     A human-readable description of the format
  59                                  ("mp4 container with h264/opus").
  60                                  Calculated from the format_id, width, height.
  61                                  and format_note fields if missing.
  62                     * format_id  A short description of the format
  63                                  ("mp4_h264_opus" or "19").
  64                                 Technically optional, but strongly recommended.
  65                     * format_note Additional info about the format
  66                                  ("3D" or "DASH video")
  67                     * width      Width of the video, if known
  68                     * height     Height of the video, if known
  69                     * resolution Textual description of width and height
  70                     * tbr        Average bitrate of audio and video in KBit/s
  71                     * abr        Average audio bitrate in KBit/s
  72                     * acodec     Name of the audio codec in use
  73                     * asr        Audio sampling rate in Hertz
  74                     * vbr        Average video bitrate in KBit/s
  75                     * vcodec     Name of the video codec in use
  76                     * container  Name of the container format
  77                     * filesize   The number of bytes, if known in advance
  78                     * filesize_approx  An estimate for the number of bytes
  79                     * player_url SWF Player URL (used for rtmpdump).
  80                     * protocol   The protocol that will be used for the actual
  81                                  download, lower-case.
  82                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  83                     * preference Order number of this format. If this field is
  84                                  present and not None, the formats get sorted
  85                                  by this field, regardless of all other values.
  86                                  -1 for default (order by other properties),
  87                                  -2 or smaller for less than default.
  88                     * quality    Order number of the video quality of this
  89                                  format, irrespective of the file format.
  90                                  -1 for default (order by other properties),
  91                                  -2 or smaller for less than default.
  92                     * source_preference  Order number for this video source
  93                                   (quality takes higher priority)
  94                                  -1 for default (order by other properties),
  95                                  -2 or smaller for less than default.
  96                     * http_referer  HTTP Referer header value to set.
  97                     * http_method  HTTP method to use for the download.
  98                     * http_headers  A dictionary of additional HTTP headers
  99                                  to add to the request.
 100                     * http_post_data  Additional data to send with a POST
 101                                  request.
 102     url:            Final video URL.
 103     ext:            Video filename extension.
 104     format:         The video format, defaults to ext (used for --get-format)
 105     player_url:     SWF Player URL (used for rtmpdump).
 106
 107     The following fields are optional:
 108
 109     display_id      An alternative identifier for the video, not necessarily
 110                     unique, but available before title. Typically, id is
 111                     something like "4234987", title "Dancing naked mole rats",
 112                     and display_id "dancing-naked-mole-rats"
 113     thumbnails:     A list of dictionaries, with the following entries:
 114                         * "url"
 115                         * "width" (optional, int)
 116                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
 118                                         deprecated)
 119     thumbnail:      Full URL to a video thumbnail image.
 120     description:    One-line video description.
 121     uploader:       Full name of the video uploader.
 122     timestamp:      UNIX timestamp of the moment the video became available.
 123     upload_date:    Video upload date (YYYYMMDD).
 124                     If not explicitly set, calculated from timestamp.
 125     uploader_id:    Nickname or id of the video uploader.
 126     location:       Physical location where the video was filmed.
 127     subtitles:      The subtitle file contents as a dictionary in the format
 128                     {language: subtitles}.
 129     duration:       Length of the video in seconds, as an integer.
 130     view_count:     How many users have watched the video on the platform.
 131     like_count:     Number of positive ratings of the video
 132     dislike_count:  Number of negative ratings of the video
 133     comment_count:  Number of comments on the video
 134     age_limit:      Age restriction for the video, as an integer (years)
 135     webpage_url:    The url to the video webpage, if given to youtube-dl it
 136                     should allow to get the same result again. (It will be set
 137                     by YoutubeDL if it's missing)
 138     categories:     A list of categories that the video falls in, for example
 139                     ["Sports", "Berlin"]
 140     is_live:        True, False, or None (=unknown). Whether this video is a
 141                     live stream that goes on instead of a fixed-length video.
 142
 143     Unless mentioned otherwise, the fields should be Unicode strings.
 144
 145     Unless mentioned otherwise, None is equivalent to absence of information.
 146
 147     Subclasses of this one should re-define the _real_initialize() and
 148     _real_extract() methods and define a _VALID_URL regexp.
 149     Probably, they should also be added to the list of extractors.
 150
 151     Finally, the _WORKING attribute should be set to False for broken IEs
 152     in order to warn the users and skip the tests.
 153     """
 154
 155     _ready = False
 156     _downloader = None
 157     _WORKING = True
 158
 159     def __init__(self, downloader=None):
 160         """Constructor. Receives an optional downloader."""
 161         self._ready = False
 162         self.set_downloader(downloader)
 163
 164     @classmethod
 165     def suitable(cls, url):
 166         """Receives a URL and returns True if suitable for this IE."""
 167
 168         # This does not use has/getattr intentionally - we want to know whether
 169         # we have cached the regexp for *this* class, whereas getattr would also
 170         # match the superclass
 171         if '_VALID_URL_RE' not in cls.__dict__:
 172             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 173         return cls._VALID_URL_RE.match(url) is not None
 174
 175     @classmethod
 176     def _match_id(cls, url):
 177         if '_VALID_URL_RE' not in cls.__dict__:
 178             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 179         m = cls._VALID_URL_RE.match(url)
 180         assert m
 181         return m.group('id')
 182
 183     @classmethod
 184     def working(cls):
 185         """Getter method for _WORKING."""
 186         return cls._WORKING
 187
 188     def initialize(self):
 189         """Initializes an instance (authentication, etc)."""
 190         if not self._ready:
 191             self._real_initialize()
 192             self._ready = True
 193
 194     def extract(self, url):
 195         """Extracts URL information and returns it in list of dicts."""
 196         self.initialize()
 197         return self._real_extract(url)
 198
 199     def set_downloader(self, downloader):
 200         """Sets the downloader for this IE."""
 201         self._downloader = downloader
 202
 203     def _real_initialize(self):
 204         """Real initialization process. Redefine in subclasses."""
 205         pass
 206
 207     def _real_extract(self, url):
 208         """Real extraction process. Redefine in subclasses."""
 209         pass
 210
 211     @classmethod
 212     def ie_key(cls):
 213         """A string for getting the InfoExtractor with get_info_extractor"""
 214         return cls.__name__[:-2]
 215
 216     @property
 217     def IE_NAME(self):
 218         return type(self).__name__[:-2]
 219
 220     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 221         """ Returns the response handle """
 222         if note is None:
 223             self.report_download_webpage(video_id)
 224         elif note is not False:
 225             if video_id is None:
 226                 self.to_screen('%s' % (note,))
 227             else:
 228                 self.to_screen('%s: %s' % (video_id, note))
 229         try:
 230             return self._downloader.urlopen(url_or_request)
 231         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 232             if errnote is False:
 233                 return False
 234             if errnote is None:
 235                 errnote = 'Unable to download webpage'
 236             errmsg = '%s: %s' % (errnote, compat_str(err))
 237             if fatal:
 238                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 239             else:
 240                 self._downloader.report_warning(errmsg)
 241                 return False
 242
 243     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 244         """ Returns a tuple (page content as string, URL handle) """
 245         # Strip hashes from the URL (#1038)
 246         if isinstance(url_or_request, (compat_str, str)):
 247             url_or_request = url_or_request.partition('#')[0]
 248
 249         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 250         if urlh is False:
 251             assert not fatal
 252             return False
 253         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 254         return (content, urlh)
 255
 256     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
 257         content_type = urlh.headers.get('Content-Type', '')
 258         webpage_bytes = urlh.read()
 259         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 260         if m:
 261             encoding = m.group(1)
 262         else:
 263             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 264                           webpage_bytes[:1024])
 265             if m:
 266                 encoding = m.group(1).decode('ascii')
 267             elif webpage_bytes.startswith(b'\xff\xfe'):
 268                 encoding = 'utf-16'
 269             else:
 270                 encoding = 'utf-8'
 271         if self._downloader.params.get('dump_intermediate_pages', False):
 272             try:
 273                 url = url_or_request.get_full_url()
 274             except AttributeError:
 275                 url = url_or_request
 276             self.to_screen('Dumping request to ' + url)
 277             dump = base64.b64encode(webpage_bytes).decode('ascii')
 278             self._downloader.to_screen(dump)
 279         if self._downloader.params.get('write_pages', False):
 280             try:
 281                 url = url_or_request.get_full_url()
 282             except AttributeError:
 283                 url = url_or_request
 284             basen = '%s_%s' % (video_id, url)
 285             if len(basen) > 240:
 286                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 287                 basen = basen[:240 - len(h)] + h
 288             raw_filename = basen + '.dump'
 289             filename = sanitize_filename(raw_filename, restricted=True)
 290             self.to_screen('Saving request to ' + filename)
 291             # Working around MAX_PATH limitation on Windows (see
 292             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 293             if os.name == 'nt':
 294                 absfilepath = os.path.abspath(filename)
 295                 if len(absfilepath) > 259:
 296                     filename = '\\\\?\\' + absfilepath
 297             with open(filename, 'wb') as outf:
 298                 outf.write(webpage_bytes)
 299
 300         try:
 301             content = webpage_bytes.decode(encoding, 'replace')
 302         except LookupError:
 303             content = webpage_bytes.decode('utf-8', 'replace')
 304
 305         if ('<title>Access to this site is blocked</title>' in content and
 306                 'Websense' in content[:512]):
 307             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 308             blocked_iframe = self._html_search_regex(
 309                 r'<iframe src="([^"]+)"', content,
 310                 'Websense information URL', default=None)
 311             if blocked_iframe:
 312                 msg += ' Visit %s for more details' % blocked_iframe
 313             raise ExtractorError(msg, expected=True)
 314
 315         return content
 316
 317     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 318         """ Returns the data of the page as a string """
 319         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 320         if res is False:
 321             return res
 322         else:
 323             content, _ = res
 324             return content
 325
 326     def _download_xml(self, url_or_request, video_id,
 327                       note='Downloading XML', errnote='Unable to download XML',
 328                       transform_source=None, fatal=True):
 329         """Return the xml as an xml.etree.ElementTree.Element"""
 330         xml_string = self._download_webpage(
 331             url_or_request, video_id, note, errnote, fatal=fatal)
 332         if xml_string is False:
 333             return xml_string
 334         if transform_source:
 335             xml_string = transform_source(xml_string)
 336         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 337
 338     def _download_json(self, url_or_request, video_id,
 339                        note='Downloading JSON metadata',
 340                        errnote='Unable to download JSON metadata',
 341                        transform_source=None,
 342                        fatal=True):
 343         json_string = self._download_webpage(
 344             url_or_request, video_id, note, errnote, fatal=fatal)
 345         if (not fatal) and json_string is False:
 346             return None
 347         if transform_source:
 348             json_string = transform_source(json_string)
 349         try:
 350             return json.loads(json_string)
 351         except ValueError as ve:
 352             errmsg = '%s: Failed to parse JSON ' % video_id
 353             if fatal:
 354                 raise ExtractorError(errmsg, cause=ve)
 355             else:
 356                 self.report_warning(errmsg + str(ve))
 357
 358     def report_warning(self, msg, video_id=None):
 359         idstr = '' if video_id is None else '%s: ' % video_id
 360         self._downloader.report_warning(
 361             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 362
 363     def to_screen(self, msg):
 364         """Print msg to screen, prefixing it with '[ie_name]'"""
 365         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 366
 367     def report_extraction(self, id_or_name):
 368         """Report information extraction."""
 369         self.to_screen('%s: Extracting information' % id_or_name)
 370
 371     def report_download_webpage(self, video_id):
 372         """Report webpage download."""
 373         self.to_screen('%s: Downloading webpage' % video_id)
 374
 375     def report_age_confirmation(self):
 376         """Report attempt to confirm age."""
 377         self.to_screen('Confirming age')
 378
 379     def report_login(self):
 380         """Report attempt to log in."""
 381         self.to_screen('Logging in')
 382
 383     #Methods for following #608
 384     @staticmethod
 385     def url_result(url, ie=None, video_id=None):
 386         """Returns a url that points to a page that should be processed"""
 387         #TODO: ie should be the class used for getting the info
 388         video_info = {'_type': 'url',
 389                       'url': url,
 390                       'ie_key': ie}
 391         if video_id is not None:
 392             video_info['id'] = video_id
 393         return video_info
 394     @staticmethod
 395     def playlist_result(entries, playlist_id=None, playlist_title=None):
 396         """Returns a playlist"""
 397         video_info = {'_type': 'playlist',
 398                       'entries': entries}
 399         if playlist_id:
 400             video_info['id'] = playlist_id
 401         if playlist_title:
 402             video_info['title'] = playlist_title
 403         return video_info
 404
 405     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 406         """
 407         Perform a regex search on the given string, using a single or a list of
 408         patterns returning the first matching group.
 409         In case of failure return a default value or raise a WARNING or a
 410         RegexNotFoundError, depending on fatal, specifying the field name.
 411         """
 412         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 413             mobj = re.search(pattern, string, flags)
 414         else:
 415             for p in pattern:
 416                 mobj = re.search(p, string, flags)
 417                 if mobj:
 418                     break
 419
 420         if os.name != 'nt' and sys.stderr.isatty():
 421             _name = '\033[0;34m%s\033[0m' % name
 422         else:
 423             _name = name
 424
 425         if mobj:
 426             # return the first matching group
 427             return next(g for g in mobj.groups() if g is not None)
 428         elif default is not _NO_DEFAULT:
 429             return default
 430         elif fatal:
 431             raise RegexNotFoundError('Unable to extract %s' % _name)
 432         else:
 433             self._downloader.report_warning('unable to extract %s; '
 434                 'please report this issue on http://yt-dl.org/bug' % _name)
 435             return None
 436
 437     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 438         """
 439         Like _search_regex, but strips HTML tags and unescapes entities.
 440         """
 441         res = self._search_regex(pattern, string, name, default, fatal, flags)
 442         if res:
 443             return clean_html(res).strip()
 444         else:
 445             return res
 446
 447     def _get_login_info(self):
 448         """
 449         Get the the login info as (username, password)
 450         It will look in the netrc file using the _NETRC_MACHINE value
 451         If there's no info available, return (None, None)
 452         """
 453         if self._downloader is None:
 454             return (None, None)
 455
 456         username = None
 457         password = None
 458         downloader_params = self._downloader.params
 459
 460         # Attempt to use provided username and password or .netrc data
 461         if downloader_params.get('username', None) is not None:
 462             username = downloader_params['username']
 463             password = downloader_params['password']
 464         elif downloader_params.get('usenetrc', False):
 465             try:
 466                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 467                 if info is not None:
 468                     username = info[0]
 469                     password = info[2]
 470                 else:
 471                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 472             except (IOError, netrc.NetrcParseError) as err:
 473                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 474
 475         return (username, password)
 476
 477     def _get_tfa_info(self):
 478         """
 479         Get the two-factor authentication info
 480         TODO - asking the user will be required for sms/phone verify
 481         currently just uses the command line option
 482         If there's no info available, return None
 483         """
 484         if self._downloader is None:
 485             return None
 486         downloader_params = self._downloader.params
 487
 488         if downloader_params.get('twofactor', None) is not None:
 489             return downloader_params['twofactor']
 490
 491         return None
 492
 493     # Helper functions for extracting OpenGraph info
 494     @staticmethod
 495     def _og_regexes(prop):
 496         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 497         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 498         template = r'<meta[^>]+?%s[^>]+?%s'
 499         return [
 500             template % (property_re, content_re),
 501             template % (content_re, property_re),
 502         ]
 503
 504     def _og_search_property(self, prop, html, name=None, **kargs):
 505         if name is None:
 506             name = 'OpenGraph %s' % prop
 507         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 508         if escaped is None:
 509             return None
 510         return unescapeHTML(escaped)
 511
 512     def _og_search_thumbnail(self, html, **kargs):
 513         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 514
 515     def _og_search_description(self, html, **kargs):
 516         return self._og_search_property('description', html, fatal=False, **kargs)
 517
 518     def _og_search_title(self, html, **kargs):
 519         return self._og_search_property('title', html, **kargs)
 520
 521     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 522         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 523         if secure:
 524             regexes = self._og_regexes('video:secure_url') + regexes
 525         return self._html_search_regex(regexes, html, name, **kargs)
 526
 527     def _og_search_url(self, html, **kargs):
 528         return self._og_search_property('url', html, **kargs)
 529
 530     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 531         if display_name is None:
 532             display_name = name
 533         return self._html_search_regex(
 534             r'''(?ix)<meta
 535                     (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
 536                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 537             html, display_name, fatal=fatal, **kwargs)
 538
 539     def _dc_search_uploader(self, html):
 540         return self._html_search_meta('dc.creator', html, 'uploader')
 541
 542     def _rta_search(self, html):
 543         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 544         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 545                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 546                      html):
 547             return 18
 548         return 0
 549
 550     def _media_rating_search(self, html):
 551         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 552         rating = self._html_search_meta('rating', html)
 553
 554         if not rating:
 555             return None
 556
 557         RATING_TABLE = {
 558             'safe for kids': 0,
 559             'general': 8,
 560             '14 years': 14,
 561             'mature': 17,
 562             'restricted': 19,
 563         }
 564         return RATING_TABLE.get(rating.lower(), None)
 565
 566     def _twitter_search_player(self, html):
 567         return self._html_search_meta('twitter:player', html,
 568             'twitter card player')
 569
 570     def _sort_formats(self, formats):
 571         if not formats:
 572             raise ExtractorError('No video formats found')
 573
 574         def _formats_key(f):
 575             # TODO remove the following workaround
 576             from ..utils import determine_ext
 577             if not f.get('ext') and 'url' in f:
 578                 f['ext'] = determine_ext(f['url'])
 579
 580             preference = f.get('preference')
 581             if preference is None:
 582                 proto = f.get('protocol')
 583                 if proto is None:
 584                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 585
 586                 preference = 0 if proto in ['http', 'https'] else -0.1
 587                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 588                     preference -= 0.5
 589
 590             if f.get('vcodec') == 'none':  # audio only
 591                 if self._downloader.params.get('prefer_free_formats'):
 592                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 593                 else:
 594                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 595                 ext_preference = 0
 596                 try:
 597                     audio_ext_preference = ORDER.index(f['ext'])
 598                 except ValueError:
 599                     audio_ext_preference = -1
 600             else:
 601                 if self._downloader.params.get('prefer_free_formats'):
 602                     ORDER = ['flv', 'mp4', 'webm']
 603                 else:
 604                     ORDER = ['webm', 'flv', 'mp4']
 605                 try:
 606                     ext_preference = ORDER.index(f['ext'])
 607                 except ValueError:
 608                     ext_preference = -1
 609                 audio_ext_preference = 0
 610
 611             return (
 612                 preference,
 613                 f.get('quality') if f.get('quality') is not None else -1,
 614                 f.get('height') if f.get('height') is not None else -1,
 615                 f.get('width') if f.get('width') is not None else -1,
 616                 ext_preference,
 617                 f.get('tbr') if f.get('tbr') is not None else -1,
 618                 f.get('vbr') if f.get('vbr') is not None else -1,
 619                 f.get('abr') if f.get('abr') is not None else -1,
 620                 audio_ext_preference,
 621                 f.get('filesize') if f.get('filesize') is not None else -1,
 622                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 623                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 624                 f.get('format_id'),
 625             )
 626         formats.sort(key=_formats_key)
 627
 628     def http_scheme(self):
 629         """ Either "http:" or "https:", depending on the user's preferences """
 630         return (
 631             'http:'
 632             if self._downloader.params.get('prefer_insecure', False)
 633             else 'https:')
 634
 635     def _proto_relative_url(self, url, scheme=None):
 636         if url is None:
 637             return url
 638         if url.startswith('//'):
 639             if scheme is None:
 640                 scheme = self.http_scheme()
 641             return scheme + url
 642         else:
 643             return url
 644
 645     def _sleep(self, timeout, video_id, msg_template=None):
 646         if msg_template is None:
 647             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 648         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 649         self.to_screen(msg)
 650         time.sleep(timeout)
 651
 652     def _extract_f4m_formats(self, manifest_url, video_id):
 653         manifest = self._download_xml(
 654             manifest_url, video_id, 'Downloading f4m manifest',
 655             'Unable to download f4m manifest')
 656
 657         formats = []
 658         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 659         for i, media_el in enumerate(media_nodes):
 660             tbr = int_or_none(media_el.attrib.get('bitrate'))
 661             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 662             formats.append({
 663                 'format_id': format_id,
 664                 'url': manifest_url,
 665                 'ext': 'flv',
 666                 'tbr': tbr,
 667                 'width': int_or_none(media_el.attrib.get('width')),
 668                 'height': int_or_none(media_el.attrib.get('height')),
 669             })
 670         self._sort_formats(formats)
 671
 672         return formats
 673
 674     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 675                               entry_protocol='m3u8', preference=None):
 676
 677         formats = [{
 678             'format_id': 'm3u8-meta',
 679             'url': m3u8_url,
 680             'ext': ext,
 681             'protocol': 'm3u8',
 682             'preference': -1,
 683             'resolution': 'multiple',
 684             'format_note': 'Quality selection URL',
 685         }]
 686
 687         format_url = lambda u: (
 688             u
 689             if re.match(r'^https?://', u)
 690             else compat_urlparse.urljoin(m3u8_url, u))
 691
 692         m3u8_doc = self._download_webpage(
 693             m3u8_url, video_id,
 694             note='Downloading m3u8 information',
 695             errnote='Failed to download m3u8 information')
 696         last_info = None
 697         kv_rex = re.compile(
 698             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 699         for line in m3u8_doc.splitlines():
 700             if line.startswith('#EXT-X-STREAM-INF:'):
 701                 last_info = {}
 702                 for m in kv_rex.finditer(line):
 703                     v = m.group('val')
 704                     if v.startswith('"'):
 705                         v = v[1:-1]
 706                     last_info[m.group('key')] = v
 707             elif line.startswith('#') or not line.strip():
 708                 continue
 709             else:
 710                 if last_info is None:
 711                     formats.append({'url': format_url(line)})
 712                     continue
 713                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 714
 715                 f = {
 716                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 717                     'url': format_url(line.strip()),
 718                     'tbr': tbr,
 719                     'ext': ext,
 720                     'protocol': entry_protocol,
 721                     'preference': preference,
 722                 }
 723                 codecs = last_info.get('CODECS')
 724                 if codecs:
 725                     # TODO: looks like video codec is not always necessarily goes first
 726                     va_codecs = codecs.split(',')
 727                     if va_codecs[0]:
 728                         f['vcodec'] = va_codecs[0].partition('.')[0]
 729                     if len(va_codecs) > 1 and va_codecs[1]:
 730                         f['acodec'] = va_codecs[1].partition('.')[0]
 731                 resolution = last_info.get('RESOLUTION')
 732                 if resolution:
 733                     width_str, height_str = resolution.split('x')
 734                     f['width'] = int(width_str)
 735                     f['height'] = int(height_str)
 736                 formats.append(f)
 737                 last_info = {}
 738         self._sort_formats(formats)
 739         return formats
 740
 741     def _live_title(self, name):
 742         """ Generate the title for a live video """
 743         now = datetime.datetime.now()
 744         now_str = now.strftime("%Y-%m-%d %H:%M")
 745         return name + ' ' + now_str
 746
 747     def _int(self, v, name, fatal=False, **kwargs):
 748         res = int_or_none(v, **kwargs)
 749         if 'get_attr' in kwargs:
 750             print(getattr(v, kwargs['get_attr']))
 751         if res is None:
 752             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 753             if fatal:
 754                 raise ExtractorError(msg)
 755             else:
 756                 self._downloader.report_warning(msg)
 757         return res
 758
 759     def _float(self, v, name, fatal=False, **kwargs):
 760         res = float_or_none(v, **kwargs)
 761         if res is None:
 762             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 763             if fatal:
 764                 raise ExtractorError(msg)
 765             else:
 766                 self._downloader.report_warning(msg)
 767         return res
 768
 769
 770 class SearchInfoExtractor(InfoExtractor):
 771     """
 772     Base class for paged search queries extractors.
 773     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 774     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 775     """
 776
 777     @classmethod
 778     def _make_valid_url(cls):
 779         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 780
 781     @classmethod
 782     def suitable(cls, url):
 783         return re.match(cls._make_valid_url(), url) is not None
 784
 785     def _real_extract(self, query):
 786         mobj = re.match(self._make_valid_url(), query)
 787         if mobj is None:
 788             raise ExtractorError('Invalid search query "%s"' % query)
 789
 790         prefix = mobj.group('prefix')
 791         query = mobj.group('query')
 792         if prefix == '':
 793             return self._get_n_results(query, 1)
 794         elif prefix == 'all':
 795             return self._get_n_results(query, self._MAX_RESULTS)
 796         else:
 797             n = int(prefix)
 798             if n <= 0:
 799                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 800             elif n > self._MAX_RESULTS:
 801                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 802                 n = self._MAX_RESULTS
 803             return self._get_n_results(query, n)
 804
 805     def _get_n_results(self, query, n):
 806         """Get a specified number of results for a query"""
 807         raise NotImplementedError("This method must be implemented by subclasses")
 808
 809     @property
 810     def SEARCH_KEY(self):
 811         return self._SEARCH_KEY