1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
15 from ..compat import (
18 compat_urllib_parse_urlparse,
# Unique sentinel object: lets _search_regex and friends distinguish
# "caller supplied no default" from an explicit default of None.
_NO_DEFAULT = object()
35 class InfoExtractor(object):
36 """Information Extractor class.
38 Information extractors are the classes that, given a URL, extract
39 information about the video (or videos) the URL refers to. This
40 information includes the real video URL, the video title, author and
41 others. The information is stored in a dictionary which is then
42 passed to the FileDownloader. The FileDownloader processes this
43 information possibly downloading the video to the file system, among
44 other possible outcomes.
46 The dictionaries must include the following fields:
49 title: Video title, unescaped.
51 Additionally, it must contain either a formats entry or a url one:
53 formats: A list of dictionaries for each format available, ordered
54 from worst to best quality.
57 * url Mandatory. The URL of the video file
58 * ext Will be calculated from url if missing
59 * format A human-readable description of the format
60 ("mp4 container with h264/opus").
61 Calculated from the format_id, width, height.
62 and format_note fields if missing.
63 * format_id A short description of the format
64 ("mp4_h264_opus" or "19").
65 Technically optional, but strongly recommended.
66 * format_note Additional info about the format
67 ("3D" or "DASH video")
68 * width Width of the video, if known
69 * height Height of the video, if known
70 * resolution Textual description of width and height
71 * tbr Average bitrate of audio and video in KBit/s
72 * abr Average audio bitrate in KBit/s
73 * acodec Name of the audio codec in use
74 * asr Audio sampling rate in Hertz
75 * vbr Average video bitrate in KBit/s
77 * vcodec Name of the video codec in use
78 * container Name of the container format
79 * filesize The number of bytes, if known in advance
80 * filesize_approx An estimate for the number of bytes
81 * player_url SWF Player URL (used for rtmpdump).
82 * protocol The protocol that will be used for the actual
84 "http", "https", "rtsp", "rtmp", "m3u8" or so.
85 * preference Order number of this format. If this field is
86 present and not None, the formats get sorted
87 by this field, regardless of all other values.
88 -1 for default (order by other properties),
89 -2 or smaller for less than default.
90 * language_preference Is this in the correct requested
92 10 if it's what the URL is about,
93 -1 for default (don't know),
94 -10 otherwise, other values reserved for now.
95 * quality Order number of the video quality of this
96 format, irrespective of the file format.
97 -1 for default (order by other properties),
98 -2 or smaller for less than default.
99 * source_preference Order number for this video source
100 (quality takes higher priority)
101 -1 for default (order by other properties),
102 -2 or smaller for less than default.
103 * http_referer HTTP Referer header value to set.
104 * http_method HTTP method to use for the download.
105 * http_headers A dictionary of additional HTTP headers
106 to add to the request.
107 * http_post_data Additional data to send with a POST
109 url: Final video URL.
110 ext: Video filename extension.
111 format: The video format, defaults to ext (used for --get-format)
112 player_url: SWF Player URL (used for rtmpdump).
114 The following fields are optional:
116 display_id An alternative identifier for the video, not necessarily
117 unique, but available before title. Typically, id is
118 something like "4234987", title "Dancing naked mole rats",
119 and display_id "dancing-naked-mole-rats"
120 thumbnails: A list of dictionaries, with the following entries:
122 * "width" (optional, int)
123 * "height" (optional, int)
* "resolution" (optional, string "{width}x{height}",
126 thumbnail: Full URL to a video thumbnail image.
127 description: One-line video description.
128 uploader: Full name of the video uploader.
129 timestamp: UNIX timestamp of the moment the video became available.
130 upload_date: Video upload date (YYYYMMDD).
131 If not explicitly set, calculated from timestamp.
132 uploader_id: Nickname or id of the video uploader.
133 location: Physical location where the video was filmed.
134 subtitles: The subtitle file contents as a dictionary in the format
135 {language: subtitles}.
136 duration: Length of the video in seconds, as an integer.
137 view_count: How many users have watched the video on the platform.
138 like_count: Number of positive ratings of the video
139 dislike_count: Number of negative ratings of the video
140 comment_count: Number of comments on the video
141 age_limit: Age restriction for the video, as an integer (years)
142 webpage_url: The url to the video webpage, if given to youtube-dl it
143 should allow to get the same result again. (It will be set
144 by YoutubeDL if it's missing)
145 categories: A list of categories that the video falls in, for example
147 is_live: True, False, or None (=unknown). Whether this video is a
148 live stream that goes on instead of a fixed-length video.
150 Unless mentioned otherwise, the fields should be Unicode strings.
152 Unless mentioned otherwise, None is equivalent to absence of information.
154 Subclasses of this one should re-define the _real_initialize() and
155 _real_extract() methods and define a _VALID_URL regexp.
156 Probably, they should also be added to the list of extractors.
158 Finally, the _WORKING attribute should be set to False for broken IEs
159 in order to warn the users and skip the tests.
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # Register the downloader right away; it may be replaced later via
    # set_downloader().
    self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Deliberately look in cls.__dict__ rather than using
    # hasattr/getattr: a cached pattern on a superclass must not be
    # reused — each IE class compiles and caches its own _VALID_URL.
    try:
        matcher = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        matcher = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return matcher.match(url) is not None
def _match_id(cls, url):
    # Same per-class pattern caching trick as suitable().
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    m = cls._VALID_URL_RE.match(url)
    # NOTE(review): this listing is truncated — the extraction of the
    # id group from `m` is not visible here.

"""Getter method for _WORKING."""

def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # NOTE(review): truncated — the run-only-once guard around this
    # call is not visible here.
    self._real_initialize()

def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # All screen output and network access of this extractor is routed
    # through this object.
    self._downloader = downloader
210 def _real_initialize(self):
211 """Real initialization process. Redefine in subclasses."""
214 def _real_extract(self, url):
215 """Real extraction process. Redefine in subclasses."""
220 """A string for getting the InfoExtractor with get_info_extractor"""
221 return cls.__name__[:-2]
225 return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the response handle """
    # note=None selects the default progress message; note=False is
    # silent. NOTE(review): truncated — the `if note is None:` branch
    # and the enclosing `try:` are not visible in this listing.
    self.report_download_webpage(video_id)
    elif note is not False:
        self.to_screen('%s' % (note,))
        self.to_screen('%s: %s' % (video_id, note))
    # Delegate the actual network I/O to the downloader.
    return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # On network failure: raise when fatal, otherwise warn and
        # continue (the branch structure is not fully visible here).
        errnote = 'Unable to download webpage'
        errmsg = '%s: %s' % (errnote, compat_str(err))
        raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        self._downloader.report_warning(errmsg)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle) """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]
    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    # NOTE(review): truncated — the non-fatal failure path (urlh is
    # False) is not visible in this listing.
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
    return (content, urlh)
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
    # Decode the raw response body to text, trying to honour the
    # declared charset; also implements the dump_intermediate_pages and
    # write_pages debugging options.
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    # Charset from the Content-Type header, e.g. "text/html; charset=utf-8".
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    encoding = m.group(1)
    # Fallback: a <meta charset=...> declaration near the top of the
    # document (only the first 1024 bytes are scanned).
    m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                  webpage_bytes[:1024])
    encoding = m.group(1).decode('ascii')
    # UTF-16 little-endian BOM.
    elif webpage_bytes.startswith(b'\xff\xfe'):
    # NOTE(review): truncated — several guard branches (header-match
    # check, meta-match check, remaining BOM cases, the try: around
    # get_full_url) are not visible in this listing.
    if self._downloader.params.get('dump_intermediate_pages', False):
        url = url_or_request.get_full_url()
    except AttributeError:
        # url_or_request may be a plain string rather than a Request.
        self.to_screen('Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        url = url_or_request.get_full_url()
    except AttributeError:
        basen = '%s_%s' % (video_id, url)
        # Keep dump filenames short by replacing the tail of long names
        # with an md5 digest.
        h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
        basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        absfilepath = os.path.abspath(filename)
        if len(absfilepath) > 259:
            filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)
    # Decode with the detected charset, replacing undecodable bytes;
    # the UTF-8 fallback's except clause is not visible in this listing.
    content = webpage_bytes.decode(encoding, 'replace')
    content = webpage_bytes.decode('utf-8', 'replace')
    # Detect Websense-filtered replacement pages and raise a clear,
    # expected error instead of failing obscurely later.
    if ('<title>Access to this site is blocked</title>' in content and
            'Websense' in content[:512]):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the data of the page as a string """
    res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
    # NOTE(review): truncated — the unpacking of (content, urlh) and the
    # failure propagation are not visible in this listing.

def _download_xml(self, url_or_request, video_id,
                  note='Downloading XML', errnote='Unable to download XML',
                  transform_source=None, fatal=True):
    """Return the xml as an xml.etree.ElementTree.Element"""
    xml_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if xml_string is False:
    # NOTE(review): truncated — the early return for a failed download
    # and the `if transform_source:` guard are not visible here.
        # transform_source lets callers repair broken XML before parsing.
        xml_string = transform_source(xml_string)
    return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

def _download_json(self, url_or_request, video_id,
                   note='Downloading JSON metadata',
                   errnote='Unable to download JSON metadata',
                   transform_source=None,
    # NOTE(review): truncated — the end of this signature and parts of
    # the body (including the try: around json.loads) are not visible.
    json_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if (not fatal) and json_string is False:
        # transform_source may repair the payload before parsing.
        json_string = transform_source(json_string)
    return json.loads(json_string)
    except ValueError as ve:
        errmsg = '%s: Failed to parse JSON ' % video_id
        raise ExtractorError(errmsg, cause=ve)
        self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, tagged with the IE name."""
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen('Confirming age')

def report_login(self):
    """Report attempt to log in."""
    self.to_screen('Logging in')
# Methods for following issue #608
def url_result(url, ie=None, video_id=None):
    """Returns a url that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url',
    # NOTE(review): truncated — the 'url'/'ie_key' entries of this dict
    # and the final return are not visible in this listing.
    if video_id is not None:
        video_info['id'] = video_id

def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Returns a playlist"""
    video_info = {'_type': 'playlist',
    # NOTE(review): truncated — the 'entries' value, the guards before
    # these optional assignments, and the return are not visible here.
    video_info['id'] = playlist_id
    video_info['title'] = playlist_title
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    # NOTE(review): the docstring delimiters around the next four lines
    # are not visible in this truncated listing.
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    # A single pattern (string or precompiled) is searched directly;
    # the loop over a pattern list is not fully visible here.
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
        mobj = re.search(p, string, flags)
    # Highlight the field name in blue on capable (non-Windows) ttys.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    # return the first matching group
    return next(g for g in mobj.groups() if g is not None)
    return mobj.group(group)
    elif default is not _NO_DEFAULT:
    # fatal failures raise; non-fatal ones only warn and return None
    # (branch structure not fully visible in this listing).
    raise RegexNotFoundError('Unable to extract %s' % _name)
    self._downloader.report_warning('unable to extract %s; '
        'please report this issue on http://yt-dl.org/bug' % _name)

def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    # NOTE(review): the docstring quotes around the next line are not
    # visible in this listing.
    Like _search_regex, but strips HTML tags and unescapes entities.
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    return clean_html(res).strip()
def _get_login_info(self):
    # NOTE(review): the docstring delimiters are not visible in this
    # truncated listing.
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)
    if self._downloader is None:
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        # Credentials from the user's ~/.netrc entry for _NETRC_MACHINE;
        # parse failures only warn, they do not abort extraction.
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
    except (IOError, netrc.NetrcParseError) as err:
        self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
    return (username, password)

def _get_tfa_info(self):
    # NOTE(review): the docstring delimiters are not visible in this
    # truncated listing.
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    currently just uses the command line option
    If there's no info available, return None
    if self._downloader is None:
    downloader_params = self._downloader.params
    if downloader_params.get('twofactor', None) is not None:
        return downloader_params['twofactor']
503 # Helper functions for extracting OpenGraph info
def _og_regexes(prop):
    # Build regexes matching an OpenGraph <meta> tag for `prop`; the
    # content attribute may appear on either side of the property.
    content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
    property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
    template = r'<meta[^>]+?%s[^>]+?%s'
    # NOTE(review): truncated — the `return [` / `]` wrapping these two
    # entries is not visible in this listing.
    template % (property_re, content_re),
    template % (content_re, property_re),

def _og_search_property(self, prop, html, name=None, **kargs):
    # NOTE(review): truncated — the `if name is None:` guard around this
    # default assignment is not visible here.
    name = 'OpenGraph %s' % prop
    escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
    return unescapeHTML(escaped)
522 def _og_search_thumbnail(self, html, **kargs):
523 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
525 def _og_search_description(self, html, **kargs):
526 return self._og_search_property('description', html, fatal=False, **kargs)
528 def _og_search_title(self, html, **kargs):
529 return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    regexes = self._og_regexes('video') + self._og_regexes('video:url')
    # NOTE(review): truncated — the `if secure:` guard around this line
    # is not visible; secure_url patterns appear to be prepended so they
    # are preferred when present.
    regexes = self._og_regexes('video:secure_url') + regexes
    return self._html_search_regex(regexes, html, name, **kargs)
537 def _og_search_url(self, html, **kargs):
538 return self._og_search_property('url', html, **kargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    if display_name is None:
    # NOTE(review): truncated — the display_name default assignment and
    # the opening of the raw regex string literal are not visible here.
    return self._html_search_regex(
        (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
        [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
        html, display_name, fatal=fatal, group='content', **kwargs)
549 def _dc_search_uploader(self, html):
550 return self._html_search_meta('dc.creator', html, 'uploader')
def _rta_search(self, html):
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    # (?ix) = case-insensitive, verbose pattern.
    if re.search(r'(?ix)<meta\s+name="rating"\s+'
                 r' content="RTA-5042-1996-1400-1577-RTA"',
    # NOTE(review): truncated — the searched argument and the return
    # statements are not visible in this listing.

def _media_rating_search(self, html):
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)
    # NOTE(review): truncated — the RATING_TABLE mapping defined between
    # these lines is not visible; an unknown rating maps to None.
    return RATING_TABLE.get(rating.lower(), None)
576 def _twitter_search_player(self, html):
577 return self._html_search_meta('twitter:player', html,
578 'twitter card player')
def _sort_formats(self, formats):
    # Sort `formats` in place from worst to best using a composite key.
    # NOTE(review): this listing is truncated — the empty-list guard
    # around this raise and the inner `def _formats_key(f):` that the
    # following lines belong to are not visible here.
    raise ExtractorError('No video formats found')

    # TODO remove the following workaround
    from ..utils import determine_ext
    if not f.get('ext') and 'url' in f:
        f['ext'] = determine_ext(f['url'])

    preference = f.get('preference')
    if preference is None:
        # Derive a preference from the protocol: plain HTTP(S) is
        # slightly preferred over everything else.
        proto = f.get('protocol')
        proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
        preference = 0 if proto in ['http', 'https'] else -0.1
        if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported

    if f.get('vcodec') == 'none':  # audio only
        if self._downloader.params.get('prefer_free_formats'):
            ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
        audio_ext_preference = ORDER.index(f['ext'])
        audio_ext_preference = -1
        if self._downloader.params.get('prefer_free_formats'):
            ORDER = ['flv', 'mp4', 'webm']
            ORDER = ['webm', 'flv', 'mp4']
        ext_preference = ORDER.index(f['ext'])
        audio_ext_preference = 0

    # Composite sort key entries: missing numeric fields default to -1
    # so any known value wins over an unknown one.
    f.get('language_preference') if f.get('language_preference') is not None else -1,
    f.get('quality') if f.get('quality') is not None else -1,
    f.get('height') if f.get('height') is not None else -1,
    f.get('width') if f.get('width') is not None else -1,
    f.get('tbr') if f.get('tbr') is not None else -1,
    f.get('vbr') if f.get('vbr') is not None else -1,
    f.get('abr') if f.get('abr') is not None else -1,
    audio_ext_preference,
    f.get('fps') if f.get('fps') is not None else -1,
    f.get('filesize') if f.get('filesize') is not None else -1,
    f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
    f.get('source_preference') if f.get('source_preference') is not None else -1,
    formats.sort(key=_formats_key)
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    # NOTE(review): truncated — only the prefer_insecure condition of
    # the conditional expression is visible in this listing.
    if self._downloader.params.get('prefer_insecure', False)

def _proto_relative_url(self, url, scheme=None):
    # Turn a protocol-relative URL ('//host/path') into an absolute one.
    if url.startswith('//'):
        scheme = self.http_scheme()
    # NOTE(review): truncated — the None/scheme handling and the return
    # statements are not visible in this listing.

def _sleep(self, timeout, video_id, msg_template=None):
    # Announce a wait to the user; %(video_id)s and %(timeout)s are
    # interpolated into msg_template.
    if msg_template is None:
        msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
    msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    # NOTE(review): truncated — the screen output and the actual sleep
    # call are not visible in this listing.
def _extract_f4m_formats(self, manifest_url, video_id):
    # Parse an Adobe HDS (f4m) manifest into format dicts.
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        # Prefer the bitrate for the id; fall back to the node index.
        format_id = 'f4m-%d' % (i if tbr is None else tbr)
        # NOTE(review): truncated — the formats-list setup and the
        # append() wrapping this dict literal are not visible here.
        'format_id': format_id,
        'width': int_or_none(media_el.attrib.get('width')),
        'height': int_or_none(media_el.attrib.get('height')),
    self._sort_formats(formats)
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None):
    # Parse an HLS master playlist into format dicts.
    # NOTE(review): this listing is truncated — the meta-format entry
    # below is missing its surrounding list/dict construction.
    'format_id': 'm3u8-meta',
    'resolution': 'multiple',
    'format_note': 'Quality selection URL',
    # Resolve playlist-relative entries against the master URL.
    format_url = lambda u: (
        if re.match(r'^https?://', u)
        else compat_urlparse.urljoin(m3u8_url, u))
    m3u8_doc = self._download_webpage(
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')
    # Attribute-list parser for #EXT-X-STREAM-INF lines.
    r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            for m in kv_rex.finditer(line):
                if v.startswith('"'):
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
        if last_info is None:
            # URL line with no preceding STREAM-INF metadata.
            formats.append({'url': format_url(line)})
        # BANDWIDTH is bits/s in the playlist; tbr is stored in KBit/s.
        tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
        'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
        'url': format_url(line.strip()),
        'protocol': entry_protocol,
        'preference': preference,
        codecs = last_info.get('CODECS')
        # TODO: it looks like the video codec does not always come first
        va_codecs = codecs.split(',')
        f['vcodec'] = va_codecs[0].partition('.')[0]
        if len(va_codecs) > 1 and va_codecs[1]:
            f['acodec'] = va_codecs[1].partition('.')[0]
        resolution = last_info.get('RESOLUTION')
        width_str, height_str = resolution.split('x')
        f['width'] = int(width_str)
        f['height'] = int(height_str)
    self._sort_formats(formats)
753 def _live_title(self, name):
754 """ Generate the title for a live video """
755 now = datetime.datetime.now()
756 now_str = now.strftime("%Y-%m-%d %H:%M")
757 return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
    # Coerce v to an int via int_or_none; on parse failure either raise
    # (fatal) or warn.
    res = int_or_none(v, **kwargs)
    if 'get_attr' in kwargs:
        # NOTE(review): stray debug print to stdout — looks like a
        # leftover; consider removing.
        print(getattr(v, kwargs['get_attr']))
    # NOTE(review): truncated — the `res is None` guard, the else branch
    # and the final return are not visible in this listing.
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    raise ExtractorError(msg)
    self._downloader.report_warning(msg)

def _float(self, v, name, fatal=False, **kwargs):
    # Same contract as _int, but for floats.
    res = float_or_none(v, **kwargs)
    # NOTE(review): truncated — the guard, else branch and return are
    # not visible in this listing.
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    raise ExtractorError(msg)
    self._downloader.report_warning(msg)
782 class SearchInfoExtractor(InfoExtractor):
784 Base class for paged search queries extractors.
785 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
786 Instances should define _SEARCH_KEY and _MAX_RESULTS.
790 def _make_valid_url(cls):
791 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """A URL is suitable when it matches the generated search pattern."""
    pattern = cls._make_valid_url()
    return re.match(pattern, url) is not None
def _real_extract(self, query):
    # Parse "<key><N|all>:<terms>" queries and dispatch to
    # _get_n_results with the requested result count.
    mobj = re.match(self._make_valid_url(), query)
    # NOTE(review): truncated — the mobj-is-None guard around this raise
    # is not visible in this listing.
    raise ExtractorError('Invalid search query "%s"' % query)

    prefix = mobj.group('prefix')
    query = mobj.group('query')
    # An empty prefix means a single result.
    return self._get_n_results(query, 1)
    elif prefix == 'all':
        return self._get_n_results(query, self._MAX_RESULTS)
    # Numeric prefix: validate the requested count against the limits
    # (the parsing of `n` is not visible in this listing).
    raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
    elif n > self._MAX_RESULTS:
        self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
        n = self._MAX_RESULTS
    return self._get_n_results(query, n)
817 def _get_n_results(self, query, n):
818 """Get a specified number of results for a query"""
819 raise NotImplementedError("This method must be implemented by subclasses")
def SEARCH_KEY(self):
    """Expose the search key used in query URLs (mirrors _SEARCH_KEY)."""
    return self._SEARCH_KEY