_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_http_client,
  17     compat_urllib_error,
  18     compat_urllib_parse_urlparse,
  19     compat_urlparse,
  20     compat_str,
  21 )
  22 from ..utils import (
  23     clean_html,
  24     compiled_regex_type,
  25     ExtractorError,
  26     float_or_none,
  27     int_or_none,
  28     RegexNotFoundError,
  29     sanitize_filename,
  30     unescapeHTML,
  31 )
# Sentinel for "no default supplied": lets _search_regex tell an omitted
# default apart from an explicit default of None.
_NO_DEFAULT = object()
  33
  34
  35 class InfoExtractor(object):
  36     """Information Extractor class.
  37
  38     Information extractors are the classes that, given a URL, extract
  39     information about the video (or videos) the URL refers to. This
  40     information includes the real video URL, the video title, author and
  41     others. The information is stored in a dictionary which is then
  42     passed to the FileDownloader. The FileDownloader processes this
  43     information possibly downloading the video to the file system, among
  44     other possible outcomes.
  45
  46     The type field determines the type of the result.
  47     By far the most common value (and the default if _type is missing) is
  48     "video", which indicates a single video.
  49
  50     For a video, the dictionaries must include the following fields:
  51
  52     id:             Video identifier.
  53     title:          Video title, unescaped.
  54
  55     Additionally, it must contain either a formats entry or a url one:
  56
  57     formats:        A list of dictionaries for each format available, ordered
  58                     from worst to best quality.
  59
  60                     Potential fields:
  61                     * url        Mandatory. The URL of the video file
  62                     * ext        Will be calculated from url if missing
  63                     * format     A human-readable description of the format
  64                                  ("mp4 container with h264/opus").
  65                                  Calculated from the format_id, width, height.
  66                                  and format_note fields if missing.
  67                     * format_id  A short description of the format
  68                                  ("mp4_h264_opus" or "19").
  69                                 Technically optional, but strongly recommended.
  70                     * format_note Additional info about the format
  71                                  ("3D" or "DASH video")
  72                     * width      Width of the video, if known
  73                     * height     Height of the video, if known
  74                     * resolution Textual description of width and height
  75                     * tbr        Average bitrate of audio and video in KBit/s
  76                     * abr        Average audio bitrate in KBit/s
  77                     * acodec     Name of the audio codec in use
  78                     * asr        Audio sampling rate in Hertz
  79                     * vbr        Average video bitrate in KBit/s
  80                     * fps        Frame rate
  81                     * vcodec     Name of the video codec in use
  82                     * container  Name of the container format
  83                     * filesize   The number of bytes, if known in advance
  84                     * filesize_approx  An estimate for the number of bytes
  85                     * player_url SWF Player URL (used for rtmpdump).
  86                     * protocol   The protocol that will be used for the actual
  87                                  download, lower-case.
  88                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  89                     * preference Order number of this format. If this field is
  90                                  present and not None, the formats get sorted
  91                                  by this field, regardless of all other values.
  92                                  -1 for default (order by other properties),
  93                                  -2 or smaller for less than default.
  94                     * language_preference  Is this in the correct requested
  95                                  language?
  96                                  10 if it's what the URL is about,
  97                                  -1 for default (don't know),
  98                                  -10 otherwise, other values reserved for now.
  99                     * quality    Order number of the video quality of this
 100                                  format, irrespective of the file format.
 101                                  -1 for default (order by other properties),
 102                                  -2 or smaller for less than default.
 103                     * source_preference  Order number for this video source
 104                                   (quality takes higher priority)
 105                                  -1 for default (order by other properties),
 106                                  -2 or smaller for less than default.
 107                     * http_referer  HTTP Referer header value to set.
 108                     * http_method  HTTP method to use for the download.
 109                     * http_headers  A dictionary of additional HTTP headers
 110                                  to add to the request.
 111                     * http_post_data  Additional data to send with a POST
 112                                  request.
 113     url:            Final video URL.
 114     ext:            Video filename extension.
 115     format:         The video format, defaults to ext (used for --get-format)
 116     player_url:     SWF Player URL (used for rtmpdump).
 117
 118     The following fields are optional:
 119
 120     display_id      An alternative identifier for the video, not necessarily
 121                     unique, but available before title. Typically, id is
 122                     something like "4234987", title "Dancing naked mole rats",
 123                     and display_id "dancing-naked-mole-rats"
 124     thumbnails:     A list of dictionaries, with the following entries:
 125                         * "url"
 126                         * "width" (optional, int)
 127                         * "height" (optional, int)
 128                         * "resolution" (optional, string "{width}x{height}",
 129                                         deprecated)
 130     thumbnail:      Full URL to a video thumbnail image.
 131     description:    One-line video description.
 132     uploader:       Full name of the video uploader.
 133     timestamp:      UNIX timestamp of the moment the video became available.
 134     upload_date:    Video upload date (YYYYMMDD).
 135                     If not explicitly set, calculated from timestamp.
 136     uploader_id:    Nickname or id of the video uploader.
 137     location:       Physical location where the video was filmed.
 138     subtitles:      The subtitle file contents as a dictionary in the format
 139                     {language: subtitles}.
 140     duration:       Length of the video in seconds, as an integer.
 141     view_count:     How many users have watched the video on the platform.
 142     like_count:     Number of positive ratings of the video
 143     dislike_count:  Number of negative ratings of the video
 144     comment_count:  Number of comments on the video
 145     age_limit:      Age restriction for the video, as an integer (years)
 146     webpage_url:    The url to the video webpage, if given to youtube-dl it
 147                     should allow to get the same result again. (It will be set
 148                     by YoutubeDL if it's missing)
 149     categories:     A list of categories that the video falls in, for example
 150                     ["Sports", "Berlin"]
 151     is_live:        True, False, or None (=unknown). Whether this video is a
 152                     live stream that goes on instead of a fixed-length video.
 153
 154     Unless mentioned otherwise, the fields should be Unicode strings.
 155
 156     Unless mentioned otherwise, None is equivalent to absence of information.
 157
 158
 159     _type "playlist" indicates multiple videos.
 160     There must be a key "entries", which is a list or a PagedList object, each
 161     element of which is a valid dictionary under this specification.
 162
 163     Additionally, playlists can have "title" and "id" attributes with the same
 164     semantics as videos (see above).
 165
 166
 167     _type "multi_video" indicates that there are multiple videos that
 168     form a single show, for example multiple acts of an opera or TV episode.
 169     It must have an entries key like a playlist and contain all the keys
 170     required for a video at the same time.
 171
 172
 173     _type "url" indicates that the video must be extracted from another
 174     location, possibly by a different extractor. Its only required key is:
 175     "url" - the next URL to extract.
 176
 177     Additionally, it may have properties believed to be identical to the
 178     resolved entity, for example "title" if the title of the referred video is
 179     known ahead of time.
 180
 181
 182     _type "url_transparent" entities have the same specification as "url", but
 183     indicate that the given additional information is more precise than the one
 184     associated with the resolved URL.
 185     This is useful when a site employs a video service that hosts the video and
 186     its technical metadata, but that video service does not embed a useful
 187     title, description etc.
 188
 189
 190     Subclasses of this one should re-define the _real_initialize() and
 191     _real_extract() methods and define a _VALID_URL regexp.
 192     Probably, they should also be added to the list of extractors.
 193
 194     Finally, the _WORKING attribute should be set to False for broken IEs
 195     in order to warn the users and skip the tests.
 196     """
 197
 198     _ready = False
 199     _downloader = None
 200     _WORKING = True
 201
 202     def __init__(self, downloader=None):
 203         """Constructor. Receives an optional downloader."""
 204         self._ready = False
 205         self.set_downloader(downloader)
 206
 207     @classmethod
 208     def suitable(cls, url):
 209         """Receives a URL and returns True if suitable for this IE."""
 210
 211         # This does not use has/getattr intentionally - we want to know whether
 212         # we have cached the regexp for *this* class, whereas getattr would also
 213         # match the superclass
 214         if '_VALID_URL_RE' not in cls.__dict__:
 215             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 216         return cls._VALID_URL_RE.match(url) is not None
 217
 218     @classmethod
 219     def _match_id(cls, url):
 220         if '_VALID_URL_RE' not in cls.__dict__:
 221             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 222         m = cls._VALID_URL_RE.match(url)
 223         assert m
 224         return m.group('id')
 225
 226     @classmethod
 227     def working(cls):
 228         """Getter method for _WORKING."""
 229         return cls._WORKING
 230
 231     def initialize(self):
 232         """Initializes an instance (authentication, etc)."""
 233         if not self._ready:
 234             self._real_initialize()
 235             self._ready = True
 236
 237     def extract(self, url):
 238         """Extracts URL information and returns it in list of dicts."""
 239         self.initialize()
 240         return self._real_extract(url)
 241
 242     def set_downloader(self, downloader):
 243         """Sets the downloader for this IE."""
 244         self._downloader = downloader
 245
 246     def _real_initialize(self):
 247         """Real initialization process. Redefine in subclasses."""
 248         pass
 249
 250     def _real_extract(self, url):
 251         """Real extraction process. Redefine in subclasses."""
 252         pass
 253
 254     @classmethod
 255     def ie_key(cls):
 256         """A string for getting the InfoExtractor with get_info_extractor"""
 257         return cls.__name__[:-2]
 258
 259     @property
 260     def IE_NAME(self):
 261         return type(self).__name__[:-2]
 262
 263     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 264         """ Returns the response handle """
 265         if note is None:
 266             self.report_download_webpage(video_id)
 267         elif note is not False:
 268             if video_id is None:
 269                 self.to_screen('%s' % (note,))
 270             else:
 271                 self.to_screen('%s: %s' % (video_id, note))
 272         try:
 273             return self._downloader.urlopen(url_or_request)
 274         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 275             if errnote is False:
 276                 return False
 277             if errnote is None:
 278                 errnote = 'Unable to download webpage'
 279             errmsg = '%s: %s' % (errnote, compat_str(err))
 280             if fatal:
 281                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 282             else:
 283                 self._downloader.report_warning(errmsg)
 284                 return False
 285
 286     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 287         """ Returns a tuple (page content as string, URL handle) """
 288         # Strip hashes from the URL (#1038)
 289         if isinstance(url_or_request, (compat_str, str)):
 290             url_or_request = url_or_request.partition('#')[0]
 291
 292         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 293         if urlh is False:
 294             assert not fatal
 295             return False
 296         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 297         return (content, urlh)
 298
 299     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
 300         content_type = urlh.headers.get('Content-Type', '')
 301         webpage_bytes = urlh.read()
 302         if prefix is not None:
 303             webpage_bytes = prefix + webpage_bytes
 304         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 305         if m:
 306             encoding = m.group(1)
 307         else:
 308             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 309                           webpage_bytes[:1024])
 310             if m:
 311                 encoding = m.group(1).decode('ascii')
 312             elif webpage_bytes.startswith(b'\xff\xfe'):
 313                 encoding = 'utf-16'
 314             else:
 315                 encoding = 'utf-8'
 316         if self._downloader.params.get('dump_intermediate_pages', False):
 317             try:
 318                 url = url_or_request.get_full_url()
 319             except AttributeError:
 320                 url = url_or_request
 321             self.to_screen('Dumping request to ' + url)
 322             dump = base64.b64encode(webpage_bytes).decode('ascii')
 323             self._downloader.to_screen(dump)
 324         if self._downloader.params.get('write_pages', False):
 325             try:
 326                 url = url_or_request.get_full_url()
 327             except AttributeError:
 328                 url = url_or_request
 329             basen = '%s_%s' % (video_id, url)
 330             if len(basen) > 240:
 331                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 332                 basen = basen[:240 - len(h)] + h
 333             raw_filename = basen + '.dump'
 334             filename = sanitize_filename(raw_filename, restricted=True)
 335             self.to_screen('Saving request to ' + filename)
 336             # Working around MAX_PATH limitation on Windows (see
 337             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 338             if os.name == 'nt':
 339                 absfilepath = os.path.abspath(filename)
 340                 if len(absfilepath) > 259:
 341                     filename = '\\\\?\\' + absfilepath
 342             with open(filename, 'wb') as outf:
 343                 outf.write(webpage_bytes)
 344
 345         try:
 346             content = webpage_bytes.decode(encoding, 'replace')
 347         except LookupError:
 348             content = webpage_bytes.decode('utf-8', 'replace')
 349
 350         if ('<title>Access to this site is blocked</title>' in content and
 351                 'Websense' in content[:512]):
 352             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 353             blocked_iframe = self._html_search_regex(
 354                 r'<iframe src="([^"]+)"', content,
 355                 'Websense information URL', default=None)
 356             if blocked_iframe:
 357                 msg += ' Visit %s for more details' % blocked_iframe
 358             raise ExtractorError(msg, expected=True)
 359
 360         return content
 361
 362     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 363         """ Returns the data of the page as a string """
 364         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 365         if res is False:
 366             return res
 367         else:
 368             content, _ = res
 369             return content
 370
 371     def _download_xml(self, url_or_request, video_id,
 372                       note='Downloading XML', errnote='Unable to download XML',
 373                       transform_source=None, fatal=True):
 374         """Return the xml as an xml.etree.ElementTree.Element"""
 375         xml_string = self._download_webpage(
 376             url_or_request, video_id, note, errnote, fatal=fatal)
 377         if xml_string is False:
 378             return xml_string
 379         if transform_source:
 380             xml_string = transform_source(xml_string)
 381         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 382
 383     def _download_json(self, url_or_request, video_id,
 384                        note='Downloading JSON metadata',
 385                        errnote='Unable to download JSON metadata',
 386                        transform_source=None,
 387                        fatal=True):
 388         json_string = self._download_webpage(
 389             url_or_request, video_id, note, errnote, fatal=fatal)
 390         if (not fatal) and json_string is False:
 391             return None
 392         if transform_source:
 393             json_string = transform_source(json_string)
 394         try:
 395             return json.loads(json_string)
 396         except ValueError as ve:
 397             errmsg = '%s: Failed to parse JSON ' % video_id
 398             if fatal:
 399                 raise ExtractorError(errmsg, cause=ve)
 400             else:
 401                 self.report_warning(errmsg + str(ve))
 402
 403     def report_warning(self, msg, video_id=None):
 404         idstr = '' if video_id is None else '%s: ' % video_id
 405         self._downloader.report_warning(
 406             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 407
 408     def to_screen(self, msg):
 409         """Print msg to screen, prefixing it with '[ie_name]'"""
 410         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 411
 412     def report_extraction(self, id_or_name):
 413         """Report information extraction."""
 414         self.to_screen('%s: Extracting information' % id_or_name)
 415
 416     def report_download_webpage(self, video_id):
 417         """Report webpage download."""
 418         self.to_screen('%s: Downloading webpage' % video_id)
 419
 420     def report_age_confirmation(self):
 421         """Report attempt to confirm age."""
 422         self.to_screen('Confirming age')
 423
 424     def report_login(self):
 425         """Report attempt to log in."""
 426         self.to_screen('Logging in')
 427
 428     # Methods for following #608
 429     @staticmethod
 430     def url_result(url, ie=None, video_id=None):
 431         """Returns a url that points to a page that should be processed"""
 432         # TODO: ie should be the class used for getting the info
 433         video_info = {'_type': 'url',
 434                       'url': url,
 435                       'ie_key': ie}
 436         if video_id is not None:
 437             video_info['id'] = video_id
 438         return video_info
 439
 440     @staticmethod
 441     def playlist_result(entries, playlist_id=None, playlist_title=None):
 442         """Returns a playlist"""
 443         video_info = {'_type': 'playlist',
 444                       'entries': entries}
 445         if playlist_id:
 446             video_info['id'] = playlist_id
 447         if playlist_title:
 448             video_info['title'] = playlist_title
 449         return video_info
 450
 451     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 452         """
 453         Perform a regex search on the given string, using a single or a list of
 454         patterns returning the first matching group.
 455         In case of failure return a default value or raise a WARNING or a
 456         RegexNotFoundError, depending on fatal, specifying the field name.
 457         """
 458         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 459             mobj = re.search(pattern, string, flags)
 460         else:
 461             for p in pattern:
 462                 mobj = re.search(p, string, flags)
 463                 if mobj:
 464                     break
 465
 466         if os.name != 'nt' and sys.stderr.isatty():
 467             _name = '\033[0;34m%s\033[0m' % name
 468         else:
 469             _name = name
 470
 471         if mobj:
 472             if group is None:
 473                 # return the first matching group
 474                 return next(g for g in mobj.groups() if g is not None)
 475             else:
 476                 return mobj.group(group)
 477         elif default is not _NO_DEFAULT:
 478             return default
 479         elif fatal:
 480             raise RegexNotFoundError('Unable to extract %s' % _name)
 481         else:
 482             self._downloader.report_warning('unable to extract %s; '
 483                                             'please report this issue on http://yt-dl.org/bug' % _name)
 484             return None
 485
 486     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 487         """
 488         Like _search_regex, but strips HTML tags and unescapes entities.
 489         """
 490         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 491         if res:
 492             return clean_html(res).strip()
 493         else:
 494             return res
 495
 496     def _get_login_info(self):
 497         """
 498         Get the the login info as (username, password)
 499         It will look in the netrc file using the _NETRC_MACHINE value
 500         If there's no info available, return (None, None)
 501         """
 502         if self._downloader is None:
 503             return (None, None)
 504
 505         username = None
 506         password = None
 507         downloader_params = self._downloader.params
 508
 509         # Attempt to use provided username and password or .netrc data
 510         if downloader_params.get('username', None) is not None:
 511             username = downloader_params['username']
 512             password = downloader_params['password']
 513         elif downloader_params.get('usenetrc', False):
 514             try:
 515                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 516                 if info is not None:
 517                     username = info[0]
 518                     password = info[2]
 519                 else:
 520                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 521             except (IOError, netrc.NetrcParseError) as err:
 522                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 523
 524         return (username, password)
 525
 526     def _get_tfa_info(self):
 527         """
 528         Get the two-factor authentication info
 529         TODO - asking the user will be required for sms/phone verify
 530         currently just uses the command line option
 531         If there's no info available, return None
 532         """
 533         if self._downloader is None:
 534             return None
 535         downloader_params = self._downloader.params
 536
 537         if downloader_params.get('twofactor', None) is not None:
 538             return downloader_params['twofactor']
 539
 540         return None
 541
 542     # Helper functions for extracting OpenGraph info
 543     @staticmethod
 544     def _og_regexes(prop):
 545         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 546         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 547         template = r'<meta[^>]+?%s[^>]+?%s'
 548         return [
 549             template % (property_re, content_re),
 550             template % (content_re, property_re),
 551         ]
 552
 553     def _og_search_property(self, prop, html, name=None, **kargs):
 554         if name is None:
 555             name = 'OpenGraph %s' % prop
 556         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 557         if escaped is None:
 558             return None
 559         return unescapeHTML(escaped)
 560
 561     def _og_search_thumbnail(self, html, **kargs):
 562         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 563
 564     def _og_search_description(self, html, **kargs):
 565         return self._og_search_property('description', html, fatal=False, **kargs)
 566
 567     def _og_search_title(self, html, **kargs):
 568         return self._og_search_property('title', html, **kargs)
 569
 570     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 571         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 572         if secure:
 573             regexes = self._og_regexes('video:secure_url') + regexes
 574         return self._html_search_regex(regexes, html, name, **kargs)
 575
 576     def _og_search_url(self, html, **kargs):
 577         return self._og_search_property('url', html, **kargs)
 578
 579     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 580         if display_name is None:
 581             display_name = name
 582         return self._html_search_regex(
 583             r'''(?ix)<meta
 584                     (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
 585                     [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
 586             html, display_name, fatal=fatal, group='content', **kwargs)
 587
 588     def _dc_search_uploader(self, html):
 589         return self._html_search_meta('dc.creator', html, 'uploader')
 590
 591     def _rta_search(self, html):
 592         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 593         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 594                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 595                      html):
 596             return 18
 597         return 0
 598
 599     def _media_rating_search(self, html):
 600         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 601         rating = self._html_search_meta('rating', html)
 602
 603         if not rating:
 604             return None
 605
 606         RATING_TABLE = {
 607             'safe for kids': 0,
 608             'general': 8,
 609             '14 years': 14,
 610             'mature': 17,
 611             'restricted': 19,
 612         }
 613         return RATING_TABLE.get(rating.lower(), None)
 614
 615     def _twitter_search_player(self, html):
 616         return self._html_search_meta('twitter:player', html,
 617                                       'twitter card player')
 618
 619     def _sort_formats(self, formats):
 620         if not formats:
 621             raise ExtractorError('No video formats found')
 622
 623         def _formats_key(f):
 624             # TODO remove the following workaround
 625             from ..utils import determine_ext
 626             if not f.get('ext') and 'url' in f:
 627                 f['ext'] = determine_ext(f['url'])
 628
 629             preference = f.get('preference')
 630             if preference is None:
 631                 proto = f.get('protocol')
 632                 if proto is None:
 633                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 634
 635                 preference = 0 if proto in ['http', 'https'] else -0.1
 636                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 637                     preference -= 0.5
 638
 639             if f.get('vcodec') == 'none':  # audio only
 640                 if self._downloader.params.get('prefer_free_formats'):
 641                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 642                 else:
 643                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 644                 ext_preference = 0
 645                 try:
 646                     audio_ext_preference = ORDER.index(f['ext'])
 647                 except ValueError:
 648                     audio_ext_preference = -1
 649             else:
 650                 if self._downloader.params.get('prefer_free_formats'):
 651                     ORDER = ['flv', 'mp4', 'webm']
 652                 else:
 653                     ORDER = ['webm', 'flv', 'mp4']
 654                 try:
 655                     ext_preference = ORDER.index(f['ext'])
 656                 except ValueError:
 657                     ext_preference = -1
 658                 audio_ext_preference = 0
 659
 660             return (
 661                 preference,
 662                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 663                 f.get('quality') if f.get('quality') is not None else -1,
 664                 f.get('height') if f.get('height') is not None else -1,
 665                 f.get('width') if f.get('width') is not None else -1,
 666                 ext_preference,
 667                 f.get('tbr') if f.get('tbr') is not None else -1,
 668                 f.get('vbr') if f.get('vbr') is not None else -1,
 669                 f.get('abr') if f.get('abr') is not None else -1,
 670                 audio_ext_preference,
 671                 f.get('fps') if f.get('fps') is not None else -1,
 672                 f.get('filesize') if f.get('filesize') is not None else -1,
 673                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 674                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 675                 f.get('format_id'),
 676             )
 677         formats.sort(key=_formats_key)
 678
 679     def http_scheme(self):
 680         """ Either "http:" or "https:", depending on the user's preferences """
 681         return (
 682             'http:'
 683             if self._downloader.params.get('prefer_insecure', False)
 684             else 'https:')
 685
 686     def _proto_relative_url(self, url, scheme=None):
 687         if url is None:
 688             return url
 689         if url.startswith('//'):
 690             if scheme is None:
 691                 scheme = self.http_scheme()
 692             return scheme + url
 693         else:
 694             return url
 695
 696     def _sleep(self, timeout, video_id, msg_template=None):
 697         if msg_template is None:
 698             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 699         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 700         self.to_screen(msg)
 701         time.sleep(timeout)
 702
 703     def _extract_f4m_formats(self, manifest_url, video_id):
 704         manifest = self._download_xml(
 705             manifest_url, video_id, 'Downloading f4m manifest',
 706             'Unable to download f4m manifest')
 707
 708         formats = []
 709         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 710         for i, media_el in enumerate(media_nodes):
 711             tbr = int_or_none(media_el.attrib.get('bitrate'))
 712             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 713             formats.append({
 714                 'format_id': format_id,
 715                 'url': manifest_url,
 716                 'ext': 'flv',
 717                 'tbr': tbr,
 718                 'width': int_or_none(media_el.attrib.get('width')),
 719                 'height': int_or_none(media_el.attrib.get('height')),
 720             })
 721         self._sort_formats(formats)
 722
 723         return formats
 724
 725     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 726                               entry_protocol='m3u8', preference=None):
 727
 728         formats = [{
 729             'format_id': 'm3u8-meta',
 730             'url': m3u8_url,
 731             'ext': ext,
 732             'protocol': 'm3u8',
 733             'preference': -1,
 734             'resolution': 'multiple',
 735             'format_note': 'Quality selection URL',
 736         }]
 737
 738         format_url = lambda u: (
 739             u
 740             if re.match(r'^https?://', u)
 741             else compat_urlparse.urljoin(m3u8_url, u))
 742
 743         m3u8_doc = self._download_webpage(
 744             m3u8_url, video_id,
 745             note='Downloading m3u8 information',
 746             errnote='Failed to download m3u8 information')
 747         last_info = None
 748         kv_rex = re.compile(
 749             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 750         for line in m3u8_doc.splitlines():
 751             if line.startswith('#EXT-X-STREAM-INF:'):
 752                 last_info = {}
 753                 for m in kv_rex.finditer(line):
 754                     v = m.group('val')
 755                     if v.startswith('"'):
 756                         v = v[1:-1]
 757                     last_info[m.group('key')] = v
 758             elif line.startswith('#') or not line.strip():
 759                 continue
 760             else:
 761                 if last_info is None:
 762                     formats.append({'url': format_url(line)})
 763                     continue
 764                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 765
 766                 f = {
 767                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 768                     'url': format_url(line.strip()),
 769                     'tbr': tbr,
 770                     'ext': ext,
 771                     'protocol': entry_protocol,
 772                     'preference': preference,
 773                 }
 774                 codecs = last_info.get('CODECS')
 775                 if codecs:
 776                     # TODO: looks like video codec is not always necessarily goes first
 777                     va_codecs = codecs.split(',')
 778                     if va_codecs[0]:
 779                         f['vcodec'] = va_codecs[0].partition('.')[0]
 780                     if len(va_codecs) > 1 and va_codecs[1]:
 781                         f['acodec'] = va_codecs[1].partition('.')[0]
 782                 resolution = last_info.get('RESOLUTION')
 783                 if resolution:
 784                     width_str, height_str = resolution.split('x')
 785                     f['width'] = int(width_str)
 786                     f['height'] = int(height_str)
 787                 formats.append(f)
 788                 last_info = {}
 789         self._sort_formats(formats)
 790         return formats
 791
 792     def _live_title(self, name):
 793         """ Generate the title for a live video """
 794         now = datetime.datetime.now()
 795         now_str = now.strftime("%Y-%m-%d %H:%M")
 796         return name + ' ' + now_str
 797
 798     def _int(self, v, name, fatal=False, **kwargs):
 799         res = int_or_none(v, **kwargs)
 800         if 'get_attr' in kwargs:
 801             print(getattr(v, kwargs['get_attr']))
 802         if res is None:
 803             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 804             if fatal:
 805                 raise ExtractorError(msg)
 806             else:
 807                 self._downloader.report_warning(msg)
 808         return res
 809
 810     def _float(self, v, name, fatal=False, **kwargs):
 811         res = float_or_none(v, **kwargs)
 812         if res is None:
 813             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 814             if fatal:
 815                 raise ExtractorError(msg)
 816             else:
 817                 self._downloader.report_warning(msg)
 818         return res
 819
 820
 821 class SearchInfoExtractor(InfoExtractor):
 822     """
 823     Base class for paged search queries extractors.
 824     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 825     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 826     """
 827
 828     @classmethod
 829     def _make_valid_url(cls):
 830         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 831
 832     @classmethod
 833     def suitable(cls, url):
 834         return re.match(cls._make_valid_url(), url) is not None
 835
 836     def _real_extract(self, query):
 837         mobj = re.match(self._make_valid_url(), query)
 838         if mobj is None:
 839             raise ExtractorError('Invalid search query "%s"' % query)
 840
 841         prefix = mobj.group('prefix')
 842         query = mobj.group('query')
 843         if prefix == '':
 844             return self._get_n_results(query, 1)
 845         elif prefix == 'all':
 846             return self._get_n_results(query, self._MAX_RESULTS)
 847         else:
 848             n = int(prefix)
 849             if n <= 0:
 850                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 851             elif n > self._MAX_RESULTS:
 852                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 853                 n = self._MAX_RESULTS
 854             return self._get_n_results(query, n)
 855
 856     def _get_n_results(self, query, n):
 857         """Get a specified number of results for a query"""
 858         raise NotImplementedError("This method must be implemented by subclasses")
 859
 860     @property
 861     def SEARCH_KEY(self):
 862         return self._SEARCH_KEY