_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_http_client,
  17     compat_urllib_error,
  18     compat_urllib_parse_urlparse,
  19     compat_urlparse,
  20     compat_str,
  21 )
  22 from ..utils import (
  23     clean_html,
  24     compiled_regex_type,
  25     ExtractorError,
  26     float_or_none,
  27     int_or_none,
  28     RegexNotFoundError,
  29     sanitize_filename,
  30     unescapeHTML,
  31 )
# Sentinel meaning "the caller supplied no default". It lets helpers such as
# InfoExtractor._search_regex tell an omitted default apart from an explicit
# default of None.
_NO_DEFAULT = object()
  33
  34
  35 class InfoExtractor(object):
  36     """Information Extractor class.
  37
  38     Information extractors are the classes that, given a URL, extract
  39     information about the video (or videos) the URL refers to. This
  40     information includes the real video URL, the video title, author and
  41     others. The information is stored in a dictionary which is then
  42     passed to the FileDownloader. The FileDownloader processes this
  43     information possibly downloading the video to the file system, among
  44     other possible outcomes.
  45
  46     The dictionaries must include the following fields:
  47
  48     id:             Video identifier.
  49     title:          Video title, unescaped.
  50
  51     Additionally, it must contain either a formats entry or a url one:
  52
  53     formats:        A list of dictionaries for each format available, ordered
  54                     from worst to best quality.
  55
  56                     Potential fields:
  57                     * url        Mandatory. The URL of the video file
  58                     * ext        Will be calculated from url if missing
  59                     * format     A human-readable description of the format
  60                                  ("mp4 container with h264/opus").
   61                                  Calculated from the format_id, width, height,
  62                                  and format_note fields if missing.
  63                     * format_id  A short description of the format
  64                                  ("mp4_h264_opus" or "19").
  65                                 Technically optional, but strongly recommended.
  66                     * format_note Additional info about the format
  67                                  ("3D" or "DASH video")
  68                     * width      Width of the video, if known
  69                     * height     Height of the video, if known
  70                     * resolution Textual description of width and height
  71                     * tbr        Average bitrate of audio and video in KBit/s
  72                     * abr        Average audio bitrate in KBit/s
  73                     * acodec     Name of the audio codec in use
  74                     * asr        Audio sampling rate in Hertz
  75                     * vbr        Average video bitrate in KBit/s
  76                     * fps        Frame rate
  77                     * vcodec     Name of the video codec in use
  78                     * container  Name of the container format
  79                     * filesize   The number of bytes, if known in advance
  80                     * filesize_approx  An estimate for the number of bytes
  81                     * player_url SWF Player URL (used for rtmpdump).
  82                     * protocol   The protocol that will be used for the actual
  83                                  download, lower-case.
  84                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  85                     * preference Order number of this format. If this field is
  86                                  present and not None, the formats get sorted
  87                                  by this field, regardless of all other values.
  88                                  -1 for default (order by other properties),
  89                                  -2 or smaller for less than default.
  90                     * quality    Order number of the video quality of this
  91                                  format, irrespective of the file format.
  92                                  -1 for default (order by other properties),
  93                                  -2 or smaller for less than default.
  94                     * source_preference  Order number for this video source
  95                                   (quality takes higher priority)
  96                                  -1 for default (order by other properties),
  97                                  -2 or smaller for less than default.
  98                     * http_referer  HTTP Referer header value to set.
  99                     * http_method  HTTP method to use for the download.
 100                     * http_headers  A dictionary of additional HTTP headers
 101                                  to add to the request.
 102                     * http_post_data  Additional data to send with a POST
 103                                  request.
 104     url:            Final video URL.
 105     ext:            Video filename extension.
 106     format:         The video format, defaults to ext (used for --get-format)
 107     player_url:     SWF Player URL (used for rtmpdump).
 108
 109     The following fields are optional:
 110
 111     display_id      An alternative identifier for the video, not necessarily
 112                     unique, but available before title. Typically, id is
 113                     something like "4234987", title "Dancing naked mole rats",
 114                     and display_id "dancing-naked-mole-rats"
 115     thumbnails:     A list of dictionaries, with the following entries:
 116                         * "url"
 117                         * "width" (optional, int)
 118                         * "height" (optional, int)
  119                         * "resolution" (optional, string "{width}x{height}",
 120                                         deprecated)
 121     thumbnail:      Full URL to a video thumbnail image.
 122     description:    One-line video description.
 123     uploader:       Full name of the video uploader.
 124     timestamp:      UNIX timestamp of the moment the video became available.
 125     upload_date:    Video upload date (YYYYMMDD).
 126                     If not explicitly set, calculated from timestamp.
 127     uploader_id:    Nickname or id of the video uploader.
 128     location:       Physical location where the video was filmed.
 129     subtitles:      The subtitle file contents as a dictionary in the format
 130                     {language: subtitles}.
 131     duration:       Length of the video in seconds, as an integer.
 132     view_count:     How many users have watched the video on the platform.
 133     like_count:     Number of positive ratings of the video
 134     dislike_count:  Number of negative ratings of the video
 135     comment_count:  Number of comments on the video
 136     age_limit:      Age restriction for the video, as an integer (years)
 137     webpage_url:    The url to the video webpage, if given to youtube-dl it
 138                     should allow to get the same result again. (It will be set
 139                     by YoutubeDL if it's missing)
 140     categories:     A list of categories that the video falls in, for example
 141                     ["Sports", "Berlin"]
 142     is_live:        True, False, or None (=unknown). Whether this video is a
 143                     live stream that goes on instead of a fixed-length video.
 144
 145     Unless mentioned otherwise, the fields should be Unicode strings.
 146
 147     Unless mentioned otherwise, None is equivalent to absence of information.
 148
 149     Subclasses of this one should re-define the _real_initialize() and
 150     _real_extract() methods and define a _VALID_URL regexp.
 151     Probably, they should also be added to the list of extractors.
 152
 153     Finally, the _WORKING attribute should be set to False for broken IEs
 154     in order to warn the users and skip the tests.
 155     """
 156
    # Becomes True once initialize() has run _real_initialize().
    _ready = False
    # Downloader object used for network access and output; assigned through
    # set_downloader().
    _downloader = None
    # Returned by working(); set to False in subclasses for broken extractors.
    _WORKING = True
 160
 161     def __init__(self, downloader=None):
 162         """Constructor. Receives an optional downloader."""
 163         self._ready = False
 164         self.set_downloader(downloader)
 165
 166     @classmethod
 167     def suitable(cls, url):
 168         """Receives a URL and returns True if suitable for this IE."""
 169
 170         # This does not use has/getattr intentionally - we want to know whether
 171         # we have cached the regexp for *this* class, whereas getattr would also
 172         # match the superclass
 173         if '_VALID_URL_RE' not in cls.__dict__:
 174             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 175         return cls._VALID_URL_RE.match(url) is not None
 176
 177     @classmethod
 178     def _match_id(cls, url):
 179         if '_VALID_URL_RE' not in cls.__dict__:
 180             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 181         m = cls._VALID_URL_RE.match(url)
 182         assert m
 183         return m.group('id')
 184
 185     @classmethod
 186     def working(cls):
 187         """Getter method for _WORKING."""
 188         return cls._WORKING
 189
 190     def initialize(self):
 191         """Initializes an instance (authentication, etc)."""
 192         if not self._ready:
 193             self._real_initialize()
 194             self._ready = True
 195
 196     def extract(self, url):
 197         """Extracts URL information and returns it in list of dicts."""
 198         self.initialize()
 199         return self._real_extract(url)
 200
 201     def set_downloader(self, downloader):
 202         """Sets the downloader for this IE."""
 203         self._downloader = downloader
 204
 205     def _real_initialize(self):
 206         """Real initialization process. Redefine in subclasses."""
 207         pass
 208
 209     def _real_extract(self, url):
 210         """Real extraction process. Redefine in subclasses."""
 211         pass
 212
 213     @classmethod
 214     def ie_key(cls):
 215         """A string for getting the InfoExtractor with get_info_extractor"""
 216         return cls.__name__[:-2]
 217
 218     @property
 219     def IE_NAME(self):
 220         return type(self).__name__[:-2]
 221
 222     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 223         """ Returns the response handle """
 224         if note is None:
 225             self.report_download_webpage(video_id)
 226         elif note is not False:
 227             if video_id is None:
 228                 self.to_screen('%s' % (note,))
 229             else:
 230                 self.to_screen('%s: %s' % (video_id, note))
 231         try:
 232             return self._downloader.urlopen(url_or_request)
 233         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 234             if errnote is False:
 235                 return False
 236             if errnote is None:
 237                 errnote = 'Unable to download webpage'
 238             errmsg = '%s: %s' % (errnote, compat_str(err))
 239             if fatal:
 240                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 241             else:
 242                 self._downloader.report_warning(errmsg)
 243                 return False
 244
 245     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 246         """ Returns a tuple (page content as string, URL handle) """
 247         # Strip hashes from the URL (#1038)
 248         if isinstance(url_or_request, (compat_str, str)):
 249             url_or_request = url_or_request.partition('#')[0]
 250
 251         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 252         if urlh is False:
 253             assert not fatal
 254             return False
 255         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 256         return (content, urlh)
 257
 258     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
 259         content_type = urlh.headers.get('Content-Type', '')
 260         webpage_bytes = urlh.read()
 261         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 262         if m:
 263             encoding = m.group(1)
 264         else:
 265             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 266                           webpage_bytes[:1024])
 267             if m:
 268                 encoding = m.group(1).decode('ascii')
 269             elif webpage_bytes.startswith(b'\xff\xfe'):
 270                 encoding = 'utf-16'
 271             else:
 272                 encoding = 'utf-8'
 273         if self._downloader.params.get('dump_intermediate_pages', False):
 274             try:
 275                 url = url_or_request.get_full_url()
 276             except AttributeError:
 277                 url = url_or_request
 278             self.to_screen('Dumping request to ' + url)
 279             dump = base64.b64encode(webpage_bytes).decode('ascii')
 280             self._downloader.to_screen(dump)
 281         if self._downloader.params.get('write_pages', False):
 282             try:
 283                 url = url_or_request.get_full_url()
 284             except AttributeError:
 285                 url = url_or_request
 286             basen = '%s_%s' % (video_id, url)
 287             if len(basen) > 240:
 288                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 289                 basen = basen[:240 - len(h)] + h
 290             raw_filename = basen + '.dump'
 291             filename = sanitize_filename(raw_filename, restricted=True)
 292             self.to_screen('Saving request to ' + filename)
 293             # Working around MAX_PATH limitation on Windows (see
 294             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 295             if os.name == 'nt':
 296                 absfilepath = os.path.abspath(filename)
 297                 if len(absfilepath) > 259:
 298                     filename = '\\\\?\\' + absfilepath
 299             with open(filename, 'wb') as outf:
 300                 outf.write(webpage_bytes)
 301
 302         try:
 303             content = webpage_bytes.decode(encoding, 'replace')
 304         except LookupError:
 305             content = webpage_bytes.decode('utf-8', 'replace')
 306
 307         if ('<title>Access to this site is blocked</title>' in content and
 308                 'Websense' in content[:512]):
 309             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 310             blocked_iframe = self._html_search_regex(
 311                 r'<iframe src="([^"]+)"', content,
 312                 'Websense information URL', default=None)
 313             if blocked_iframe:
 314                 msg += ' Visit %s for more details' % blocked_iframe
 315             raise ExtractorError(msg, expected=True)
 316
 317         return content
 318
 319     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 320         """ Returns the data of the page as a string """
 321         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 322         if res is False:
 323             return res
 324         else:
 325             content, _ = res
 326             return content
 327
 328     def _download_xml(self, url_or_request, video_id,
 329                       note='Downloading XML', errnote='Unable to download XML',
 330                       transform_source=None, fatal=True):
 331         """Return the xml as an xml.etree.ElementTree.Element"""
 332         xml_string = self._download_webpage(
 333             url_or_request, video_id, note, errnote, fatal=fatal)
 334         if xml_string is False:
 335             return xml_string
 336         if transform_source:
 337             xml_string = transform_source(xml_string)
 338         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 339
 340     def _download_json(self, url_or_request, video_id,
 341                        note='Downloading JSON metadata',
 342                        errnote='Unable to download JSON metadata',
 343                        transform_source=None,
 344                        fatal=True):
 345         json_string = self._download_webpage(
 346             url_or_request, video_id, note, errnote, fatal=fatal)
 347         if (not fatal) and json_string is False:
 348             return None
 349         if transform_source:
 350             json_string = transform_source(json_string)
 351         try:
 352             return json.loads(json_string)
 353         except ValueError as ve:
 354             errmsg = '%s: Failed to parse JSON ' % video_id
 355             if fatal:
 356                 raise ExtractorError(errmsg, cause=ve)
 357             else:
 358                 self.report_warning(errmsg + str(ve))
 359
 360     def report_warning(self, msg, video_id=None):
 361         idstr = '' if video_id is None else '%s: ' % video_id
 362         self._downloader.report_warning(
 363             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 364
 365     def to_screen(self, msg):
 366         """Print msg to screen, prefixing it with '[ie_name]'"""
 367         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 368
 369     def report_extraction(self, id_or_name):
 370         """Report information extraction."""
 371         self.to_screen('%s: Extracting information' % id_or_name)
 372
 373     def report_download_webpage(self, video_id):
 374         """Report webpage download."""
 375         self.to_screen('%s: Downloading webpage' % video_id)
 376
 377     def report_age_confirmation(self):
 378         """Report attempt to confirm age."""
 379         self.to_screen('Confirming age')
 380
 381     def report_login(self):
 382         """Report attempt to log in."""
 383         self.to_screen('Logging in')
 384
 385     #Methods for following #608
 386     @staticmethod
 387     def url_result(url, ie=None, video_id=None):
 388         """Returns a url that points to a page that should be processed"""
 389         #TODO: ie should be the class used for getting the info
 390         video_info = {'_type': 'url',
 391                       'url': url,
 392                       'ie_key': ie}
 393         if video_id is not None:
 394             video_info['id'] = video_id
 395         return video_info
 396     @staticmethod
 397     def playlist_result(entries, playlist_id=None, playlist_title=None):
 398         """Returns a playlist"""
 399         video_info = {'_type': 'playlist',
 400                       'entries': entries}
 401         if playlist_id:
 402             video_info['id'] = playlist_id
 403         if playlist_title:
 404             video_info['title'] = playlist_title
 405         return video_info
 406
 407     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 408         """
 409         Perform a regex search on the given string, using a single or a list of
 410         patterns returning the first matching group.
 411         In case of failure return a default value or raise a WARNING or a
 412         RegexNotFoundError, depending on fatal, specifying the field name.
 413         """
 414         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 415             mobj = re.search(pattern, string, flags)
 416         else:
 417             for p in pattern:
 418                 mobj = re.search(p, string, flags)
 419                 if mobj:
 420                     break
 421
 422         if os.name != 'nt' and sys.stderr.isatty():
 423             _name = '\033[0;34m%s\033[0m' % name
 424         else:
 425             _name = name
 426
 427         if mobj:
 428             if group is None:
 429                 # return the first matching group
 430                 return next(g for g in mobj.groups() if g is not None)
 431             else:
 432                 return mobj.group(group)
 433         elif default is not _NO_DEFAULT:
 434             return default
 435         elif fatal:
 436             raise RegexNotFoundError('Unable to extract %s' % _name)
 437         else:
 438             self._downloader.report_warning('unable to extract %s; '
 439                 'please report this issue on http://yt-dl.org/bug' % _name)
 440             return None
 441
 442     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 443         """
 444         Like _search_regex, but strips HTML tags and unescapes entities.
 445         """
 446         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 447         if res:
 448             return clean_html(res).strip()
 449         else:
 450             return res
 451
 452     def _get_login_info(self):
 453         """
 454         Get the the login info as (username, password)
 455         It will look in the netrc file using the _NETRC_MACHINE value
 456         If there's no info available, return (None, None)
 457         """
 458         if self._downloader is None:
 459             return (None, None)
 460
 461         username = None
 462         password = None
 463         downloader_params = self._downloader.params
 464
 465         # Attempt to use provided username and password or .netrc data
 466         if downloader_params.get('username', None) is not None:
 467             username = downloader_params['username']
 468             password = downloader_params['password']
 469         elif downloader_params.get('usenetrc', False):
 470             try:
 471                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 472                 if info is not None:
 473                     username = info[0]
 474                     password = info[2]
 475                 else:
 476                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 477             except (IOError, netrc.NetrcParseError) as err:
 478                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 479
 480         return (username, password)
 481
 482     def _get_tfa_info(self):
 483         """
 484         Get the two-factor authentication info
 485         TODO - asking the user will be required for sms/phone verify
 486         currently just uses the command line option
 487         If there's no info available, return None
 488         """
 489         if self._downloader is None:
 490             return None
 491         downloader_params = self._downloader.params
 492
 493         if downloader_params.get('twofactor', None) is not None:
 494             return downloader_params['twofactor']
 495
 496         return None
 497
 498     # Helper functions for extracting OpenGraph info
 499     @staticmethod
 500     def _og_regexes(prop):
 501         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 502         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 503         template = r'<meta[^>]+?%s[^>]+?%s'
 504         return [
 505             template % (property_re, content_re),
 506             template % (content_re, property_re),
 507         ]
 508
 509     def _og_search_property(self, prop, html, name=None, **kargs):
 510         if name is None:
 511             name = 'OpenGraph %s' % prop
 512         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 513         if escaped is None:
 514             return None
 515         return unescapeHTML(escaped)
 516
 517     def _og_search_thumbnail(self, html, **kargs):
 518         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 519
 520     def _og_search_description(self, html, **kargs):
 521         return self._og_search_property('description', html, fatal=False, **kargs)
 522
 523     def _og_search_title(self, html, **kargs):
 524         return self._og_search_property('title', html, **kargs)
 525
 526     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 527         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 528         if secure:
 529             regexes = self._og_regexes('video:secure_url') + regexes
 530         return self._html_search_regex(regexes, html, name, **kargs)
 531
 532     def _og_search_url(self, html, **kargs):
 533         return self._og_search_property('url', html, **kargs)
 534
 535     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 536         if display_name is None:
 537             display_name = name
 538         return self._html_search_regex(
 539             r'''(?ix)<meta
 540                     (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
 541                     [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
 542             html, display_name, fatal=fatal, group='content', **kwargs)
 543
 544     def _dc_search_uploader(self, html):
 545         return self._html_search_meta('dc.creator', html, 'uploader')
 546
 547     def _rta_search(self, html):
 548         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 549         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 550                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 551                      html):
 552             return 18
 553         return 0
 554
 555     def _media_rating_search(self, html):
 556         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 557         rating = self._html_search_meta('rating', html)
 558
 559         if not rating:
 560             return None
 561
 562         RATING_TABLE = {
 563             'safe for kids': 0,
 564             'general': 8,
 565             '14 years': 14,
 566             'mature': 17,
 567             'restricted': 19,
 568         }
 569         return RATING_TABLE.get(rating.lower(), None)
 570
 571     def _twitter_search_player(self, html):
 572         return self._html_search_meta('twitter:player', html,
 573             'twitter card player')
 574
 575     def _sort_formats(self, formats):
 576         if not formats:
 577             raise ExtractorError('No video formats found')
 578
 579         def _formats_key(f):
 580             # TODO remove the following workaround
 581             from ..utils import determine_ext
 582             if not f.get('ext') and 'url' in f:
 583                 f['ext'] = determine_ext(f['url'])
 584
 585             preference = f.get('preference')
 586             if preference is None:
 587                 proto = f.get('protocol')
 588                 if proto is None:
 589                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 590
 591                 preference = 0 if proto in ['http', 'https'] else -0.1
 592                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 593                     preference -= 0.5
 594
 595             if f.get('vcodec') == 'none':  # audio only
 596                 if self._downloader.params.get('prefer_free_formats'):
 597                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 598                 else:
 599                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 600                 ext_preference = 0
 601                 try:
 602                     audio_ext_preference = ORDER.index(f['ext'])
 603                 except ValueError:
 604                     audio_ext_preference = -1
 605             else:
 606                 if self._downloader.params.get('prefer_free_formats'):
 607                     ORDER = ['flv', 'mp4', 'webm']
 608                 else:
 609                     ORDER = ['webm', 'flv', 'mp4']
 610                 try:
 611                     ext_preference = ORDER.index(f['ext'])
 612                 except ValueError:
 613                     ext_preference = -1
 614                 audio_ext_preference = 0
 615
 616             return (
 617                 preference,
 618                 f.get('quality') if f.get('quality') is not None else -1,
 619                 f.get('height') if f.get('height') is not None else -1,
 620                 f.get('width') if f.get('width') is not None else -1,
 621                 ext_preference,
 622                 f.get('tbr') if f.get('tbr') is not None else -1,
 623                 f.get('vbr') if f.get('vbr') is not None else -1,
 624                 f.get('abr') if f.get('abr') is not None else -1,
 625                 audio_ext_preference,
 626                 f.get('fps') if f.get('fps') is not None else -1,
 627                 f.get('filesize') if f.get('filesize') is not None else -1,
 628                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 629                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 630                 f.get('format_id'),
 631             )
 632         formats.sort(key=_formats_key)
 633
 634     def http_scheme(self):
 635         """ Either "http:" or "https:", depending on the user's preferences """
 636         return (
 637             'http:'
 638             if self._downloader.params.get('prefer_insecure', False)
 639             else 'https:')
 640
 641     def _proto_relative_url(self, url, scheme=None):
 642         if url is None:
 643             return url
 644         if url.startswith('//'):
 645             if scheme is None:
 646                 scheme = self.http_scheme()
 647             return scheme + url
 648         else:
 649             return url
 650
 651     def _sleep(self, timeout, video_id, msg_template=None):
 652         if msg_template is None:
 653             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 654         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 655         self.to_screen(msg)
 656         time.sleep(timeout)
 657
 658     def _extract_f4m_formats(self, manifest_url, video_id):
 659         manifest = self._download_xml(
 660             manifest_url, video_id, 'Downloading f4m manifest',
 661             'Unable to download f4m manifest')
 662
 663         formats = []
 664         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 665         for i, media_el in enumerate(media_nodes):
 666             tbr = int_or_none(media_el.attrib.get('bitrate'))
 667             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 668             formats.append({
 669                 'format_id': format_id,
 670                 'url': manifest_url,
 671                 'ext': 'flv',
 672                 'tbr': tbr,
 673                 'width': int_or_none(media_el.attrib.get('width')),
 674                 'height': int_or_none(media_el.attrib.get('height')),
 675             })
 676         self._sort_formats(formats)
 677
 678         return formats
 679
 680     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 681                               entry_protocol='m3u8', preference=None):
 682
 683         formats = [{
 684             'format_id': 'm3u8-meta',
 685             'url': m3u8_url,
 686             'ext': ext,
 687             'protocol': 'm3u8',
 688             'preference': -1,
 689             'resolution': 'multiple',
 690             'format_note': 'Quality selection URL',
 691         }]
 692
 693         format_url = lambda u: (
 694             u
 695             if re.match(r'^https?://', u)
 696             else compat_urlparse.urljoin(m3u8_url, u))
 697
 698         m3u8_doc = self._download_webpage(
 699             m3u8_url, video_id,
 700             note='Downloading m3u8 information',
 701             errnote='Failed to download m3u8 information')
 702         last_info = None
 703         kv_rex = re.compile(
 704             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 705         for line in m3u8_doc.splitlines():
 706             if line.startswith('#EXT-X-STREAM-INF:'):
 707                 last_info = {}
 708                 for m in kv_rex.finditer(line):
 709                     v = m.group('val')
 710                     if v.startswith('"'):
 711                         v = v[1:-1]
 712                     last_info[m.group('key')] = v
 713             elif line.startswith('#') or not line.strip():
 714                 continue
 715             else:
 716                 if last_info is None:
 717                     formats.append({'url': format_url(line)})
 718                     continue
 719                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 720
 721                 f = {
 722                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 723                     'url': format_url(line.strip()),
 724                     'tbr': tbr,
 725                     'ext': ext,
 726                     'protocol': entry_protocol,
 727                     'preference': preference,
 728                 }
 729                 codecs = last_info.get('CODECS')
 730                 if codecs:
 731                     # TODO: looks like video codec is not always necessarily goes first
 732                     va_codecs = codecs.split(',')
 733                     if va_codecs[0]:
 734                         f['vcodec'] = va_codecs[0].partition('.')[0]
 735                     if len(va_codecs) > 1 and va_codecs[1]:
 736                         f['acodec'] = va_codecs[1].partition('.')[0]
 737                 resolution = last_info.get('RESOLUTION')
 738                 if resolution:
 739                     width_str, height_str = resolution.split('x')
 740                     f['width'] = int(width_str)
 741                     f['height'] = int(height_str)
 742                 formats.append(f)
 743                 last_info = {}
 744         self._sort_formats(formats)
 745         return formats
 746
 747     def _live_title(self, name):
 748         """ Generate the title for a live video """
 749         now = datetime.datetime.now()
 750         now_str = now.strftime("%Y-%m-%d %H:%M")
 751         return name + ' ' + now_str
 752
 753     def _int(self, v, name, fatal=False, **kwargs):
 754         res = int_or_none(v, **kwargs)
 755         if 'get_attr' in kwargs:
 756             print(getattr(v, kwargs['get_attr']))
 757         if res is None:
 758             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 759             if fatal:
 760                 raise ExtractorError(msg)
 761             else:
 762                 self._downloader.report_warning(msg)
 763         return res
 764
 765     def _float(self, v, name, fatal=False, **kwargs):
 766         res = float_or_none(v, **kwargs)
 767         if res is None:
 768             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 769             if fatal:
 770                 raise ExtractorError(msg)
 771             else:
 772                 self._downloader.report_warning(msg)
 773         return res
 774
 775
 776 class SearchInfoExtractor(InfoExtractor):
 777     """
 778     Base class for paged search queries extractors.
 779     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 780     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 781     """
 782
 783     @classmethod
 784     def _make_valid_url(cls):
 785         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 786
 787     @classmethod
 788     def suitable(cls, url):
 789         return re.match(cls._make_valid_url(), url) is not None
 790
 791     def _real_extract(self, query):
 792         mobj = re.match(self._make_valid_url(), query)
 793         if mobj is None:
 794             raise ExtractorError('Invalid search query "%s"' % query)
 795
 796         prefix = mobj.group('prefix')
 797         query = mobj.group('query')
 798         if prefix == '':
 799             return self._get_n_results(query, 1)
 800         elif prefix == 'all':
 801             return self._get_n_results(query, self._MAX_RESULTS)
 802         else:
 803             n = int(prefix)
 804             if n <= 0:
 805                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 806             elif n > self._MAX_RESULTS:
 807                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 808                 n = self._MAX_RESULTS
 809             return self._get_n_results(query, n)
 810
 811     def _get_n_results(self, query, n):
 812         """Get a specified number of results for a query"""
 813         raise NotImplementedError("This method must be implemented by subclasses")
 814
 815     @property
 816     def SEARCH_KEY(self):
 817         return self._SEARCH_KEY