_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_http_client,
  17     compat_urllib_error,
  18     compat_urllib_parse_urlparse,
  19     compat_urlparse,
  20     compat_str,
  21 )
  22 from ..utils import (
  23     clean_html,
  24     compiled_regex_type,
  25     ExtractorError,
  26     float_or_none,
  27     int_or_none,
  28     RegexNotFoundError,
  29     sanitize_filename,
  30     unescapeHTML,
  31 )
  32 _NO_DEFAULT = object()
  33
  34
  35 class InfoExtractor(object):
  36     """Information Extractor class.
  37
  38     Information extractors are the classes that, given a URL, extract
  39     information about the video (or videos) the URL refers to. This
  40     information includes the real video URL, the video title, author and
  41     others. The information is stored in a dictionary which is then
  42     passed to the FileDownloader. The FileDownloader processes this
  43     information possibly downloading the video to the file system, among
  44     other possible outcomes.
  45
  46     The dictionaries must include the following fields:
  47
  48     id:             Video identifier.
  49     title:          Video title, unescaped.
  50
  51     Additionally, it must contain either a formats entry or a url one:
  52
  53     formats:        A list of dictionaries for each format available, ordered
  54                     from worst to best quality.
  55
  56                     Potential fields:
  57                     * url        Mandatory. The URL of the video file
  58                     * ext        Will be calculated from url if missing
  59                     * format     A human-readable description of the format
  60                                  ("mp4 container with h264/opus").
  61                                  Calculated from the format_id, width, height,
  62                                  and format_note fields if missing.
  63                     * format_id  A short description of the format
  64                                  ("mp4_h264_opus" or "19").
  65                                 Technically optional, but strongly recommended.
  66                     * format_note Additional info about the format
  67                                  ("3D" or "DASH video")
  68                     * width      Width of the video, if known
  69                     * height     Height of the video, if known
  70                     * resolution Textual description of width and height
  71                     * tbr        Average bitrate of audio and video in KBit/s
  72                     * abr        Average audio bitrate in KBit/s
  73                     * acodec     Name of the audio codec in use
  74                     * asr        Audio sampling rate in Hertz
  75                     * vbr        Average video bitrate in KBit/s
  76                     * fps        Frame rate
  77                     * vcodec     Name of the video codec in use
  78                     * container  Name of the container format
  79                     * filesize   The number of bytes, if known in advance
  80                     * filesize_approx  An estimate for the number of bytes
  81                     * player_url SWF Player URL (used for rtmpdump).
  82                     * protocol   The protocol that will be used for the actual
  83                                  download, lower-case.
  84                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  85                     * preference Order number of this format. If this field is
  86                                  present and not None, the formats get sorted
  87                                  by this field, regardless of all other values.
  88                                  -1 for default (order by other properties),
  89                                  -2 or smaller for less than default.
  90                     * quality    Order number of the video quality of this
  91                                  format, irrespective of the file format.
  92                                  -1 for default (order by other properties),
  93                                  -2 or smaller for less than default.
  94                     * source_preference  Order number for this video source
  95                                   (quality takes higher priority)
  96                                  -1 for default (order by other properties),
  97                                  -2 or smaller for less than default.
  98                     * http_referer  HTTP Referer header value to set.
  99                     * http_method  HTTP method to use for the download.
 100                     * http_headers  A dictionary of additional HTTP headers
 101                                  to add to the request.
 102                     * http_post_data  Additional data to send with a POST
 103                                  request.
 104     url:            Final video URL.
 105     ext:            Video filename extension.
 106     format:         The video format, defaults to ext (used for --get-format)
 107     player_url:     SWF Player URL (used for rtmpdump).
 108
 109     The following fields are optional:
 110
 111     display_id      An alternative identifier for the video, not necessarily
 112                     unique, but available before title. Typically, id is
 113                     something like "4234987", title "Dancing naked mole rats",
 114                     and display_id "dancing-naked-mole-rats"
 115     thumbnails:     A list of dictionaries, with the following entries:
 116                         * "url"
 117                         * "width" (optional, int)
 118                         * "height" (optional, int)
 119                         * "resolution" (optional, string "{width}x{height}",
 120                                         deprecated)
 121     thumbnail:      Full URL to a video thumbnail image.
 122     description:    One-line video description.
 123     uploader:       Full name of the video uploader.
 124     timestamp:      UNIX timestamp of the moment the video became available.
 125     upload_date:    Video upload date (YYYYMMDD).
 126                     If not explicitly set, calculated from timestamp.
 127     uploader_id:    Nickname or id of the video uploader.
 128     location:       Physical location where the video was filmed.
 129     subtitles:      The subtitle file contents as a dictionary in the format
 130                     {language: subtitles}.
 131     duration:       Length of the video in seconds, as an integer.
 132     view_count:     How many users have watched the video on the platform.
 133     like_count:     Number of positive ratings of the video
 134     dislike_count:  Number of negative ratings of the video
 135     comment_count:  Number of comments on the video
 136     age_limit:      Age restriction for the video, as an integer (years)
 137     webpage_url:    The url to the video webpage, if given to youtube-dl it
 138                     should allow to get the same result again. (It will be set
 139                     by YoutubeDL if it's missing)
 140     categories:     A list of categories that the video falls in, for example
 141                     ["Sports", "Berlin"]
 142     is_live:        True, False, or None (=unknown). Whether this video is a
 143                     live stream that goes on instead of a fixed-length video.
 144
 145     Unless mentioned otherwise, the fields should be Unicode strings.
 146
 147     Unless mentioned otherwise, None is equivalent to absence of information.
 148
 149     Subclasses of this one should re-define the _real_initialize() and
 150     _real_extract() methods and define a _VALID_URL regexp.
 151     Probably, they should also be added to the list of extractors.
 152
 153     Finally, the _WORKING attribute should be set to False for broken IEs
 154     in order to warn the users and skip the tests.
 155     """
 156
 157     _ready = False
 158     _downloader = None
 159     _WORKING = True
 160
 161     def __init__(self, downloader=None):
 162         """Constructor. Receives an optional downloader."""
 163         self._ready = False
 164         self.set_downloader(downloader)
 165
 166     @classmethod
 167     def suitable(cls, url):
 168         """Receives a URL and returns True if suitable for this IE."""
 169
 170         # This does not use has/getattr intentionally - we want to know whether
 171         # we have cached the regexp for *this* class, whereas getattr would also
 172         # match the superclass
 173         if '_VALID_URL_RE' not in cls.__dict__:
 174             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 175         return cls._VALID_URL_RE.match(url) is not None
 176
 177     @classmethod
 178     def _match_id(cls, url):
 179         if '_VALID_URL_RE' not in cls.__dict__:
 180             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 181         m = cls._VALID_URL_RE.match(url)
 182         assert m
 183         return m.group('id')
 184
 185     @classmethod
 186     def working(cls):
 187         """Getter method for _WORKING."""
 188         return cls._WORKING
 189
 190     def initialize(self):
 191         """Initializes an instance (authentication, etc)."""
 192         if not self._ready:
 193             self._real_initialize()
 194             self._ready = True
 195
 196     def extract(self, url):
 197         """Extracts URL information and returns it in list of dicts."""
 198         self.initialize()
 199         return self._real_extract(url)
 200
 201     def set_downloader(self, downloader):
 202         """Sets the downloader for this IE."""
 203         self._downloader = downloader
 204
 205     def _real_initialize(self):
 206         """Real initialization process. Redefine in subclasses."""
 207         pass
 208
 209     def _real_extract(self, url):
 210         """Real extraction process. Redefine in subclasses."""
 211         pass
 212
 213     @classmethod
 214     def ie_key(cls):
 215         """A string for getting the InfoExtractor with get_info_extractor"""
 216         return cls.__name__[:-2]
 217
 218     @property
 219     def IE_NAME(self):
 220         return type(self).__name__[:-2]
 221
 222     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 223         """ Returns the response handle """
 224         if note is None:
 225             self.report_download_webpage(video_id)
 226         elif note is not False:
 227             if video_id is None:
 228                 self.to_screen('%s' % (note,))
 229             else:
 230                 self.to_screen('%s: %s' % (video_id, note))
 231         try:
 232             return self._downloader.urlopen(url_or_request)
 233         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 234             if errnote is False:
 235                 return False
 236             if errnote is None:
 237                 errnote = 'Unable to download webpage'
 238             errmsg = '%s: %s' % (errnote, compat_str(err))
 239             if fatal:
 240                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 241             else:
 242                 self._downloader.report_warning(errmsg)
 243                 return False
 244
 245     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 246         """ Returns a tuple (page content as string, URL handle) """
 247         # Strip hashes from the URL (#1038)
 248         if isinstance(url_or_request, (compat_str, str)):
 249             url_or_request = url_or_request.partition('#')[0]
 250
 251         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 252         if urlh is False:
 253             assert not fatal
 254             return False
 255         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 256         return (content, urlh)
 257
 258     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
 259         content_type = urlh.headers.get('Content-Type', '')
 260         webpage_bytes = urlh.read()
 261         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 262         if m:
 263             encoding = m.group(1)
 264         else:
 265             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 266                           webpage_bytes[:1024])
 267             if m:
 268                 encoding = m.group(1).decode('ascii')
 269             elif webpage_bytes.startswith(b'\xff\xfe'):
 270                 encoding = 'utf-16'
 271             else:
 272                 encoding = 'utf-8'
 273         if self._downloader.params.get('dump_intermediate_pages', False):
 274             try:
 275                 url = url_or_request.get_full_url()
 276             except AttributeError:
 277                 url = url_or_request
 278             self.to_screen('Dumping request to ' + url)
 279             dump = base64.b64encode(webpage_bytes).decode('ascii')
 280             self._downloader.to_screen(dump)
 281         if self._downloader.params.get('write_pages', False):
 282             try:
 283                 url = url_or_request.get_full_url()
 284             except AttributeError:
 285                 url = url_or_request
 286             basen = '%s_%s' % (video_id, url)
 287             if len(basen) > 240:
 288                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 289                 basen = basen[:240 - len(h)] + h
 290             raw_filename = basen + '.dump'
 291             filename = sanitize_filename(raw_filename, restricted=True)
 292             self.to_screen('Saving request to ' + filename)
 293             # Working around MAX_PATH limitation on Windows (see
 294             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 295             if os.name == 'nt':
 296                 absfilepath = os.path.abspath(filename)
 297                 if len(absfilepath) > 259:
 298                     filename = '\\\\?\\' + absfilepath
 299             with open(filename, 'wb') as outf:
 300                 outf.write(webpage_bytes)
 301
 302         try:
 303             content = webpage_bytes.decode(encoding, 'replace')
 304         except LookupError:
 305             content = webpage_bytes.decode('utf-8', 'replace')
 306
 307         if ('<title>Access to this site is blocked</title>' in content and
 308                 'Websense' in content[:512]):
 309             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 310             blocked_iframe = self._html_search_regex(
 311                 r'<iframe src="([^"]+)"', content,
 312                 'Websense information URL', default=None)
 313             if blocked_iframe:
 314                 msg += ' Visit %s for more details' % blocked_iframe
 315             raise ExtractorError(msg, expected=True)
 316
 317         return content
 318
 319     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 320         """ Returns the data of the page as a string """
 321         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 322         if res is False:
 323             return res
 324         else:
 325             content, _ = res
 326             return content
 327
 328     def _download_xml(self, url_or_request, video_id,
 329                       note='Downloading XML', errnote='Unable to download XML',
 330                       transform_source=None, fatal=True):
 331         """Return the xml as an xml.etree.ElementTree.Element"""
 332         xml_string = self._download_webpage(
 333             url_or_request, video_id, note, errnote, fatal=fatal)
 334         if xml_string is False:
 335             return xml_string
 336         if transform_source:
 337             xml_string = transform_source(xml_string)
 338         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 339
 340     def _download_json(self, url_or_request, video_id,
 341                        note='Downloading JSON metadata',
 342                        errnote='Unable to download JSON metadata',
 343                        transform_source=None,
 344                        fatal=True):
 345         json_string = self._download_webpage(
 346             url_or_request, video_id, note, errnote, fatal=fatal)
 347         if (not fatal) and json_string is False:
 348             return None
 349         if transform_source:
 350             json_string = transform_source(json_string)
 351         try:
 352             return json.loads(json_string)
 353         except ValueError as ve:
 354             errmsg = '%s: Failed to parse JSON ' % video_id
 355             if fatal:
 356                 raise ExtractorError(errmsg, cause=ve)
 357             else:
 358                 self.report_warning(errmsg + str(ve))
 359
 360     def report_warning(self, msg, video_id=None):
 361         idstr = '' if video_id is None else '%s: ' % video_id
 362         self._downloader.report_warning(
 363             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 364
 365     def to_screen(self, msg):
 366         """Print msg to screen, prefixing it with '[ie_name]'"""
 367         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 368
 369     def report_extraction(self, id_or_name):
 370         """Report information extraction."""
 371         self.to_screen('%s: Extracting information' % id_or_name)
 372
 373     def report_download_webpage(self, video_id):
 374         """Report webpage download."""
 375         self.to_screen('%s: Downloading webpage' % video_id)
 376
 377     def report_age_confirmation(self):
 378         """Report attempt to confirm age."""
 379         self.to_screen('Confirming age')
 380
 381     def report_login(self):
 382         """Report attempt to log in."""
 383         self.to_screen('Logging in')
 384
 385     #Methods for following #608
 386     @staticmethod
 387     def url_result(url, ie=None, video_id=None):
 388         """Returns a url that points to a page that should be processed"""
 389         #TODO: ie should be the class used for getting the info
 390         video_info = {'_type': 'url',
 391                       'url': url,
 392                       'ie_key': ie}
 393         if video_id is not None:
 394             video_info['id'] = video_id
 395         return video_info
 396     @staticmethod
 397     def playlist_result(entries, playlist_id=None, playlist_title=None):
 398         """Returns a playlist"""
 399         video_info = {'_type': 'playlist',
 400                       'entries': entries}
 401         if playlist_id:
 402             video_info['id'] = playlist_id
 403         if playlist_title:
 404             video_info['title'] = playlist_title
 405         return video_info
 406
 407     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 408         """
 409         Perform a regex search on the given string, using a single or a list of
 410         patterns returning the first matching group.
 411         In case of failure return a default value or raise a WARNING or a
 412         RegexNotFoundError, depending on fatal, specifying the field name.
 413         """
 414         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 415             mobj = re.search(pattern, string, flags)
 416         else:
 417             for p in pattern:
 418                 mobj = re.search(p, string, flags)
 419                 if mobj:
 420                     break
 421
 422         if os.name != 'nt' and sys.stderr.isatty():
 423             _name = '\033[0;34m%s\033[0m' % name
 424         else:
 425             _name = name
 426
 427         if mobj:
 428             # return the first matching group
 429             return next(g for g in mobj.groups() if g is not None)
 430         elif default is not _NO_DEFAULT:
 431             return default
 432         elif fatal:
 433             raise RegexNotFoundError('Unable to extract %s' % _name)
 434         else:
 435             self._downloader.report_warning('unable to extract %s; '
 436                 'please report this issue on http://yt-dl.org/bug' % _name)
 437             return None
 438
 439     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 440         """
 441         Like _search_regex, but strips HTML tags and unescapes entities.
 442         """
 443         res = self._search_regex(pattern, string, name, default, fatal, flags)
 444         if res:
 445             return clean_html(res).strip()
 446         else:
 447             return res
 448
 449     def _get_login_info(self):
 450         """
 451         Get the the login info as (username, password)
 452         It will look in the netrc file using the _NETRC_MACHINE value
 453         If there's no info available, return (None, None)
 454         """
 455         if self._downloader is None:
 456             return (None, None)
 457
 458         username = None
 459         password = None
 460         downloader_params = self._downloader.params
 461
 462         # Attempt to use provided username and password or .netrc data
 463         if downloader_params.get('username', None) is not None:
 464             username = downloader_params['username']
 465             password = downloader_params['password']
 466         elif downloader_params.get('usenetrc', False):
 467             try:
 468                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 469                 if info is not None:
 470                     username = info[0]
 471                     password = info[2]
 472                 else:
 473                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 474             except (IOError, netrc.NetrcParseError) as err:
 475                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 476
 477         return (username, password)
 478
 479     def _get_tfa_info(self):
 480         """
 481         Get the two-factor authentication info
 482         TODO - asking the user will be required for sms/phone verify
 483         currently just uses the command line option
 484         If there's no info available, return None
 485         """
 486         if self._downloader is None:
 487             return None
 488         downloader_params = self._downloader.params
 489
 490         if downloader_params.get('twofactor', None) is not None:
 491             return downloader_params['twofactor']
 492
 493         return None
 494
 495     # Helper functions for extracting OpenGraph info
 496     @staticmethod
 497     def _og_regexes(prop):
 498         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 499         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 500         template = r'<meta[^>]+?%s[^>]+?%s'
 501         return [
 502             template % (property_re, content_re),
 503             template % (content_re, property_re),
 504         ]
 505
 506     def _og_search_property(self, prop, html, name=None, **kargs):
 507         if name is None:
 508             name = 'OpenGraph %s' % prop
 509         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 510         if escaped is None:
 511             return None
 512         return unescapeHTML(escaped)
 513
 514     def _og_search_thumbnail(self, html, **kargs):
 515         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 516
 517     def _og_search_description(self, html, **kargs):
 518         return self._og_search_property('description', html, fatal=False, **kargs)
 519
 520     def _og_search_title(self, html, **kargs):
 521         return self._og_search_property('title', html, **kargs)
 522
 523     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 524         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 525         if secure:
 526             regexes = self._og_regexes('video:secure_url') + regexes
 527         return self._html_search_regex(regexes, html, name, **kargs)
 528
 529     def _og_search_url(self, html, **kargs):
 530         return self._og_search_property('url', html, **kargs)
 531
 532     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 533         if display_name is None:
 534             display_name = name
 535         return self._html_search_regex(
 536             r'''(?ix)<meta
 537                     (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
 538                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 539             html, display_name, fatal=fatal, **kwargs)
 540
 541     def _dc_search_uploader(self, html):
 542         return self._html_search_meta('dc.creator', html, 'uploader')
 543
 544     def _rta_search(self, html):
 545         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 546         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 547                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 548                      html):
 549             return 18
 550         return 0
 551
 552     def _media_rating_search(self, html):
 553         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 554         rating = self._html_search_meta('rating', html)
 555
 556         if not rating:
 557             return None
 558
 559         RATING_TABLE = {
 560             'safe for kids': 0,
 561             'general': 8,
 562             '14 years': 14,
 563             'mature': 17,
 564             'restricted': 19,
 565         }
 566         return RATING_TABLE.get(rating.lower(), None)
 567
 568     def _twitter_search_player(self, html):
 569         return self._html_search_meta('twitter:player', html,
 570             'twitter card player')
 571
 572     def _sort_formats(self, formats):
 573         if not formats:
 574             raise ExtractorError('No video formats found')
 575
 576         def _formats_key(f):
 577             # TODO remove the following workaround
 578             from ..utils import determine_ext
 579             if not f.get('ext') and 'url' in f:
 580                 f['ext'] = determine_ext(f['url'])
 581
 582             preference = f.get('preference')
 583             if preference is None:
 584                 proto = f.get('protocol')
 585                 if proto is None:
 586                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 587
 588                 preference = 0 if proto in ['http', 'https'] else -0.1
 589                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 590                     preference -= 0.5
 591
 592             if f.get('vcodec') == 'none':  # audio only
 593                 if self._downloader.params.get('prefer_free_formats'):
 594                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 595                 else:
 596                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 597                 ext_preference = 0
 598                 try:
 599                     audio_ext_preference = ORDER.index(f['ext'])
 600                 except ValueError:
 601                     audio_ext_preference = -1
 602             else:
 603                 if self._downloader.params.get('prefer_free_formats'):
 604                     ORDER = ['flv', 'mp4', 'webm']
 605                 else:
 606                     ORDER = ['webm', 'flv', 'mp4']
 607                 try:
 608                     ext_preference = ORDER.index(f['ext'])
 609                 except ValueError:
 610                     ext_preference = -1
 611                 audio_ext_preference = 0
 612
 613             return (
 614                 preference,
 615                 f.get('quality') if f.get('quality') is not None else -1,
 616                 f.get('height') if f.get('height') is not None else -1,
 617                 f.get('width') if f.get('width') is not None else -1,
 618                 ext_preference,
 619                 f.get('tbr') if f.get('tbr') is not None else -1,
 620                 f.get('vbr') if f.get('vbr') is not None else -1,
 621                 f.get('abr') if f.get('abr') is not None else -1,
 622                 audio_ext_preference,
 623                 f.get('fps') if f.get('fps') is not None else -1,
 624                 f.get('filesize') if f.get('filesize') is not None else -1,
 625                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 626                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 627                 f.get('format_id'),
 628             )
 629         formats.sort(key=_formats_key)
 630
 631     def http_scheme(self):
 632         """ Either "http:" or "https:", depending on the user's preferences """
 633         return (
 634             'http:'
 635             if self._downloader.params.get('prefer_insecure', False)
 636             else 'https:')
 637
 638     def _proto_relative_url(self, url, scheme=None):
 639         if url is None:
 640             return url
 641         if url.startswith('//'):
 642             if scheme is None:
 643                 scheme = self.http_scheme()
 644             return scheme + url
 645         else:
 646             return url
 647
 648     def _sleep(self, timeout, video_id, msg_template=None):
 649         if msg_template is None:
 650             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 651         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 652         self.to_screen(msg)
 653         time.sleep(timeout)
 654
 655     def _extract_f4m_formats(self, manifest_url, video_id):
 656         manifest = self._download_xml(
 657             manifest_url, video_id, 'Downloading f4m manifest',
 658             'Unable to download f4m manifest')
 659
 660         formats = []
 661         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 662         for i, media_el in enumerate(media_nodes):
 663             tbr = int_or_none(media_el.attrib.get('bitrate'))
 664             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 665             formats.append({
 666                 'format_id': format_id,
 667                 'url': manifest_url,
 668                 'ext': 'flv',
 669                 'tbr': tbr,
 670                 'width': int_or_none(media_el.attrib.get('width')),
 671                 'height': int_or_none(media_el.attrib.get('height')),
 672             })
 673         self._sort_formats(formats)
 674
 675         return formats
 676
 677     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 678                               entry_protocol='m3u8', preference=None):
 679
 680         formats = [{
 681             'format_id': 'm3u8-meta',
 682             'url': m3u8_url,
 683             'ext': ext,
 684             'protocol': 'm3u8',
 685             'preference': -1,
 686             'resolution': 'multiple',
 687             'format_note': 'Quality selection URL',
 688         }]
 689
 690         format_url = lambda u: (
 691             u
 692             if re.match(r'^https?://', u)
 693             else compat_urlparse.urljoin(m3u8_url, u))
 694
 695         m3u8_doc = self._download_webpage(
 696             m3u8_url, video_id,
 697             note='Downloading m3u8 information',
 698             errnote='Failed to download m3u8 information')
 699         last_info = None
 700         kv_rex = re.compile(
 701             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 702         for line in m3u8_doc.splitlines():
 703             if line.startswith('#EXT-X-STREAM-INF:'):
 704                 last_info = {}
 705                 for m in kv_rex.finditer(line):
 706                     v = m.group('val')
 707                     if v.startswith('"'):
 708                         v = v[1:-1]
 709                     last_info[m.group('key')] = v
 710             elif line.startswith('#') or not line.strip():
 711                 continue
 712             else:
 713                 if last_info is None:
 714                     formats.append({'url': format_url(line)})
 715                     continue
 716                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 717
 718                 f = {
 719                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 720                     'url': format_url(line.strip()),
 721                     'tbr': tbr,
 722                     'ext': ext,
 723                     'protocol': entry_protocol,
 724                     'preference': preference,
 725                 }
 726                 codecs = last_info.get('CODECS')
 727                 if codecs:
 728                     # TODO: looks like video codec is not always necessarily goes first
 729                     va_codecs = codecs.split(',')
 730                     if va_codecs[0]:
 731                         f['vcodec'] = va_codecs[0].partition('.')[0]
 732                     if len(va_codecs) > 1 and va_codecs[1]:
 733                         f['acodec'] = va_codecs[1].partition('.')[0]
 734                 resolution = last_info.get('RESOLUTION')
 735                 if resolution:
 736                     width_str, height_str = resolution.split('x')
 737                     f['width'] = int(width_str)
 738                     f['height'] = int(height_str)
 739                 formats.append(f)
 740                 last_info = {}
 741         self._sort_formats(formats)
 742         return formats
 743
 744     def _live_title(self, name):
 745         """ Generate the title for a live video """
 746         now = datetime.datetime.now()
 747         now_str = now.strftime("%Y-%m-%d %H:%M")
 748         return name + ' ' + now_str
 749
 750     def _int(self, v, name, fatal=False, **kwargs):
 751         res = int_or_none(v, **kwargs)
 752         if 'get_attr' in kwargs:
 753             print(getattr(v, kwargs['get_attr']))
 754         if res is None:
 755             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 756             if fatal:
 757                 raise ExtractorError(msg)
 758             else:
 759                 self._downloader.report_warning(msg)
 760         return res
 761
 762     def _float(self, v, name, fatal=False, **kwargs):
 763         res = float_or_none(v, **kwargs)
 764         if res is None:
 765             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 766             if fatal:
 767                 raise ExtractorError(msg)
 768             else:
 769                 self._downloader.report_warning(msg)
 770         return res
 771
 772
 773 class SearchInfoExtractor(InfoExtractor):
 774     """
 775     Base class for paged search queries extractors.
 776     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 777     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 778     """
 779
 780     @classmethod
 781     def _make_valid_url(cls):
 782         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 783
 784     @classmethod
 785     def suitable(cls, url):
 786         return re.match(cls._make_valid_url(), url) is not None
 787
 788     def _real_extract(self, query):
 789         mobj = re.match(self._make_valid_url(), query)
 790         if mobj is None:
 791             raise ExtractorError('Invalid search query "%s"' % query)
 792
 793         prefix = mobj.group('prefix')
 794         query = mobj.group('query')
 795         if prefix == '':
 796             return self._get_n_results(query, 1)
 797         elif prefix == 'all':
 798             return self._get_n_results(query, self._MAX_RESULTS)
 799         else:
 800             n = int(prefix)
 801             if n <= 0:
 802                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 803             elif n > self._MAX_RESULTS:
 804                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 805                 n = self._MAX_RESULTS
 806             return self._get_n_results(query, n)
 807
 808     def _get_n_results(self, query, n):
 809         """Get a specified number of results for a query"""
 810         raise NotImplementedError("This method must be implemented by subclasses")
 811
 812     @property
 813     def SEARCH_KEY(self):
 814         return self._SEARCH_KEY