_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import hashlib
   3 import json
   4 import os
   5 import re
   6 import socket
   7 import sys
   8 import netrc
   9 import xml.etree.ElementTree
  10
  11 from ..utils import (
  12     compat_http_client,
  13     compat_urllib_error,
  14     compat_urllib_parse_urlparse,
  15     compat_str,
  16
  17     clean_html,
  18     compiled_regex_type,
  19     ExtractorError,
  20     RegexNotFoundError,
  21     sanitize_filename,
  22     unescapeHTML,
  23 )
# Sentinel meaning "the caller supplied no default".  Using a unique object
# instead of None lets None itself be passed as an explicit default (see the
# ``default is not _NO_DEFAULT`` test in InfoExtractor._search_regex).
_NO_DEFAULT = object()
  25
  26
  27 class InfoExtractor(object):
  28     """Information Extractor class.
  29
  30     Information extractors are the classes that, given a URL, extract
  31     information about the video (or videos) the URL refers to. This
  32     information includes the real video URL, the video title, author and
  33     others. The information is stored in a dictionary which is then
  34     passed to the FileDownloader. The FileDownloader processes this
  35     information possibly downloading the video to the file system, among
  36     other possible outcomes.
  37
  38     The dictionaries must include the following fields:
  39
  40     id:             Video identifier.
  41     title:          Video title, unescaped.
  42
  43     Additionally, it must contain either a formats entry or a url one:
  44
  45     formats:        A list of dictionaries for each format available, ordered
  46                     from worst to best quality.
  47
  48                     Potential fields:
  49                     * url        Mandatory. The URL of the video file
  50                     * ext        Will be calculated from url if missing
  51                     * format     A human-readable description of the format
  52                                  ("mp4 container with h264/opus").
   53                                  Calculated from the format_id, width, height,
  54                                  and format_note fields if missing.
  55                     * format_id  A short description of the format
  56                                  ("mp4_h264_opus" or "19").
  57                                 Technically optional, but strongly recommended.
  58                     * format_note Additional info about the format
  59                                  ("3D" or "DASH video")
  60                     * width      Width of the video, if known
  61                     * height     Height of the video, if known
  62                     * resolution Textual description of width and height
  63                     * tbr        Average bitrate of audio and video in KBit/s
  64                     * abr        Average audio bitrate in KBit/s
  65                     * acodec     Name of the audio codec in use
  66                     * asr        Audio sampling rate in Hertz
  67                     * vbr        Average video bitrate in KBit/s
  68                     * vcodec     Name of the video codec in use
  69                     * container  Name of the container format
  70                     * filesize   The number of bytes, if known in advance
  71                     * player_url SWF Player URL (used for rtmpdump).
  72                     * protocol   The protocol that will be used for the actual
  73                                  download, lower-case.
  74                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  75                     * preference Order number of this format. If this field is
  76                                  present and not None, the formats get sorted
  77                                  by this field, regardless of all other values.
  78                                  -1 for default (order by other properties),
  79                                  -2 or smaller for less than default.
  80                     * quality    Order number of the video quality of this
  81                                  format, irrespective of the file format.
  82                                  -1 for default (order by other properties),
  83                                  -2 or smaller for less than default.
  84     url:            Final video URL.
  85     ext:            Video filename extension.
  86     format:         The video format, defaults to ext (used for --get-format)
  87     player_url:     SWF Player URL (used for rtmpdump).
  88
  89     The following fields are optional:
  90
  91     display_id      An alternative identifier for the video, not necessarily
  92                     unique, but available before title. Typically, id is
  93                     something like "4234987", title "Dancing naked mole rats",
  94                     and display_id "dancing-naked-mole-rats"
  95     thumbnails:     A list of dictionaries, with the following entries:
  96                         * "url"
  97                         * "width" (optional, int)
  98                         * "height" (optional, int)
   99                         * "resolution" (optional, string "{width}x{height}",
 100                                         deprecated)
 101     thumbnail:      Full URL to a video thumbnail image.
 102     description:    One-line video description.
 103     uploader:       Full name of the video uploader.
 104     timestamp:      UNIX timestamp of the moment the video became available.
 105     upload_date:    Video upload date (YYYYMMDD).
 106                     If not explicitly set, calculated from timestamp.
 107     uploader_id:    Nickname or id of the video uploader.
 108     location:       Physical location of the video.
 109     subtitles:      The subtitle file contents as a dictionary in the format
 110                     {language: subtitles}.
 111     duration:       Length of the video in seconds, as an integer.
 112     view_count:     How many users have watched the video on the platform.
 113     like_count:     Number of positive ratings of the video
 114     dislike_count:  Number of negative ratings of the video
 115     comment_count:  Number of comments on the video
 116     age_limit:      Age restriction for the video, as an integer (years)
 117     webpage_url:    The url to the video webpage, if given to youtube-dl it
 118                     should allow to get the same result again. (It will be set
 119                     by YoutubeDL if it's missing)
 120     categories:     A list of categories that the video falls in, for example
 121                     ["Sports", "Berlin"]
 122
 123     Unless mentioned otherwise, the fields should be Unicode strings.
 124
 125     Subclasses of this one should re-define the _real_initialize() and
 126     _real_extract() methods and define a _VALID_URL regexp.
 127     Probably, they should also be added to the list of extractors.
 128
 129     Finally, the _WORKING attribute should be set to False for broken IEs
 130     in order to warn the users and skip the tests.
 131     """
 132
 133     _ready = False
 134     _downloader = None
 135     _WORKING = True
 136
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        Marks the instance as not yet initialized and stores the downloader
        through set_downloader().
        """
        self._ready = False
        self.set_downloader(downloader)
 141
 142     @classmethod
 143     def suitable(cls, url):
 144         """Receives a URL and returns True if suitable for this IE."""
 145
 146         # This does not use has/getattr intentionally - we want to know whether
 147         # we have cached the regexp for *this* class, whereas getattr would also
 148         # match the superclass
 149         if '_VALID_URL_RE' not in cls.__dict__:
 150             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 151         return cls._VALID_URL_RE.match(url) is not None
 152
    @classmethod
    def working(cls):
        """Getter method for _WORKING.

        Returns the class attribute unchanged; it is True on this base class.
        """
        return cls._WORKING
 157
 158     def initialize(self):
 159         """Initializes an instance (authentication, etc)."""
 160         if not self._ready:
 161             self._real_initialize()
 162             self._ready = True
 163
    def extract(self, url):
        """Extract the information for the given URL.

        Makes sure the instance is initialized (see initialize()) and then
        returns whatever _real_extract(url) returns.
        """
        self.initialize()
        return self._real_extract(url)
 168
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        The object is stored as self._downloader; the other methods of this
        class use it for network access, options (params) and output.
        """
        self._downloader = downloader
 172
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses.

        Called by initialize() the first time it runs.  This default
        implementation does nothing.
        """
        pass
 176
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Called by extract() with the URL to process; its return value is
        handed back to the caller of extract().  This default implementation
        does nothing and therefore returns None.
        """
        pass
 180
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # The class name minus its last two characters -- presumably the "IE"
        # suffix that extractor class names end with (naming convention not
        # visible here; TODO confirm).
        return cls.__name__[:-2]
 185
    @property
    def IE_NAME(self):
        """Name used as the prefix of this IE's messages.

        Computed as the name of the instance's class without its last two
        characters, i.e. the same value as the default ie_key().
        """
        return type(self).__name__[:-2]
 189
 190     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 191         """ Returns the response handle """
 192         if note is None:
 193             self.report_download_webpage(video_id)
 194         elif note is not False:
 195             if video_id is None:
 196                 self.to_screen(u'%s' % (note,))
 197             else:
 198                 self.to_screen(u'%s: %s' % (video_id, note))
 199         try:
 200             return self._downloader.urlopen(url_or_request)
 201         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 202             if errnote is False:
 203                 return False
 204             if errnote is None:
 205                 errnote = u'Unable to download webpage'
 206             errmsg = u'%s: %s' % (errnote, compat_str(err))
 207             if fatal:
 208                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 209             else:
 210                 self._downloader.report_warning(errmsg)
 211                 return False
 212
 213     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 214         """ Returns a tuple (page content as string, URL handle) """
 215
 216         # Strip hashes from the URL (#1038)
 217         if isinstance(url_or_request, (compat_str, str)):
 218             url_or_request = url_or_request.partition('#')[0]
 219
 220         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 221         if urlh is False:
 222             assert not fatal
 223             return False
 224         content_type = urlh.headers.get('Content-Type', '')
 225         webpage_bytes = urlh.read()
 226         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 227         if m:
 228             encoding = m.group(1)
 229         else:
 230             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 231                           webpage_bytes[:1024])
 232             if m:
 233                 encoding = m.group(1).decode('ascii')
 234             elif webpage_bytes.startswith(b'\xff\xfe'):
 235                 encoding = 'utf-16'
 236             else:
 237                 encoding = 'utf-8'
 238         if self._downloader.params.get('dump_intermediate_pages', False):
 239             try:
 240                 url = url_or_request.get_full_url()
 241             except AttributeError:
 242                 url = url_or_request
 243             self.to_screen(u'Dumping request to ' + url)
 244             dump = base64.b64encode(webpage_bytes).decode('ascii')
 245             self._downloader.to_screen(dump)
 246         if self._downloader.params.get('write_pages', False):
 247             try:
 248                 url = url_or_request.get_full_url()
 249             except AttributeError:
 250                 url = url_or_request
 251             basen = '%s_%s' % (video_id, url)
 252             if len(basen) > 240:
 253                 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 254                 basen = basen[:240 - len(h)] + h
 255             raw_filename = basen + '.dump'
 256             filename = sanitize_filename(raw_filename, restricted=True)
 257             self.to_screen(u'Saving request to ' + filename)
 258             with open(filename, 'wb') as outf:
 259                 outf.write(webpage_bytes)
 260
 261         try:
 262             content = webpage_bytes.decode(encoding, 'replace')
 263         except LookupError:
 264             content = webpage_bytes.decode('utf-8', 'replace')
 265
 266         if (u'<title>Access to this site is blocked</title>' in content and
 267                 u'Websense' in content[:512]):
 268             msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
 269             blocked_iframe = self._html_search_regex(
 270                 r'<iframe src="([^"]+)"', content,
 271                 u'Websense information URL', default=None)
 272             if blocked_iframe:
 273                 msg += u' Visit %s for more details' % blocked_iframe
 274             raise ExtractorError(msg, expected=True)
 275
 276         return (content, urlh)
 277
 278     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 279         """ Returns the data of the page as a string """
 280         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 281         if res is False:
 282             return res
 283         else:
 284             content, _ = res
 285             return content
 286
 287     def _download_xml(self, url_or_request, video_id,
 288                       note=u'Downloading XML', errnote=u'Unable to download XML',
 289                       transform_source=None, fatal=True):
 290         """Return the xml as an xml.etree.ElementTree.Element"""
 291         xml_string = self._download_webpage(
 292             url_or_request, video_id, note, errnote, fatal=fatal)
 293         if xml_string is False:
 294             return xml_string
 295         if transform_source:
 296             xml_string = transform_source(xml_string)
 297         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 298
 299     def _download_json(self, url_or_request, video_id,
 300                        note=u'Downloading JSON metadata',
 301                        errnote=u'Unable to download JSON metadata',
 302                        transform_source=None):
 303         json_string = self._download_webpage(url_or_request, video_id, note, errnote)
 304         if transform_source:
 305             json_string = transform_source(json_string)
 306         try:
 307             return json.loads(json_string)
 308         except ValueError as ve:
 309             raise ExtractorError('Failed to download JSON', cause=ve)
 310
 311     def report_warning(self, msg, video_id=None):
 312         idstr = u'' if video_id is None else u'%s: ' % video_id
 313         self._downloader.report_warning(
 314             u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
 315
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        # Output goes through the downloader, so a downloader must be set
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 319
    def report_extraction(self, id_or_name):
        """Report information extraction.

        Prints "<id_or_name>: Extracting information" via to_screen().
        """
        self.to_screen(u'%s: Extracting information' % id_or_name)
 323
    def report_download_webpage(self, video_id):
        """Report webpage download.

        Prints "<video_id>: Downloading webpage" via to_screen().
        """
        self.to_screen(u'%s: Downloading webpage' % video_id)
 327
    def report_age_confirmation(self):
        """Report attempt to confirm age.

        Prints "Confirming age" via to_screen().
        """
        self.to_screen(u'Confirming age')
 331
    def report_login(self):
        """Report attempt to log in.

        Prints "Logging in" via to_screen().
        """
        self.to_screen(u'Logging in')
 335
 336     #Methods for following #608
 337     @staticmethod
 338     def url_result(url, ie=None, video_id=None):
 339         """Returns a url that points to a page that should be processed"""
 340         #TODO: ie should be the class used for getting the info
 341         video_info = {'_type': 'url',
 342                       'url': url,
 343                       'ie_key': ie}
 344         if video_id is not None:
 345             video_info['id'] = video_id
 346         return video_info
 347     @staticmethod
 348     def playlist_result(entries, playlist_id=None, playlist_title=None):
 349         """Returns a playlist"""
 350         video_info = {'_type': 'playlist',
 351                       'entries': entries}
 352         if playlist_id:
 353             video_info['id'] = playlist_id
 354         if playlist_title:
 355             video_info['title'] = playlist_title
 356         return video_info
 357
 358     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 359         """
 360         Perform a regex search on the given string, using a single or a list of
 361         patterns returning the first matching group.
 362         In case of failure return a default value or raise a WARNING or a
 363         RegexNotFoundError, depending on fatal, specifying the field name.
 364         """
 365         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 366             mobj = re.search(pattern, string, flags)
 367         else:
 368             for p in pattern:
 369                 mobj = re.search(p, string, flags)
 370                 if mobj: break
 371
 372         if os.name != 'nt' and sys.stderr.isatty():
 373             _name = u'\033[0;34m%s\033[0m' % name
 374         else:
 375             _name = name
 376
 377         if mobj:
 378             # return the first matching group
 379             return next(g for g in mobj.groups() if g is not None)
 380         elif default is not _NO_DEFAULT:
 381             return default
 382         elif fatal:
 383             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 384         else:
 385             self._downloader.report_warning(u'unable to extract %s; '
 386                 u'please report this issue on http://yt-dl.org/bug' % _name)
 387             return None
 388
 389     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 390         """
 391         Like _search_regex, but strips HTML tags and unescapes entities.
 392         """
 393         res = self._search_regex(pattern, string, name, default, fatal, flags)
 394         if res:
 395             return clean_html(res).strip()
 396         else:
 397             return res
 398
 399     def _get_login_info(self):
 400         """
 401         Get the the login info as (username, password)
 402         It will look in the netrc file using the _NETRC_MACHINE value
 403         If there's no info available, return (None, None)
 404         """
 405         if self._downloader is None:
 406             return (None, None)
 407
 408         username = None
 409         password = None
 410         downloader_params = self._downloader.params
 411
 412         # Attempt to use provided username and password or .netrc data
 413         if downloader_params.get('username', None) is not None:
 414             username = downloader_params['username']
 415             password = downloader_params['password']
 416         elif downloader_params.get('usenetrc', False):
 417             try:
 418                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 419                 if info is not None:
 420                     username = info[0]
 421                     password = info[2]
 422                 else:
 423                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 424             except (IOError, netrc.NetrcParseError) as err:
 425                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 426
 427         return (username, password)
 428
 429     # Helper functions for extracting OpenGraph info
 430     @staticmethod
 431     def _og_regexes(prop):
 432         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 433         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 434         template = r'<meta[^>]+?%s[^>]+?%s'
 435         return [
 436             template % (property_re, content_re),
 437             template % (content_re, property_re),
 438         ]
 439
 440     def _og_search_property(self, prop, html, name=None, **kargs):
 441         if name is None:
 442             name = 'OpenGraph %s' % prop
 443         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 444         if escaped is None:
 445             return None
 446         return unescapeHTML(escaped)
 447
    def _og_search_thumbnail(self, html, **kargs):
        """Return the OpenGraph "image" property of html.

        The search is always non-fatal (fatal=False is passed explicitly);
        the other keyword arguments are passed on to _og_search_property().
        """
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 450
    def _og_search_description(self, html, **kargs):
        """Return the OpenGraph "description" property of html.

        The search is always non-fatal (fatal=False is passed explicitly);
        the other keyword arguments are passed on to _og_search_property().
        """
        return self._og_search_property('description', html, fatal=False, **kargs)
 453
    def _og_search_title(self, html, **kargs):
        """Return the OpenGraph "title" property of html.

        Keyword arguments are passed on to _og_search_property(); unlike the
        thumbnail and description searches, no fatal=False is forced here.
        """
        return self._og_search_property('title', html, **kargs)
 456
 457     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 458         regexes = self._og_regexes('video')
 459         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 460         return self._html_search_regex(regexes, html, name, **kargs)
 461
    def _og_search_url(self, html, **kargs):
        """Return the OpenGraph "url" property of html.

        Keyword arguments are passed on to _og_search_property().
        """
        return self._og_search_property('url', html, **kargs)
 464
 465     def _html_search_meta(self, name, html, display_name=None, fatal=False):
 466         if display_name is None:
 467             display_name = name
 468         return self._html_search_regex(
 469             r'''(?ix)<meta
 470                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 471                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 472             html, display_name, fatal=fatal)
 473
    def _dc_search_uploader(self, html):
        """Return the content of the "dc.creator" <meta> tag of html.

        The field is called 'uploader' in messages; the search is non-fatal
        (the default of _html_search_meta).
        """
        return self._html_search_meta('dc.creator', html, 'uploader')
 476
 477     def _rta_search(self, html):
 478         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 479         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 480                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 481                      html):
 482             return 18
 483         return 0
 484
 485     def _media_rating_search(self, html):
 486         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 487         rating = self._html_search_meta('rating', html)
 488
 489         if not rating:
 490             return None
 491
 492         RATING_TABLE = {
 493             'safe for kids': 0,
 494             'general': 8,
 495             '14 years': 14,
 496             'mature': 17,
 497             'restricted': 19,
 498         }
 499         return RATING_TABLE.get(rating.lower(), None)
 500
    def _twitter_search_player(self, html):
        """Return the content of the "twitter:player" <meta> tag of html.

        The field is called 'twitter card player' in messages; the search is
        non-fatal (the default of _html_search_meta).
        """
        return self._html_search_meta('twitter:player', html,
            'twitter card player')
 504
 505     def _sort_formats(self, formats):
 506         if not formats:
 507             raise ExtractorError(u'No video formats found')
 508
 509         def _formats_key(f):
 510             # TODO remove the following workaround
 511             from ..utils import determine_ext
 512             if not f.get('ext') and 'url' in f:
 513                 f['ext'] = determine_ext(f['url'])
 514
 515             preference = f.get('preference')
 516             if preference is None:
 517                 proto = f.get('protocol')
 518                 if proto is None:
 519                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 520
 521                 preference = 0 if proto in ['http', 'https'] else -0.1
 522                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 523                     preference -= 0.5
 524
 525             if f.get('vcodec') == 'none':  # audio only
 526                 if self._downloader.params.get('prefer_free_formats'):
 527                     ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
 528                 else:
 529                     ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
 530                 ext_preference = 0
 531                 try:
 532                     audio_ext_preference = ORDER.index(f['ext'])
 533                 except ValueError:
 534                     audio_ext_preference = -1
 535             else:
 536                 if self._downloader.params.get('prefer_free_formats'):
 537                     ORDER = [u'flv', u'mp4', u'webm']
 538                 else:
 539                     ORDER = [u'webm', u'flv', u'mp4']
 540                 try:
 541                     ext_preference = ORDER.index(f['ext'])
 542                 except ValueError:
 543                     ext_preference = -1
 544                 audio_ext_preference = 0
 545
 546             return (
 547                 preference,
 548                 f.get('quality') if f.get('quality') is not None else -1,
 549                 f.get('height') if f.get('height') is not None else -1,
 550                 f.get('width') if f.get('width') is not None else -1,
 551                 ext_preference,
 552                 f.get('tbr') if f.get('tbr') is not None else -1,
 553                 f.get('vbr') if f.get('vbr') is not None else -1,
 554                 f.get('abr') if f.get('abr') is not None else -1,
 555                 audio_ext_preference,
 556                 f.get('filesize') if f.get('filesize') is not None else -1,
 557                 f.get('format_id'),
 558             )
 559         formats.sort(key=_formats_key)
 560
 561     def http_scheme(self):
 562         """ Either "https:" or "https:", depending on the user's preferences """
 563         return (
 564             'http:'
 565             if self._downloader.params.get('prefer_insecure', False)
 566             else 'https:')
 567
 568     def _proto_relative_url(self, url, scheme=None):
 569         if url is None:
 570             return url
 571         if url.startswith('//'):
 572             if scheme is None:
 573                 scheme = self.http_scheme()
 574             return scheme + url
 575         else:
 576             return url
 577
 578
 579 class SearchInfoExtractor(InfoExtractor):
 580     """
 581     Base class for paged search queries extractors.
 582     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 583     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 584     """
 585
 586     @classmethod
 587     def _make_valid_url(cls):
 588         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 589
 590     @classmethod
 591     def suitable(cls, url):
 592         return re.match(cls._make_valid_url(), url) is not None
 593
 594     def _real_extract(self, query):
 595         mobj = re.match(self._make_valid_url(), query)
 596         if mobj is None:
 597             raise ExtractorError(u'Invalid search query "%s"' % query)
 598
 599         prefix = mobj.group('prefix')
 600         query = mobj.group('query')
 601         if prefix == '':
 602             return self._get_n_results(query, 1)
 603         elif prefix == 'all':
 604             return self._get_n_results(query, self._MAX_RESULTS)
 605         else:
 606             n = int(prefix)
 607             if n <= 0:
 608                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 609             elif n > self._MAX_RESULTS:
 610                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 611                 n = self._MAX_RESULTS
 612             return self._get_n_results(query, n)
 613
 614     def _get_n_results(self, query, n):
 615         """Get a specified number of results for a query"""
 616         raise NotImplementedError("This method must be implemented by subclasses")
 617
 618     @property
 619     def SEARCH_KEY(self):
 620         return self._SEARCH_KEY
 621