_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7 import xml.etree.ElementTree
   8
   9 from ..utils import (
  10     compat_http_client,
  11     compat_urllib_error,
  12     compat_str,
  13
  14     clean_html,
  15     compiled_regex_type,
  16     ExtractorError,
  17     RegexNotFoundError,
  18     sanitize_filename,
  19     unescapeHTML,
  20 )
  21 _NO_DEFAULT = object()
  22
  23
  24 class InfoExtractor(object):
  25     """Information Extractor class.
  26
  27     Information extractors are the classes that, given a URL, extract
  28     information about the video (or videos) the URL refers to. This
  29     information includes the real video URL, the video title, author and
  30     others. The information is stored in a dictionary which is then
  31     passed to the FileDownloader. The FileDownloader processes this
  32     information possibly downloading the video to the file system, among
  33     other possible outcomes.
  34
  35     The dictionaries must include the following fields:
  36
  37     id:             Video identifier.
  38     title:          Video title, unescaped.
  39
  40     Additionally, it must contain either a formats entry or url and ext:
  41
  42     formats:        A list of dictionaries for each format available, it must
  43                     be ordered from worst to best quality. Potential fields:
  44                     * url        Mandatory. The URL of the video file
  45                     * ext        Will be calculated from url if missing
  46                     * format     A human-readable description of the format
  47                                  ("mp4 container with h264/opus").
  48                                  Calculated from the format_id, width, height.
  49                                  and format_note fields if missing.
  50                     * format_id  A short description of the format
  51                                  ("mp4_h264_opus" or "19")
  52                     * format_note Additional info about the format
  53                                  ("3D" or "DASH video")
  54                     * width      Width of the video, if known
  55                     * height     Height of the video, if known
  56                     * abr        Average audio bitrate in KBit/s
  57                     * acodec     Name of the audio codec in use
  58                     * vbr        Average video bitrate in KBit/s
  59                     * vcodec     Name of the video codec in use
  60                     * filesize   The number of bytes, if known in advance
  61                     * player_url SWF Player URL (used for rtmpdump).
  62     url:            Final video URL.
  63     ext:            Video filename extension.
  64     format:         The video format, defaults to ext (used for --get-format)
  65     player_url:     SWF Player URL (used for rtmpdump).
  66     urlhandle:      [internal] The urlHandle to be used to download the file,
  67                     like returned by urllib.request.urlopen
  68
  69     The following fields are optional:
  70
  71     thumbnails:     A list of dictionaries (with the entries "resolution" and
  72                     "url") for the varying thumbnails
  73     thumbnail:      Full URL to a video thumbnail image.
  74     description:    One-line video description.
  75     uploader:       Full name of the video uploader.
  76     upload_date:    Video upload date (YYYYMMDD).
  77     uploader_id:    Nickname or id of the video uploader.
  78     location:       Physical location of the video.
  79     subtitles:      The subtitle file contents as a dictionary in the format
  80                     {language: subtitles}.
  81     duration:       Length of the video in seconds, as an integer.
  82     view_count:     How many users have watched the video on the platform.
  83     like_count:     Number of positive ratings of the video
  84     dislike_count:  Number of negative ratings of the video
  85     comment_count:  Number of comments on the video
  86     age_limit:      Age restriction for the video, as an integer (years)
  87     webpage_url:    The url to the video webpage, if given to youtube-dl it
  88                     should allow to get the same result again. (It will be set
  89                     by YoutubeDL if it's missing)
  90
  91     Unless mentioned otherwise, the fields should be Unicode strings.
  92
  93     Subclasses of this one should re-define the _real_initialize() and
  94     _real_extract() methods and define a _VALID_URL regexp.
  95     Probably, they should also be added to the list of extractors.
  96
  97     _real_extract() must return a *list* of information dictionaries as
  98     described above.
  99
 100     Finally, the _WORKING attribute should be set to False for broken IEs
 101     in order to warn the users and skip the tests.
 102     """
 103
 104     _ready = False
 105     _downloader = None
 106     _WORKING = True
 107
 108     def __init__(self, downloader=None):
 109         """Constructor. Receives an optional downloader."""
 110         self._ready = False
 111         self.set_downloader(downloader)
 112
 113     @classmethod
 114     def suitable(cls, url):
 115         """Receives a URL and returns True if suitable for this IE."""
 116
 117         # This does not use has/getattr intentionally - we want to know whether
 118         # we have cached the regexp for *this* class, whereas getattr would also
 119         # match the superclass
 120         if '_VALID_URL_RE' not in cls.__dict__:
 121             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 122         return cls._VALID_URL_RE.match(url) is not None
 123
 124     @classmethod
 125     def working(cls):
 126         """Getter method for _WORKING."""
 127         return cls._WORKING
 128
 129     def initialize(self):
 130         """Initializes an instance (authentication, etc)."""
 131         if not self._ready:
 132             self._real_initialize()
 133             self._ready = True
 134
 135     def extract(self, url):
 136         """Extracts URL information and returns it in list of dicts."""
 137         self.initialize()
 138         return self._real_extract(url)
 139
 140     def set_downloader(self, downloader):
 141         """Sets the downloader for this IE."""
 142         self._downloader = downloader
 143
 144     def _real_initialize(self):
 145         """Real initialization process. Redefine in subclasses."""
 146         pass
 147
 148     def _real_extract(self, url):
 149         """Real extraction process. Redefine in subclasses."""
 150         pass
 151
 152     @classmethod
 153     def ie_key(cls):
 154         """A string for getting the InfoExtractor with get_info_extractor"""
 155         return cls.__name__[:-2]
 156
 157     @property
 158     def IE_NAME(self):
 159         return type(self).__name__[:-2]
 160
 161     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 162         """ Returns the response handle """
 163         if note is None:
 164             self.report_download_webpage(video_id)
 165         elif note is not False:
 166             if video_id is None:
 167                 self.to_screen(u'%s' % (note,))
 168             else:
 169                 self.to_screen(u'%s: %s' % (video_id, note))
 170         try:
 171             return self._downloader.urlopen(url_or_request)
 172         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 173             if errnote is False:
 174                 return False
 175             if errnote is None:
 176                 errnote = u'Unable to download webpage'
 177             errmsg = u'%s: %s' % (errnote, compat_str(err))
 178             if fatal:
 179                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 180             else:
 181                 self._downloader.report_warning(errmsg)
 182                 return False
 183
 184     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 185         """ Returns a tuple (page content as string, URL handle) """
 186
 187         # Strip hashes from the URL (#1038)
 188         if isinstance(url_or_request, (compat_str, str)):
 189             url_or_request = url_or_request.partition('#')[0]
 190
 191         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 192         if urlh is False:
 193             assert not fatal
 194             return False
 195         content_type = urlh.headers.get('Content-Type', '')
 196         webpage_bytes = urlh.read()
 197         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 198         if m:
 199             encoding = m.group(1)
 200         else:
 201             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 202                           webpage_bytes[:1024])
 203             if m:
 204                 encoding = m.group(1).decode('ascii')
 205             else:
 206                 encoding = 'utf-8'
 207         if self._downloader.params.get('dump_intermediate_pages', False):
 208             try:
 209                 url = url_or_request.get_full_url()
 210             except AttributeError:
 211                 url = url_or_request
 212             self.to_screen(u'Dumping request to ' + url)
 213             dump = base64.b64encode(webpage_bytes).decode('ascii')
 214             self._downloader.to_screen(dump)
 215         if self._downloader.params.get('write_pages', False):
 216             try:
 217                 url = url_or_request.get_full_url()
 218             except AttributeError:
 219                 url = url_or_request
 220             raw_filename = ('%s_%s.dump' % (video_id, url))
 221             filename = sanitize_filename(raw_filename, restricted=True)
 222             self.to_screen(u'Saving request to ' + filename)
 223             with open(filename, 'wb') as outf:
 224                 outf.write(webpage_bytes)
 225
 226         content = webpage_bytes.decode(encoding, 'replace')
 227         return (content, urlh)
 228
 229     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 230         """ Returns the data of the page as a string """
 231         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 232         if res is False:
 233             return res
 234         else:
 235             content, _ = res
 236             return content
 237
 238     def _download_xml(self, url_or_request, video_id,
 239                       note=u'Downloading XML', errnote=u'Unable to download XML',
 240                       transform_source=None):
 241         """Return the xml as an xml.etree.ElementTree.Element"""
 242         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
 243         if transform_source:
 244             xml_string = transform_source(xml_string)
 245         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 246
 247     def to_screen(self, msg):
 248         """Print msg to screen, prefixing it with '[ie_name]'"""
 249         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 250
 251     def report_extraction(self, id_or_name):
 252         """Report information extraction."""
 253         self.to_screen(u'%s: Extracting information' % id_or_name)
 254
 255     def report_download_webpage(self, video_id):
 256         """Report webpage download."""
 257         self.to_screen(u'%s: Downloading webpage' % video_id)
 258
 259     def report_age_confirmation(self):
 260         """Report attempt to confirm age."""
 261         self.to_screen(u'Confirming age')
 262
 263     def report_login(self):
 264         """Report attempt to log in."""
 265         self.to_screen(u'Logging in')
 266
 267     #Methods for following #608
 268     @staticmethod
 269     def url_result(url, ie=None, video_id=None):
 270         """Returns a url that points to a page that should be processed"""
 271         #TODO: ie should be the class used for getting the info
 272         video_info = {'_type': 'url',
 273                       'url': url,
 274                       'ie_key': ie}
 275         if video_id is not None:
 276             video_info['id'] = video_id
 277         return video_info
 278     @staticmethod
 279     def playlist_result(entries, playlist_id=None, playlist_title=None):
 280         """Returns a playlist"""
 281         video_info = {'_type': 'playlist',
 282                       'entries': entries}
 283         if playlist_id:
 284             video_info['id'] = playlist_id
 285         if playlist_title:
 286             video_info['title'] = playlist_title
 287         return video_info
 288
 289     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 290         """
 291         Perform a regex search on the given string, using a single or a list of
 292         patterns returning the first matching group.
 293         In case of failure return a default value or raise a WARNING or a
 294         RegexNotFoundError, depending on fatal, specifying the field name.
 295         """
 296         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 297             mobj = re.search(pattern, string, flags)
 298         else:
 299             for p in pattern:
 300                 mobj = re.search(p, string, flags)
 301                 if mobj: break
 302
 303         if os.name != 'nt' and sys.stderr.isatty():
 304             _name = u'\033[0;34m%s\033[0m' % name
 305         else:
 306             _name = name
 307
 308         if mobj:
 309             # return the first matching group
 310             return next(g for g in mobj.groups() if g is not None)
 311         elif default is not _NO_DEFAULT:
 312             return default
 313         elif fatal:
 314             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 315         else:
 316             self._downloader.report_warning(u'unable to extract %s; '
 317                 u'please report this issue on http://yt-dl.org/bug' % _name)
 318             return None
 319
 320     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 321         """
 322         Like _search_regex, but strips HTML tags and unescapes entities.
 323         """
 324         res = self._search_regex(pattern, string, name, default, fatal, flags)
 325         if res:
 326             return clean_html(res).strip()
 327         else:
 328             return res
 329
 330     def _get_login_info(self):
 331         """
 332         Get the the login info as (username, password)
 333         It will look in the netrc file using the _NETRC_MACHINE value
 334         If there's no info available, return (None, None)
 335         """
 336         if self._downloader is None:
 337             return (None, None)
 338
 339         username = None
 340         password = None
 341         downloader_params = self._downloader.params
 342
 343         # Attempt to use provided username and password or .netrc data
 344         if downloader_params.get('username', None) is not None:
 345             username = downloader_params['username']
 346             password = downloader_params['password']
 347         elif downloader_params.get('usenetrc', False):
 348             try:
 349                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 350                 if info is not None:
 351                     username = info[0]
 352                     password = info[2]
 353                 else:
 354                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 355             except (IOError, netrc.NetrcParseError) as err:
 356                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 357
 358         return (username, password)
 359
 360     # Helper functions for extracting OpenGraph info
 361     @staticmethod
 362     def _og_regexes(prop):
 363         content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
 364         property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
 365         template = r'<meta[^>]+?%s[^>]+?%s'
 366         return [
 367             template % (property_re, content_re),
 368             template % (content_re, property_re),
 369         ]
 370
 371     def _og_search_property(self, prop, html, name=None, **kargs):
 372         if name is None:
 373             name = 'OpenGraph %s' % prop
 374         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 375         if escaped is None:
 376             return None
 377         return unescapeHTML(escaped)
 378
 379     def _og_search_thumbnail(self, html, **kargs):
 380         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 381
 382     def _og_search_description(self, html, **kargs):
 383         return self._og_search_property('description', html, fatal=False, **kargs)
 384
 385     def _og_search_title(self, html, **kargs):
 386         return self._og_search_property('title', html, **kargs)
 387
 388     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 389         regexes = self._og_regexes('video')
 390         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 391         return self._html_search_regex(regexes, html, name, **kargs)
 392
 393     def _html_search_meta(self, name, html, display_name=None):
 394         if display_name is None:
 395             display_name = name
 396         return self._html_search_regex(
 397             r'''(?ix)<meta
 398                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 399                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 400             html, display_name, fatal=False)
 401
 402     def _dc_search_uploader(self, html):
 403         return self._html_search_meta('dc.creator', html, 'uploader')
 404
 405     def _rta_search(self, html):
 406         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 407         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 408                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 409                      html):
 410             return 18
 411         return 0
 412
 413     def _media_rating_search(self, html):
 414         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 415         rating = self._html_search_meta('rating', html)
 416
 417         if not rating:
 418             return None
 419
 420         RATING_TABLE = {
 421             'safe for kids': 0,
 422             'general': 8,
 423             '14 years': 14,
 424             'mature': 17,
 425             'restricted': 19,
 426         }
 427         return RATING_TABLE.get(rating.lower(), None)
 428
 429
 430
 431 class SearchInfoExtractor(InfoExtractor):
 432     """
 433     Base class for paged search queries extractors.
 434     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 435     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 436     """
 437
 438     @classmethod
 439     def _make_valid_url(cls):
 440         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 441
 442     @classmethod
 443     def suitable(cls, url):
 444         return re.match(cls._make_valid_url(), url) is not None
 445
 446     def _real_extract(self, query):
 447         mobj = re.match(self._make_valid_url(), query)
 448         if mobj is None:
 449             raise ExtractorError(u'Invalid search query "%s"' % query)
 450
 451         prefix = mobj.group('prefix')
 452         query = mobj.group('query')
 453         if prefix == '':
 454             return self._get_n_results(query, 1)
 455         elif prefix == 'all':
 456             return self._get_n_results(query, self._MAX_RESULTS)
 457         else:
 458             n = int(prefix)
 459             if n <= 0:
 460                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 461             elif n > self._MAX_RESULTS:
 462                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 463                 n = self._MAX_RESULTS
 464             return self._get_n_results(query, n)
 465
 466     def _get_n_results(self, query, n):
 467         """Get a specified number of results for a query"""
 468         raise NotImplementedError("This method must be implemented by subclasses")
 469
 470     @property
 471     def SEARCH_KEY(self):
 472         return self._SEARCH_KEY