_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7 import xml.etree.ElementTree
   8
   9 from ..utils import (
  10     compat_http_client,
  11     compat_urllib_error,
  12     compat_str,
  13
  14     clean_html,
  15     compiled_regex_type,
  16     ExtractorError,
  17     RegexNotFoundError,
  18     sanitize_filename,
  19     unescapeHTML,
  20 )
# Unique sentinel used as the "no default supplied" marker for keyword
# arguments, so that None can still be passed as a genuine default value.
_NO_DEFAULT = object()
  22
  23
  24 class InfoExtractor(object):
  25     """Information Extractor class.
  26
  27     Information extractors are the classes that, given a URL, extract
  28     information about the video (or videos) the URL refers to. This
  29     information includes the real video URL, the video title, author and
  30     others. The information is stored in a dictionary which is then
  31     passed to the FileDownloader. The FileDownloader processes this
  32     information possibly downloading the video to the file system, among
  33     other possible outcomes.
  34
  35     The dictionaries must include the following fields:
  36
  37     id:             Video identifier.
  38     title:          Video title, unescaped.
  39
  40     Additionally, it must contain either a formats entry or url and ext:
  41
  42     formats:        A list of dictionaries for each format available, it must
  43                     be ordered from worst to best quality. Potential fields:
  44                     * url        Mandatory. The URL of the video file
  45                     * ext        Will be calculated from url if missing
  46                     * format     A human-readable description of the format
  47                                  ("mp4 container with h264/opus").
  48                                  Calculated from the format_id, width, height,
  49                                  and format_note fields if missing.
  50                     * format_id  A short description of the format
  51                                  ("mp4_h264_opus" or "19")
  52                     * format_note Additional info about the format
  53                                  ("3D" or "DASH video")
  54                     * width      Width of the video, if known
  55                     * height     Height of the video, if known
  56                     * abr        Average audio bitrate in KBit/s
  57                     * acodec     Name of the audio codec in use
  58                     * vbr        Average video bitrate in KBit/s
  59                     * vcodec     Name of the video codec in use
  60                     * filesize   The number of bytes, if known in advance
  61                     * player_url SWF Player URL (used for rtmpdump).
  62     url:            Final video URL.
  63     ext:            Video filename extension.
  64     format:         The video format, defaults to ext (used for --get-format)
  65     player_url:     SWF Player URL (used for rtmpdump).
  66
  67     The following fields are optional:
  68
  69     thumbnails:     A list of dictionaries (with the entries "resolution" and
  70                     "url") for the varying thumbnails
  71     thumbnail:      Full URL to a video thumbnail image.
  72     description:    One-line video description.
  73     uploader:       Full name of the video uploader.
  74     upload_date:    Video upload date (YYYYMMDD).
  75     uploader_id:    Nickname or id of the video uploader.
  76     location:       Physical location of the video.
  77     subtitles:      The subtitle file contents as a dictionary in the format
  78                     {language: subtitles}.
  79     duration:       Length of the video in seconds, as an integer.
  80     view_count:     How many users have watched the video on the platform.
  81     like_count:     Number of positive ratings of the video
  82     dislike_count:  Number of negative ratings of the video
  83     comment_count:  Number of comments on the video
  84     age_limit:      Age restriction for the video, as an integer (years)
  85     webpage_url:    The url to the video webpage, if given to youtube-dl it
  86                     should allow to get the same result again. (It will be set
  87                     by YoutubeDL if it's missing)
  88
  89     Unless mentioned otherwise, the fields should be Unicode strings.
  90
  91     Subclasses of this one should re-define the _real_initialize() and
  92     _real_extract() methods and define a _VALID_URL regexp.
  93     Probably, they should also be added to the list of extractors.
  94
  95     _real_extract() must return a *list* of information dictionaries as
  96     described above.
  97
  98     Finally, the _WORKING attribute should be set to False for broken IEs
  99     in order to warn the users and skip the tests.
 100     """
 101
 102     _ready = False
 103     _downloader = None
 104     _WORKING = True
 105
 106     def __init__(self, downloader=None):
 107         """Constructor. Receives an optional downloader."""
 108         self._ready = False
 109         self.set_downloader(downloader)
 110
 111     @classmethod
 112     def suitable(cls, url):
 113         """Receives a URL and returns True if suitable for this IE."""
 114
 115         # This does not use has/getattr intentionally - we want to know whether
 116         # we have cached the regexp for *this* class, whereas getattr would also
 117         # match the superclass
 118         if '_VALID_URL_RE' not in cls.__dict__:
 119             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 120         return cls._VALID_URL_RE.match(url) is not None
 121
 122     @classmethod
 123     def working(cls):
 124         """Getter method for _WORKING."""
 125         return cls._WORKING
 126
 127     def initialize(self):
 128         """Initializes an instance (authentication, etc)."""
 129         if not self._ready:
 130             self._real_initialize()
 131             self._ready = True
 132
 133     def extract(self, url):
 134         """Extracts URL information and returns it in list of dicts."""
 135         self.initialize()
 136         return self._real_extract(url)
 137
 138     def set_downloader(self, downloader):
 139         """Sets the downloader for this IE."""
 140         self._downloader = downloader
 141
 142     def _real_initialize(self):
 143         """Real initialization process. Redefine in subclasses."""
 144         pass
 145
 146     def _real_extract(self, url):
 147         """Real extraction process. Redefine in subclasses."""
 148         pass
 149
 150     @classmethod
 151     def ie_key(cls):
 152         """A string for getting the InfoExtractor with get_info_extractor"""
 153         return cls.__name__[:-2]
 154
 155     @property
 156     def IE_NAME(self):
 157         return type(self).__name__[:-2]
 158
 159     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 160         """ Returns the response handle """
 161         if note is None:
 162             self.report_download_webpage(video_id)
 163         elif note is not False:
 164             if video_id is None:
 165                 self.to_screen(u'%s' % (note,))
 166             else:
 167                 self.to_screen(u'%s: %s' % (video_id, note))
 168         try:
 169             return self._downloader.urlopen(url_or_request)
 170         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 171             if errnote is False:
 172                 return False
 173             if errnote is None:
 174                 errnote = u'Unable to download webpage'
 175             errmsg = u'%s: %s' % (errnote, compat_str(err))
 176             if fatal:
 177                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 178             else:
 179                 self._downloader.report_warning(errmsg)
 180                 return False
 181
 182     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 183         """ Returns a tuple (page content as string, URL handle) """
 184
 185         # Strip hashes from the URL (#1038)
 186         if isinstance(url_or_request, (compat_str, str)):
 187             url_or_request = url_or_request.partition('#')[0]
 188
 189         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 190         if urlh is False:
 191             assert not fatal
 192             return False
 193         content_type = urlh.headers.get('Content-Type', '')
 194         webpage_bytes = urlh.read()
 195         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 196         if m:
 197             encoding = m.group(1)
 198         else:
 199             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 200                           webpage_bytes[:1024])
 201             if m:
 202                 encoding = m.group(1).decode('ascii')
 203             else:
 204                 encoding = 'utf-8'
 205         if self._downloader.params.get('dump_intermediate_pages', False):
 206             try:
 207                 url = url_or_request.get_full_url()
 208             except AttributeError:
 209                 url = url_or_request
 210             self.to_screen(u'Dumping request to ' + url)
 211             dump = base64.b64encode(webpage_bytes).decode('ascii')
 212             self._downloader.to_screen(dump)
 213         if self._downloader.params.get('write_pages', False):
 214             try:
 215                 url = url_or_request.get_full_url()
 216             except AttributeError:
 217                 url = url_or_request
 218             raw_filename = ('%s_%s.dump' % (video_id, url))
 219             filename = sanitize_filename(raw_filename, restricted=True)
 220             self.to_screen(u'Saving request to ' + filename)
 221             with open(filename, 'wb') as outf:
 222                 outf.write(webpage_bytes)
 223
 224         content = webpage_bytes.decode(encoding, 'replace')
 225         return (content, urlh)
 226
 227     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 228         """ Returns the data of the page as a string """
 229         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 230         if res is False:
 231             return res
 232         else:
 233             content, _ = res
 234             return content
 235
 236     def _download_xml(self, url_or_request, video_id,
 237                       note=u'Downloading XML', errnote=u'Unable to download XML',
 238                       transform_source=None):
 239         """Return the xml as an xml.etree.ElementTree.Element"""
 240         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
 241         if transform_source:
 242             xml_string = transform_source(xml_string)
 243         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 244
 245     def to_screen(self, msg):
 246         """Print msg to screen, prefixing it with '[ie_name]'"""
 247         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 248
 249     def report_extraction(self, id_or_name):
 250         """Report information extraction."""
 251         self.to_screen(u'%s: Extracting information' % id_or_name)
 252
 253     def report_download_webpage(self, video_id):
 254         """Report webpage download."""
 255         self.to_screen(u'%s: Downloading webpage' % video_id)
 256
 257     def report_age_confirmation(self):
 258         """Report attempt to confirm age."""
 259         self.to_screen(u'Confirming age')
 260
 261     def report_login(self):
 262         """Report attempt to log in."""
 263         self.to_screen(u'Logging in')
 264
 265     #Methods for following #608
 266     @staticmethod
 267     def url_result(url, ie=None, video_id=None):
 268         """Returns a url that points to a page that should be processed"""
 269         #TODO: ie should be the class used for getting the info
 270         video_info = {'_type': 'url',
 271                       'url': url,
 272                       'ie_key': ie}
 273         if video_id is not None:
 274             video_info['id'] = video_id
 275         return video_info
 276     @staticmethod
 277     def playlist_result(entries, playlist_id=None, playlist_title=None):
 278         """Returns a playlist"""
 279         video_info = {'_type': 'playlist',
 280                       'entries': entries}
 281         if playlist_id:
 282             video_info['id'] = playlist_id
 283         if playlist_title:
 284             video_info['title'] = playlist_title
 285         return video_info
 286
 287     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 288         """
 289         Perform a regex search on the given string, using a single or a list of
 290         patterns returning the first matching group.
 291         In case of failure return a default value or raise a WARNING or a
 292         RegexNotFoundError, depending on fatal, specifying the field name.
 293         """
 294         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 295             mobj = re.search(pattern, string, flags)
 296         else:
 297             for p in pattern:
 298                 mobj = re.search(p, string, flags)
 299                 if mobj: break
 300
 301         if os.name != 'nt' and sys.stderr.isatty():
 302             _name = u'\033[0;34m%s\033[0m' % name
 303         else:
 304             _name = name
 305
 306         if mobj:
 307             # return the first matching group
 308             return next(g for g in mobj.groups() if g is not None)
 309         elif default is not _NO_DEFAULT:
 310             return default
 311         elif fatal:
 312             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 313         else:
 314             self._downloader.report_warning(u'unable to extract %s; '
 315                 u'please report this issue on http://yt-dl.org/bug' % _name)
 316             return None
 317
 318     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 319         """
 320         Like _search_regex, but strips HTML tags and unescapes entities.
 321         """
 322         res = self._search_regex(pattern, string, name, default, fatal, flags)
 323         if res:
 324             return clean_html(res).strip()
 325         else:
 326             return res
 327
 328     def _get_login_info(self):
 329         """
 330         Get the the login info as (username, password)
 331         It will look in the netrc file using the _NETRC_MACHINE value
 332         If there's no info available, return (None, None)
 333         """
 334         if self._downloader is None:
 335             return (None, None)
 336
 337         username = None
 338         password = None
 339         downloader_params = self._downloader.params
 340
 341         # Attempt to use provided username and password or .netrc data
 342         if downloader_params.get('username', None) is not None:
 343             username = downloader_params['username']
 344             password = downloader_params['password']
 345         elif downloader_params.get('usenetrc', False):
 346             try:
 347                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 348                 if info is not None:
 349                     username = info[0]
 350                     password = info[2]
 351                 else:
 352                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 353             except (IOError, netrc.NetrcParseError) as err:
 354                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 355
 356         return (username, password)
 357
 358     # Helper functions for extracting OpenGraph info
 359     @staticmethod
 360     def _og_regexes(prop):
 361         content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
 362         property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
 363         template = r'<meta[^>]+?%s[^>]+?%s'
 364         return [
 365             template % (property_re, content_re),
 366             template % (content_re, property_re),
 367         ]
 368
 369     def _og_search_property(self, prop, html, name=None, **kargs):
 370         if name is None:
 371             name = 'OpenGraph %s' % prop
 372         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 373         if escaped is None:
 374             return None
 375         return unescapeHTML(escaped)
 376
 377     def _og_search_thumbnail(self, html, **kargs):
 378         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 379
 380     def _og_search_description(self, html, **kargs):
 381         return self._og_search_property('description', html, fatal=False, **kargs)
 382
 383     def _og_search_title(self, html, **kargs):
 384         return self._og_search_property('title', html, **kargs)
 385
 386     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 387         regexes = self._og_regexes('video')
 388         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 389         return self._html_search_regex(regexes, html, name, **kargs)
 390
 391     def _html_search_meta(self, name, html, display_name=None):
 392         if display_name is None:
 393             display_name = name
 394         return self._html_search_regex(
 395             r'''(?ix)<meta
 396                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 397                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 398             html, display_name, fatal=False)
 399
 400     def _dc_search_uploader(self, html):
 401         return self._html_search_meta('dc.creator', html, 'uploader')
 402
 403     def _rta_search(self, html):
 404         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 405         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 406                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 407                      html):
 408             return 18
 409         return 0
 410
 411     def _media_rating_search(self, html):
 412         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 413         rating = self._html_search_meta('rating', html)
 414
 415         if not rating:
 416             return None
 417
 418         RATING_TABLE = {
 419             'safe for kids': 0,
 420             'general': 8,
 421             '14 years': 14,
 422             'mature': 17,
 423             'restricted': 19,
 424         }
 425         return RATING_TABLE.get(rating.lower(), None)
 426
 427
 428
 429 class SearchInfoExtractor(InfoExtractor):
 430     """
 431     Base class for paged search queries extractors.
 432     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 433     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 434     """
 435
 436     @classmethod
 437     def _make_valid_url(cls):
 438         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 439
 440     @classmethod
 441     def suitable(cls, url):
 442         return re.match(cls._make_valid_url(), url) is not None
 443
 444     def _real_extract(self, query):
 445         mobj = re.match(self._make_valid_url(), query)
 446         if mobj is None:
 447             raise ExtractorError(u'Invalid search query "%s"' % query)
 448
 449         prefix = mobj.group('prefix')
 450         query = mobj.group('query')
 451         if prefix == '':
 452             return self._get_n_results(query, 1)
 453         elif prefix == 'all':
 454             return self._get_n_results(query, self._MAX_RESULTS)
 455         else:
 456             n = int(prefix)
 457             if n <= 0:
 458                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 459             elif n > self._MAX_RESULTS:
 460                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 461                 n = self._MAX_RESULTS
 462             return self._get_n_results(query, n)
 463
 464     def _get_n_results(self, query, n):
 465         """Get a specified number of results for a query"""
 466         raise NotImplementedError("This method must be implemented by subclasses")
 467
 468     @property
 469     def SEARCH_KEY(self):
 470         return self._SEARCH_KEY