_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7
   8 from ..utils import (
   9     compat_http_client,
  10     compat_urllib_error,
  11     compat_urllib_request,
  12     compat_str,
  13
  14     clean_html,
  15     compiled_regex_type,
  16     ExtractorError,
  17     RegexNotFoundError,
  18     unescapeHTML,
  19 )
  20
  21 class InfoExtractor(object):
  22     """Information Extractor class.
  23
  24     Information extractors are the classes that, given a URL, extract
  25     information about the video (or videos) the URL refers to. This
  26     information includes the real video URL, the video title, author and
  27     others. The information is stored in a dictionary which is then
  28     passed to the FileDownloader. The FileDownloader processes this
  29     information possibly downloading the video to the file system, among
  30     other possible outcomes.
  31
  32     The dictionaries must include the following fields:
  33
  34     id:             Video identifier.
  35     url:            Final video URL.
  36     title:          Video title, unescaped.
  37     ext:            Video filename extension.
  38
  39     Instead of url and ext, formats can also specified.
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnails:     A list of dictionaries (with the entries "resolution" and
  45                     "url") for the varying thumbnails
  46     thumbnail:      Full URL to a video thumbnail image.
  47     description:    One-line video description.
  48     uploader:       Full name of the video uploader.
  49     upload_date:    Video upload date (YYYYMMDD).
  50     uploader_id:    Nickname or id of the video uploader.
  51     location:       Physical location of the video.
  52     player_url:     SWF Player URL (used for rtmpdump).
  53     subtitles:      The subtitle file contents as a dictionary in the format
  54                     {language: subtitles}.
  55     view_count:     How many users have watched the video on the platform.
  56     urlhandle:      [internal] The urlHandle to be used to download the file,
  57                     like returned by urllib.request.urlopen
  58     age_limit:      Age restriction for the video, as an integer (years)
  59     formats:        A list of dictionaries for each format available, it must
  60                     be ordered from worst to best quality. Potential fields:
  61                     * url       Mandatory. The URL of the video file
  62                     * ext       Will be calculated from url if missing
  63                     * format    A human-readable description of the format
  64                                 ("mp4 container with h264/opus").
  65                                 Calculated from the format_id, width, height
  66                                 and format_note fields if missing.
  67                     * format_id A short description of the format
  68                                 ("mp4_h264_opus" or "19")
  69                     * format_note Additional info about the format
  70                                 ("3D" or "DASH video")
  71                     * width     Width of the video, if known
  72                     * height    Height of the video, if known
  73
  74     Unless mentioned otherwise, the fields should be Unicode strings.
  75
  76     Subclasses of this one should re-define the _real_initialize() and
  77     _real_extract() methods and define a _VALID_URL regexp.
  78     Probably, they should also be added to the list of extractors.
  79
  80     _real_extract() must return a *list* of information dictionaries as
  81     described above.
  82
  83     Finally, the _WORKING attribute should be set to False for broken IEs
  84     in order to warn the users and skip the tests.
  85     """
  86
  87     _ready = False
  88     _downloader = None
  89     _WORKING = True
  90
  91     def __init__(self, downloader=None):
  92         """Constructor. Receives an optional downloader."""
  93         self._ready = False
  94         self.set_downloader(downloader)
  95
  96     @classmethod
  97     def suitable(cls, url):
  98         """Receives a URL and returns True if suitable for this IE."""
  99
 100         # This does not use has/getattr intentionally - we want to know whether
 101         # we have cached the regexp for *this* class, whereas getattr would also
 102         # match the superclass
 103         if '_VALID_URL_RE' not in cls.__dict__:
 104             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 105         return cls._VALID_URL_RE.match(url) is not None
 106
 107     @classmethod
 108     def working(cls):
 109         """Getter method for _WORKING."""
 110         return cls._WORKING
 111
 112     def initialize(self):
 113         """Initializes an instance (authentication, etc)."""
 114         if not self._ready:
 115             self._real_initialize()
 116             self._ready = True
 117
 118     def extract(self, url):
 119         """Extracts URL information and returns it in list of dicts."""
 120         self.initialize()
 121         return self._real_extract(url)
 122
 123     def set_downloader(self, downloader):
 124         """Sets the downloader for this IE."""
 125         self._downloader = downloader
 126
 127     def _real_initialize(self):
 128         """Real initialization process. Redefine in subclasses."""
 129         pass
 130
 131     def _real_extract(self, url):
 132         """Real extraction process. Redefine in subclasses."""
 133         pass
 134
 135     @classmethod
 136     def ie_key(cls):
 137         """A string for getting the InfoExtractor with get_info_extractor"""
 138         return cls.__name__[:-2]
 139
 140     @property
 141     def IE_NAME(self):
 142         return type(self).__name__[:-2]
 143
 144     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 145         """ Returns the response handle """
 146         if note is None:
 147             self.report_download_webpage(video_id)
 148         elif note is not False:
 149             self.to_screen(u'%s: %s' % (video_id, note))
 150         try:
 151             return compat_urllib_request.urlopen(url_or_request)
 152         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 153             if errnote is None:
 154                 errnote = u'Unable to download webpage'
 155             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
 156
 157     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 158         """ Returns a tuple (page content as string, URL handle) """
 159
 160         # Strip hashes from the URL (#1038)
 161         if isinstance(url_or_request, (compat_str, str)):
 162             url_or_request = url_or_request.partition('#')[0]
 163
 164         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 165         content_type = urlh.headers.get('Content-Type', '')
 166         webpage_bytes = urlh.read()
 167         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 168         if m:
 169             encoding = m.group(1)
 170         else:
 171             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 172                           webpage_bytes[:1024])
 173             if m:
 174                 encoding = m.group(1).decode('ascii')
 175             else:
 176                 encoding = 'utf-8'
 177         if self._downloader.params.get('dump_intermediate_pages', False):
 178             try:
 179                 url = url_or_request.get_full_url()
 180             except AttributeError:
 181                 url = url_or_request
 182             self.to_screen(u'Dumping request to ' + url)
 183             dump = base64.b64encode(webpage_bytes).decode('ascii')
 184             self._downloader.to_screen(dump)
 185         content = webpage_bytes.decode(encoding, 'replace')
 186         return (content, urlh)
 187
 188     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 189         """ Returns the data of the page as a string """
 190         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 191
 192     def to_screen(self, msg):
 193         """Print msg to screen, prefixing it with '[ie_name]'"""
 194         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 195
 196     def report_extraction(self, id_or_name):
 197         """Report information extraction."""
 198         self.to_screen(u'%s: Extracting information' % id_or_name)
 199
 200     def report_download_webpage(self, video_id):
 201         """Report webpage download."""
 202         self.to_screen(u'%s: Downloading webpage' % video_id)
 203
 204     def report_age_confirmation(self):
 205         """Report attempt to confirm age."""
 206         self.to_screen(u'Confirming age')
 207
 208     def report_login(self):
 209         """Report attempt to log in."""
 210         self.to_screen(u'Logging in')
 211
 212     #Methods for following #608
 213     def url_result(self, url, ie=None):
 214         """Returns a url that points to a page that should be processed"""
 215         #TODO: ie should be the class used for getting the info
 216         video_info = {'_type': 'url',
 217                       'url': url,
 218                       'ie_key': ie}
 219         return video_info
 220     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 221         """Returns a playlist"""
 222         video_info = {'_type': 'playlist',
 223                       'entries': entries}
 224         if playlist_id:
 225             video_info['id'] = playlist_id
 226         if playlist_title:
 227             video_info['title'] = playlist_title
 228         return video_info
 229
 230     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 231         """
 232         Perform a regex search on the given string, using a single or a list of
 233         patterns returning the first matching group.
 234         In case of failure return a default value or raise a WARNING or a
 235         RegexNotFoundError, depending on fatal, specifying the field name.
 236         """
 237         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 238             mobj = re.search(pattern, string, flags)
 239         else:
 240             for p in pattern:
 241                 mobj = re.search(p, string, flags)
 242                 if mobj: break
 243
 244         if sys.stderr.isatty() and os.name != 'nt':
 245             _name = u'\033[0;34m%s\033[0m' % name
 246         else:
 247             _name = name
 248
 249         if mobj:
 250             # return the first matching group
 251             return next(g for g in mobj.groups() if g is not None)
 252         elif default is not None:
 253             return default
 254         elif fatal:
 255             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 256         else:
 257             self._downloader.report_warning(u'unable to extract %s; '
 258                 u'please report this issue on http://yt-dl.org/bug' % _name)
 259             return None
 260
 261     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 262         """
 263         Like _search_regex, but strips HTML tags and unescapes entities.
 264         """
 265         res = self._search_regex(pattern, string, name, default, fatal, flags)
 266         if res:
 267             return clean_html(res).strip()
 268         else:
 269             return res
 270
 271     def _get_login_info(self):
 272         """
 273         Get the the login info as (username, password)
 274         It will look in the netrc file using the _NETRC_MACHINE value
 275         If there's no info available, return (None, None)
 276         """
 277         if self._downloader is None:
 278             return (None, None)
 279
 280         username = None
 281         password = None
 282         downloader_params = self._downloader.params
 283
 284         # Attempt to use provided username and password or .netrc data
 285         if downloader_params.get('username', None) is not None:
 286             username = downloader_params['username']
 287             password = downloader_params['password']
 288         elif downloader_params.get('usenetrc', False):
 289             try:
 290                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 291                 if info is not None:
 292                     username = info[0]
 293                     password = info[2]
 294                 else:
 295                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 296             except (IOError, netrc.NetrcParseError) as err:
 297                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 298
 299         return (username, password)
 300
 301     # Helper functions for extracting OpenGraph info
 302     @staticmethod
 303     def _og_regex(prop):
 304         return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
 305
 306     def _og_search_property(self, prop, html, name=None, **kargs):
 307         if name is None:
 308             name = 'OpenGraph %s' % prop
 309         escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
 310         return unescapeHTML(escaped)
 311
 312     def _og_search_thumbnail(self, html, **kargs):
 313         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 314
 315     def _og_search_description(self, html, **kargs):
 316         return self._og_search_property('description', html, fatal=False, **kargs)
 317
 318     def _og_search_title(self, html, **kargs):
 319         return self._og_search_property('title', html, **kargs)
 320
 321     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 322         regexes = [self._og_regex('video')]
 323         if secure: regexes.insert(0, self._og_regex('video:secure_url'))
 324         return self._html_search_regex(regexes, html, name, **kargs)
 325
 326     def _rta_search(self, html):
 327         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 328         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 329                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 330                      html):
 331             return 18
 332         return 0
 333
 334
 335 class SearchInfoExtractor(InfoExtractor):
 336     """
 337     Base class for paged search queries extractors.
 338     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 339     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 340     """
 341
 342     @classmethod
 343     def _make_valid_url(cls):
 344         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 345
 346     @classmethod
 347     def suitable(cls, url):
 348         return re.match(cls._make_valid_url(), url) is not None
 349
 350     def _real_extract(self, query):
 351         mobj = re.match(self._make_valid_url(), query)
 352         if mobj is None:
 353             raise ExtractorError(u'Invalid search query "%s"' % query)
 354
 355         prefix = mobj.group('prefix')
 356         query = mobj.group('query')
 357         if prefix == '':
 358             return self._get_n_results(query, 1)
 359         elif prefix == 'all':
 360             return self._get_n_results(query, self._MAX_RESULTS)
 361         else:
 362             n = int(prefix)
 363             if n <= 0:
 364                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 365             elif n > self._MAX_RESULTS:
 366                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 367                 n = self._MAX_RESULTS
 368             return self._get_n_results(query, n)
 369
 370     def _get_n_results(self, query, n):
 371         """Get a specified number of results for a query"""
 372         raise NotImplementedError("This method must be implemented by subclasses")
 373
 374     @property
 375     def SEARCH_KEY(self):
 376         return self._SEARCH_KEY