_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7
   8 from ..utils import (
   9     compat_http_client,
  10     compat_urllib_error,
  11     compat_urllib_request,
  12     compat_str,
  13
  14     clean_html,
  15     compiled_regex_type,
  16     ExtractorError,
  17     unescapeHTML,
  18 )
  19
  20 class InfoExtractor(object):
  21     """Information Extractor class.
  22
  23     Information extractors are the classes that, given a URL, extract
  24     information about the video (or videos) the URL refers to. This
  25     information includes the real video URL, the video title, author and
  26     others. The information is stored in a dictionary which is then
  27     passed to the FileDownloader. The FileDownloader processes this
  28     information possibly downloading the video to the file system, among
  29     other possible outcomes.
  30
  31     The dictionaries must include the following fields:
  32
  33     id:             Video identifier.
  34     url:            Final video URL.
  35     title:          Video title, unescaped.
  36     ext:            Video filename extension.
  37
  38     Instead of url and ext, formats can also specified.
  39
  40     The following fields are optional:
  41
  42     format:         The video format, defaults to ext (used for --get-format)
  43     thumbnails:     A list of dictionaries (with the entries "resolution" and
  44                     "url") for the varying thumbnails
  45     thumbnail:      Full URL to a video thumbnail image.
  46     description:    One-line video description.
  47     uploader:       Full name of the video uploader.
  48     upload_date:    Video upload date (YYYYMMDD).
  49     uploader_id:    Nickname or id of the video uploader.
  50     location:       Physical location of the video.
  51     player_url:     SWF Player URL (used for rtmpdump).
  52     subtitles:      The subtitle file contents as a dictionary in the format
  53                     {language: subtitles}.
  54     view_count:     How many users have watched the video on the platform.
  55     urlhandle:      [internal] The urlHandle to be used to download the file,
  56                     like returned by urllib.request.urlopen
  57     age_limit:      Age restriction for the video, as an integer (years)
  58     formats:        A list of dictionaries for each format available, it must
  59                     be ordered from worst to best quality. Potential fields:
  60                     * url       Mandatory. The URL of the video file
  61                     * ext       Will be calculated from url if missing
  62                     * format    A human-readable description of the format
  63                                 ("mp4 container with h264/opus").
  64                                 Calculated from the format_id, width, height
  65                                 and format_note fields if missing.
  66                     * format_id A short description of the format
  67                                 ("mp4_h264_opus" or "19")
  68                     * format_note Additional info about the format
  69                                 ("3D" or "DASH video")
  70                     * width     Width of the video, if known
  71                     * height    Height of the video, if known
  72
  73     Unless mentioned otherwise, the fields should be Unicode strings.
  74
  75     Subclasses of this one should re-define the _real_initialize() and
  76     _real_extract() methods and define a _VALID_URL regexp.
  77     Probably, they should also be added to the list of extractors.
  78
  79     _real_extract() must return a *list* of information dictionaries as
  80     described above.
  81
  82     Finally, the _WORKING attribute should be set to False for broken IEs
  83     in order to warn the users and skip the tests.
  84     """
  85
  86     _ready = False
  87     _downloader = None
  88     _WORKING = True
  89
  90     def __init__(self, downloader=None):
  91         """Constructor. Receives an optional downloader."""
  92         self._ready = False
  93         self.set_downloader(downloader)
  94
  95     @classmethod
  96     def suitable(cls, url):
  97         """Receives a URL and returns True if suitable for this IE."""
  98
  99         # This does not use has/getattr intentionally - we want to know whether
 100         # we have cached the regexp for *this* class, whereas getattr would also
 101         # match the superclass
 102         if '_VALID_URL_RE' not in cls.__dict__:
 103             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 104         return cls._VALID_URL_RE.match(url) is not None
 105
 106     @classmethod
 107     def working(cls):
 108         """Getter method for _WORKING."""
 109         return cls._WORKING
 110
 111     def initialize(self):
 112         """Initializes an instance (authentication, etc)."""
 113         if not self._ready:
 114             self._real_initialize()
 115             self._ready = True
 116
 117     def extract(self, url):
 118         """Extracts URL information and returns it in list of dicts."""
 119         self.initialize()
 120         return self._real_extract(url)
 121
 122     def set_downloader(self, downloader):
 123         """Sets the downloader for this IE."""
 124         self._downloader = downloader
 125
 126     def _real_initialize(self):
 127         """Real initialization process. Redefine in subclasses."""
 128         pass
 129
 130     def _real_extract(self, url):
 131         """Real extraction process. Redefine in subclasses."""
 132         pass
 133
 134     @classmethod
 135     def ie_key(cls):
 136         """A string for getting the InfoExtractor with get_info_extractor"""
 137         return cls.__name__[:-2]
 138
 139     @property
 140     def IE_NAME(self):
 141         return type(self).__name__[:-2]
 142
 143     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 144         """ Returns the response handle """
 145         if note is None:
 146             self.report_download_webpage(video_id)
 147         elif note is not False:
 148             self.to_screen(u'%s: %s' % (video_id, note))
 149         try:
 150             return compat_urllib_request.urlopen(url_or_request)
 151         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 152             if errnote is None:
 153                 errnote = u'Unable to download webpage'
 154             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
 155
 156     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 157         """ Returns a tuple (page content as string, URL handle) """
 158
 159         # Strip hashes from the URL (#1038)
 160         if isinstance(url_or_request, (compat_str, str)):
 161             url_or_request = url_or_request.partition('#')[0]
 162
 163         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 164         content_type = urlh.headers.get('Content-Type', '')
 165         webpage_bytes = urlh.read()
 166         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 167         if m:
 168             encoding = m.group(1)
 169         else:
 170             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 171                           webpage_bytes[:1024])
 172             if m:
 173                 encoding = m.group(1).decode('ascii')
 174             else:
 175                 encoding = 'utf-8'
 176         if self._downloader.params.get('dump_intermediate_pages', False):
 177             try:
 178                 url = url_or_request.get_full_url()
 179             except AttributeError:
 180                 url = url_or_request
 181             self.to_screen(u'Dumping request to ' + url)
 182             dump = base64.b64encode(webpage_bytes).decode('ascii')
 183             self._downloader.to_screen(dump)
 184         content = webpage_bytes.decode(encoding, 'replace')
 185         return (content, urlh)
 186
 187     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 188         """ Returns the data of the page as a string """
 189         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 190
 191     def to_screen(self, msg):
 192         """Print msg to screen, prefixing it with '[ie_name]'"""
 193         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 194
 195     def report_extraction(self, id_or_name):
 196         """Report information extraction."""
 197         self.to_screen(u'%s: Extracting information' % id_or_name)
 198
 199     def report_download_webpage(self, video_id):
 200         """Report webpage download."""
 201         self.to_screen(u'%s: Downloading webpage' % video_id)
 202
 203     def report_age_confirmation(self):
 204         """Report attempt to confirm age."""
 205         self.to_screen(u'Confirming age')
 206
 207     def report_login(self):
 208         """Report attempt to log in."""
 209         self.to_screen(u'Logging in')
 210
 211     #Methods for following #608
 212     def url_result(self, url, ie=None):
 213         """Returns a url that points to a page that should be processed"""
 214         #TODO: ie should be the class used for getting the info
 215         video_info = {'_type': 'url',
 216                       'url': url,
 217                       'ie_key': ie}
 218         return video_info
 219     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 220         """Returns a playlist"""
 221         video_info = {'_type': 'playlist',
 222                       'entries': entries}
 223         if playlist_id:
 224             video_info['id'] = playlist_id
 225         if playlist_title:
 226             video_info['title'] = playlist_title
 227         return video_info
 228
 229     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 230         """
 231         Perform a regex search on the given string, using a single or a list of
 232         patterns returning the first matching group.
 233         In case of failure return a default value or raise a WARNING or a
 234         ExtractorError, depending on fatal, specifying the field name.
 235         """
 236         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 237             mobj = re.search(pattern, string, flags)
 238         else:
 239             for p in pattern:
 240                 mobj = re.search(p, string, flags)
 241                 if mobj: break
 242
 243         if sys.stderr.isatty() and os.name != 'nt':
 244             _name = u'\033[0;34m%s\033[0m' % name
 245         else:
 246             _name = name
 247
 248         if mobj:
 249             # return the first matching group
 250             return next(g for g in mobj.groups() if g is not None)
 251         elif default is not None:
 252             return default
 253         elif fatal:
 254             raise ExtractorError(u'Unable to extract %s' % _name)
 255         else:
 256             self._downloader.report_warning(u'unable to extract %s; '
 257                 u'please report this issue on http://yt-dl.org/bug' % _name)
 258             return None
 259
 260     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 261         """
 262         Like _search_regex, but strips HTML tags and unescapes entities.
 263         """
 264         res = self._search_regex(pattern, string, name, default, fatal, flags)
 265         if res:
 266             return clean_html(res).strip()
 267         else:
 268             return res
 269
 270     def _get_login_info(self):
 271         """
 272         Get the the login info as (username, password)
 273         It will look in the netrc file using the _NETRC_MACHINE value
 274         If there's no info available, return (None, None)
 275         """
 276         if self._downloader is None:
 277             return (None, None)
 278
 279         username = None
 280         password = None
 281         downloader_params = self._downloader.params
 282
 283         # Attempt to use provided username and password or .netrc data
 284         if downloader_params.get('username', None) is not None:
 285             username = downloader_params['username']
 286             password = downloader_params['password']
 287         elif downloader_params.get('usenetrc', False):
 288             try:
 289                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 290                 if info is not None:
 291                     username = info[0]
 292                     password = info[2]
 293                 else:
 294                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 295             except (IOError, netrc.NetrcParseError) as err:
 296                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 297
 298         return (username, password)
 299
 300     # Helper functions for extracting OpenGraph info
 301     @staticmethod
 302     def _og_regex(prop):
 303         return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
 304
 305     def _og_search_property(self, prop, html, name=None, **kargs):
 306         if name is None:
 307             name = 'OpenGraph %s' % prop
 308         escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
 309         return unescapeHTML(escaped)
 310
 311     def _og_search_thumbnail(self, html, **kargs):
 312         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 313
 314     def _og_search_description(self, html, **kargs):
 315         return self._og_search_property('description', html, fatal=False, **kargs)
 316
 317     def _og_search_title(self, html, **kargs):
 318         return self._og_search_property('title', html, **kargs)
 319
 320     def _og_search_video_url(self, html, name='video url', **kargs):
 321         return self._html_search_regex([self._og_regex('video:secure_url'),
 322                                         self._og_regex('video')],
 323                                        html, name, **kargs)
 324
 325     def _rta_search(self, html):
 326         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 327         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 328                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 329                      html):
 330             return 18
 331         return 0
 332
 333
 334 class SearchInfoExtractor(InfoExtractor):
 335     """
 336     Base class for paged search queries extractors.
 337     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 338     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 339     """
 340
 341     @classmethod
 342     def _make_valid_url(cls):
 343         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 344
 345     @classmethod
 346     def suitable(cls, url):
 347         return re.match(cls._make_valid_url(), url) is not None
 348
 349     def _real_extract(self, query):
 350         mobj = re.match(self._make_valid_url(), query)
 351         if mobj is None:
 352             raise ExtractorError(u'Invalid search query "%s"' % query)
 353
 354         prefix = mobj.group('prefix')
 355         query = mobj.group('query')
 356         if prefix == '':
 357             return self._get_n_results(query, 1)
 358         elif prefix == 'all':
 359             return self._get_n_results(query, self._MAX_RESULTS)
 360         else:
 361             n = int(prefix)
 362             if n <= 0:
 363                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 364             elif n > self._MAX_RESULTS:
 365                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 366                 n = self._MAX_RESULTS
 367             return self._get_n_results(query, n)
 368
 369     def _get_n_results(self, query, n):
 370         """Get a specified number of results for a query"""
 371         raise NotImplementedError("This method must be implemented by subclasses")
 372
 373     @property
 374     def SEARCH_KEY(self):
 375         return self._SEARCH_KEY