_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7
   8 from ..utils import (
   9     compat_http_client,
  10     compat_urllib_error,
  11     compat_urllib_request,
  12     compat_str,
  13
  14     clean_html,
  15     compiled_regex_type,
  16     ExtractorError,
  17     unescapeHTML,
  18 )
  19
  20 class InfoExtractor(object):
  21     """Information Extractor class.
  22
  23     Information extractors are the classes that, given a URL, extract
  24     information about the video (or videos) the URL refers to. This
  25     information includes the real video URL, the video title, author and
  26     others. The information is stored in a dictionary which is then
  27     passed to the FileDownloader. The FileDownloader processes this
  28     information possibly downloading the video to the file system, among
  29     other possible outcomes.
  30
  31     The dictionaries must include the following fields:
  32
  33     id:             Video identifier.
  34     url:            Final video URL.
  35     title:          Video title, unescaped.
  36     ext:            Video filename extension.
  37
  38     The following fields are optional:
  39
  40     format:         The video format, defaults to ext (used for --get-format)
  41     thumbnails:     A list of dictionaries (with the entries "resolution" and
  42                     "url") for the varying thumbnails
  43     thumbnail:      Full URL to a video thumbnail image.
  44     description:    One-line video description.
  45     uploader:       Full name of the video uploader.
  46     upload_date:    Video upload date (YYYYMMDD).
  47     uploader_id:    Nickname or id of the video uploader.
  48     location:       Physical location of the video.
  49     player_url:     SWF Player URL (used for rtmpdump).
  50     subtitles:      The subtitle file contents.
  51     view_count:     How many users have watched the video on the platform.
  52     urlhandle:      [internal] The urlHandle to be used to download the file,
  53                     like returned by urllib.request.urlopen
  54
  55     The fields should all be Unicode strings.
  56
  57     Subclasses of this one should re-define the _real_initialize() and
  58     _real_extract() methods and define a _VALID_URL regexp.
  59     Probably, they should also be added to the list of extractors.
  60
  61     _real_extract() must return a *list* of information dictionaries as
  62     described above.
  63
  64     Finally, the _WORKING attribute should be set to False for broken IEs
  65     in order to warn the users and skip the tests.
  66     """
  67
  68     _ready = False
  69     _downloader = None
  70     _WORKING = True
  71
  72     def __init__(self, downloader=None):
  73         """Constructor. Receives an optional downloader."""
  74         self._ready = False
  75         self.set_downloader(downloader)
  76
  77     @classmethod
  78     def suitable(cls, url):
  79         """Receives a URL and returns True if suitable for this IE."""
  80
  81         # This does not use has/getattr intentionally - we want to know whether
  82         # we have cached the regexp for *this* class, whereas getattr would also
  83         # match the superclass
  84         if '_VALID_URL_RE' not in cls.__dict__:
  85             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
  86         return cls._VALID_URL_RE.match(url) is not None
  87
  88     @classmethod
  89     def working(cls):
  90         """Getter method for _WORKING."""
  91         return cls._WORKING
  92
  93     def initialize(self):
  94         """Initializes an instance (authentication, etc)."""
  95         if not self._ready:
  96             self._real_initialize()
  97             self._ready = True
  98
  99     def extract(self, url):
 100         """Extracts URL information and returns it in list of dicts."""
 101         self.initialize()
 102         return self._real_extract(url)
 103
 104     def set_downloader(self, downloader):
 105         """Sets the downloader for this IE."""
 106         self._downloader = downloader
 107
 108     def _real_initialize(self):
 109         """Real initialization process. Redefine in subclasses."""
 110         pass
 111
 112     def _real_extract(self, url):
 113         """Real extraction process. Redefine in subclasses."""
 114         pass
 115
 116     @property
 117     def IE_NAME(self):
 118         return type(self).__name__[:-2]
 119
 120     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 121         """ Returns the response handle """
 122         if note is None:
 123             self.report_download_webpage(video_id)
 124         elif note is not False:
 125             self.to_screen(u'%s: %s' % (video_id, note))
 126         try:
 127             return compat_urllib_request.urlopen(url_or_request)
 128         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 129             if errnote is None:
 130                 errnote = u'Unable to download webpage'
 131             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 132
 133     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 134         """ Returns a tuple (page content as string, URL handle) """
 135
 136         # Strip hashes from the URL (#1038)
 137         if isinstance(url_or_request, (compat_str, str)):
 138             url_or_request = url_or_request.partition('#')[0]
 139
 140         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 141         content_type = urlh.headers.get('Content-Type', '')
 142         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 143         if m:
 144             encoding = m.group(1)
 145         else:
 146             encoding = 'utf-8'
 147         webpage_bytes = urlh.read()
 148         if self._downloader.params.get('dump_intermediate_pages', False):
 149             try:
 150                 url = url_or_request.get_full_url()
 151             except AttributeError:
 152                 url = url_or_request
 153             self.to_screen(u'Dumping request to ' + url)
 154             dump = base64.b64encode(webpage_bytes).decode('ascii')
 155             self._downloader.to_screen(dump)
 156         content = webpage_bytes.decode(encoding, 'replace')
 157         return (content, urlh)
 158
 159     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 160         """ Returns the data of the page as a string """
 161         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 162
 163     def to_screen(self, msg):
 164         """Print msg to screen, prefixing it with '[ie_name]'"""
 165         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 166
 167     def report_extraction(self, id_or_name):
 168         """Report information extraction."""
 169         self.to_screen(u'%s: Extracting information' % id_or_name)
 170
 171     def report_download_webpage(self, video_id):
 172         """Report webpage download."""
 173         self.to_screen(u'%s: Downloading webpage' % video_id)
 174
 175     def report_age_confirmation(self):
 176         """Report attempt to confirm age."""
 177         self.to_screen(u'Confirming age')
 178
 179     def report_login(self):
 180         """Report attempt to log in."""
 181         self.to_screen(u'Logging in')
 182
 183     #Methods for following #608
 184     def url_result(self, url, ie=None):
 185         """Returns a url that points to a page that should be processed"""
 186         #TODO: ie should be the class used for getting the info
 187         video_info = {'_type': 'url',
 188                       'url': url,
 189                       'ie_key': ie}
 190         return video_info
 191     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 192         """Returns a playlist"""
 193         video_info = {'_type': 'playlist',
 194                       'entries': entries}
 195         if playlist_id:
 196             video_info['id'] = playlist_id
 197         if playlist_title:
 198             video_info['title'] = playlist_title
 199         return video_info
 200
 201     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 202         """
 203         Perform a regex search on the given string, using a single or a list of
 204         patterns returning the first matching group.
 205         In case of failure return a default value or raise a WARNING or a
 206         ExtractorError, depending on fatal, specifying the field name.
 207         """
 208         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 209             mobj = re.search(pattern, string, flags)
 210         else:
 211             for p in pattern:
 212                 mobj = re.search(p, string, flags)
 213                 if mobj: break
 214
 215         if sys.stderr.isatty() and os.name != 'nt':
 216             _name = u'\033[0;34m%s\033[0m' % name
 217         else:
 218             _name = name
 219
 220         if mobj:
 221             # return the first matching group
 222             return next(g for g in mobj.groups() if g is not None)
 223         elif default is not None:
 224             return default
 225         elif fatal:
 226             raise ExtractorError(u'Unable to extract %s' % _name)
 227         else:
 228             self._downloader.report_warning(u'unable to extract %s; '
 229                 u'please report this issue on http://yt-dl.org/bug' % _name)
 230             return None
 231
 232     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 233         """
 234         Like _search_regex, but strips HTML tags and unescapes entities.
 235         """
 236         res = self._search_regex(pattern, string, name, default, fatal, flags)
 237         if res:
 238             return clean_html(res).strip()
 239         else:
 240             return res
 241
 242     def _get_login_info(self):
 243         """
 244         Get the the login info as (username, password)
 245         It will look in the netrc file using the _NETRC_MACHINE value
 246         If there's no info available, return (None, None)
 247         """
 248         if self._downloader is None:
 249             return (None, None)
 250
 251         username = None
 252         password = None
 253         downloader_params = self._downloader.params
 254
 255         # Attempt to use provided username and password or .netrc data
 256         if downloader_params.get('username', None) is not None:
 257             username = downloader_params['username']
 258             password = downloader_params['password']
 259         elif downloader_params.get('usenetrc', False):
 260             try:
 261                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 262                 if info is not None:
 263                     username = info[0]
 264                     password = info[2]
 265                 else:
 266                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 267             except (IOError, netrc.NetrcParseError) as err:
 268                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 269
 270         return (username, password)
 271
 272     # Helper functions for extracting OpenGraph info
 273     @staticmethod
 274     def _og_regex(prop):
 275         return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
 276
 277     def _og_search_property(self, prop, html, name=None, **kargs):
 278         if name is None:
 279             name = 'OpenGraph %s' % prop
 280         escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
 281         return unescapeHTML(escaped)
 282
 283     def _og_search_thumbnail(self, html, **kargs):
 284         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 285
 286     def _og_search_description(self, html, **kargs):
 287         return self._og_search_property('description', html, fatal=False, **kargs)
 288
 289     def _og_search_title(self, html, **kargs):
 290         return self._og_search_property('title', html, **kargs)
 291
 292     def _og_search_video_url(self, html, name='video url', **kargs):
 293         return self._html_search_regex([self._og_regex('video:secure_url'),
 294                                         self._og_regex('video')],
 295                                        html, name, **kargs)
 296
 297 class SearchInfoExtractor(InfoExtractor):
 298     """
 299     Base class for paged search queries extractors.
 300     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 301     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 302     """
 303
 304     @classmethod
 305     def _make_valid_url(cls):
 306         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 307
 308     @classmethod
 309     def suitable(cls, url):
 310         return re.match(cls._make_valid_url(), url) is not None
 311
 312     def _real_extract(self, query):
 313         mobj = re.match(self._make_valid_url(), query)
 314         if mobj is None:
 315             raise ExtractorError(u'Invalid search query "%s"' % query)
 316
 317         prefix = mobj.group('prefix')
 318         query = mobj.group('query')
 319         if prefix == '':
 320             return self._get_n_results(query, 1)
 321         elif prefix == 'all':
 322             return self._get_n_results(query, self._MAX_RESULTS)
 323         else:
 324             n = int(prefix)
 325             if n <= 0:
 326                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 327             elif n > self._MAX_RESULTS:
 328                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 329                 n = self._MAX_RESULTS
 330             return self._get_n_results(query, n)
 331
 332     def _get_n_results(self, query, n):
 333         """Get a specified number of results for a query"""
 334         raise NotImplementedError("This method must be implemented by sublclasses")
 335
 336     @property
 337     def SEARCH_KEY(self):
 338         return self._SEARCH_KEY