_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6
   7 from ..utils import (
   8     compat_http_client,
   9     compat_urllib_error,
  10     compat_urllib_request,
  11     compat_str,
  12
  13     clean_html,
  14     compiled_regex_type,
  15     ExtractorError,
  16 )
  17
  18 class InfoExtractor(object):
  19     """Information Extractor class.
  20
  21     Information extractors are the classes that, given a URL, extract
  22     information about the video (or videos) the URL refers to. This
  23     information includes the real video URL, the video title, author and
  24     others. The information is stored in a dictionary which is then
  25     passed to the FileDownloader. The FileDownloader processes this
  26     information possibly downloading the video to the file system, among
  27     other possible outcomes.
  28
  29     The dictionaries must include the following fields:
  30
  31     id:             Video identifier.
  32     url:            Final video URL.
  33     title:          Video title, unescaped.
  34     ext:            Video filename extension.
  35
  36     The following fields are optional:
  37
  38     format:         The video format, defaults to ext (used for --get-format)
  39     thumbnails:     A list of dictionaries (with the entries "resolution" and
  40                     "url") for the varying thumbnails
  41     thumbnail:      Full URL to a video thumbnail image.
  42     description:    One-line video description.
  43     uploader:       Full name of the video uploader.
  44     upload_date:    Video upload date (YYYYMMDD).
  45     uploader_id:    Nickname or id of the video uploader.
  46     location:       Physical location of the video.
  47     player_url:     SWF Player URL (used for rtmpdump).
  48     subtitles:      The subtitle file contents.
  49     view_count:     How many users have watched the video on the platform.
  50     urlhandle:      [internal] The urlHandle to be used to download the file,
  51                     like returned by urllib.request.urlopen
  52
  53     The fields should all be Unicode strings.
  54
  55     Subclasses of this one should re-define the _real_initialize() and
  56     _real_extract() methods and define a _VALID_URL regexp.
  57     Probably, they should also be added to the list of extractors.
  58
  59     _real_extract() must return a *list* of information dictionaries as
  60     described above.
  61
  62     Finally, the _WORKING attribute should be set to False for broken IEs
  63     in order to warn the users and skip the tests.
  64     """
  65
  66     _ready = False
  67     _downloader = None
  68     _WORKING = True
  69
  70     def __init__(self, downloader=None):
  71         """Constructor. Receives an optional downloader."""
  72         self._ready = False
  73         self.set_downloader(downloader)
  74
  75     @classmethod
  76     def suitable(cls, url):
  77         """Receives a URL and returns True if suitable for this IE."""
  78         return re.match(cls._VALID_URL, url) is not None
  79
  80     @classmethod
  81     def working(cls):
  82         """Getter method for _WORKING."""
  83         return cls._WORKING
  84
  85     def initialize(self):
  86         """Initializes an instance (authentication, etc)."""
  87         if not self._ready:
  88             self._real_initialize()
  89             self._ready = True
  90
  91     def extract(self, url):
  92         """Extracts URL information and returns it in list of dicts."""
  93         self.initialize()
  94         return self._real_extract(url)
  95
  96     def set_downloader(self, downloader):
  97         """Sets the downloader for this IE."""
  98         self._downloader = downloader
  99
 100     def _real_initialize(self):
 101         """Real initialization process. Redefine in subclasses."""
 102         pass
 103
 104     def _real_extract(self, url):
 105         """Real extraction process. Redefine in subclasses."""
 106         pass
 107
 108     @property
 109     def IE_NAME(self):
 110         return type(self).__name__[:-2]
 111
 112     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 113         """ Returns the response handle """
 114         if note is None:
 115             self.report_download_webpage(video_id)
 116         elif note is not False:
 117             self.to_screen(u'%s: %s' % (video_id, note))
 118         try:
 119             return compat_urllib_request.urlopen(url_or_request)
 120         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 121             if errnote is None:
 122                 errnote = u'Unable to download webpage'
 123             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 124
 125     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 126         """ Returns a tuple (page content as string, URL handle) """
 127         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 128         content_type = urlh.headers.get('Content-Type', '')
 129         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 130         if m:
 131             encoding = m.group(1)
 132         else:
 133             encoding = 'utf-8'
 134         webpage_bytes = urlh.read()
 135         if self._downloader.params.get('dump_intermediate_pages', False):
 136             try:
 137                 url = url_or_request.get_full_url()
 138             except AttributeError:
 139                 url = url_or_request
 140             self.to_screen(u'Dumping request to ' + url)
 141             dump = base64.b64encode(webpage_bytes).decode('ascii')
 142             self._downloader.to_screen(dump)
 143         content = webpage_bytes.decode(encoding, 'replace')
 144         return (content, urlh)
 145
 146     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 147         """ Returns the data of the page as a string """
 148         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 149
 150     def to_screen(self, msg):
 151         """Print msg to screen, prefixing it with '[ie_name]'"""
 152         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 153
 154     def report_extraction(self, id_or_name):
 155         """Report information extraction."""
 156         self.to_screen(u'%s: Extracting information' % id_or_name)
 157
 158     def report_download_webpage(self, video_id):
 159         """Report webpage download."""
 160         self.to_screen(u'%s: Downloading webpage' % video_id)
 161
 162     def report_age_confirmation(self):
 163         """Report attempt to confirm age."""
 164         self.to_screen(u'Confirming age')
 165
 166     #Methods for following #608
 167     #They set the correct value of the '_type' key
 168     def video_result(self, video_info):
 169         """Returns a video"""
 170         video_info['_type'] = 'video'
 171         return video_info
 172     def url_result(self, url, ie=None):
 173         """Returns a url that points to a page that should be processed"""
 174         #TODO: ie should be the class used for getting the info
 175         video_info = {'_type': 'url',
 176                       'url': url,
 177                       'ie_key': ie}
 178         return video_info
 179     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 180         """Returns a playlist"""
 181         video_info = {'_type': 'playlist',
 182                       'entries': entries}
 183         if playlist_id:
 184             video_info['id'] = playlist_id
 185         if playlist_title:
 186             video_info['title'] = playlist_title
 187         return video_info
 188
 189     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 190         """
 191         Perform a regex search on the given string, using a single or a list of
 192         patterns returning the first matching group.
 193         In case of failure return a default value or raise a WARNING or a
 194         ExtractorError, depending on fatal, specifying the field name.
 195         """
 196         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 197             mobj = re.search(pattern, string, flags)
 198         else:
 199             for p in pattern:
 200                 mobj = re.search(p, string, flags)
 201                 if mobj: break
 202
 203         if sys.stderr.isatty() and os.name != 'nt':
 204             _name = u'\033[0;34m%s\033[0m' % name
 205         else:
 206             _name = name
 207
 208         if mobj:
 209             # return the first matching group
 210             return next(g for g in mobj.groups() if g is not None)
 211         elif default is not None:
 212             return default
 213         elif fatal:
 214             raise ExtractorError(u'Unable to extract %s' % _name)
 215         else:
 216             self._downloader.report_warning(u'unable to extract %s; '
 217                 u'please report this issue on http://yt-dl.org/bug' % _name)
 218             return None
 219
 220     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 221         """
 222         Like _search_regex, but strips HTML tags and unescapes entities.
 223         """
 224         res = self._search_regex(pattern, string, name, default, fatal, flags)
 225         if res:
 226             return clean_html(res).strip()
 227         else:
 228             return res
 229
 230 class SearchInfoExtractor(InfoExtractor):
 231     """
 232     Base class for paged search queries extractors.
 233     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 234     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 235     """
 236
 237     @classmethod
 238     def _make_valid_url(cls):
 239         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 240
 241     @classmethod
 242     def suitable(cls, url):
 243         return re.match(cls._make_valid_url(), url) is not None
 244
 245     def _real_extract(self, query):
 246         mobj = re.match(self._make_valid_url(), query)
 247         if mobj is None:
 248             raise ExtractorError(u'Invalid search query "%s"' % query)
 249
 250         prefix = mobj.group('prefix')
 251         query = mobj.group('query')
 252         if prefix == '':
 253             return self._get_n_results(query, 1)
 254         elif prefix == 'all':
 255             return self._get_n_results(query, self._MAX_RESULTS)
 256         else:
 257             n = int(prefix)
 258             if n <= 0:
 259                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 260             elif n > self._MAX_RESULTS:
 261                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 262                 n = self._MAX_RESULTS
 263             return self._get_n_results(query, n)
 264
 265     def _get_n_results(self, query, n):
 266         """Get a specified number of results for a query"""
 267         raise NotImplementedError("This method must be implemented by sublclasses")
 268
 269     @property
 270     def SEARCH_KEY(self):
 271         return self._SEARCH_KEY