_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7
   8 from ..utils import (
   9     compat_http_client,
  10     compat_urllib_error,
  11     compat_urllib_request,
  12     compat_str,
  13
  14     clean_html,
  15     compiled_regex_type,
  16     ExtractorError,
  17     unescapeHTML,
  18 )
  19
  20 class InfoExtractor(object):
  21     """Information Extractor class.
  22
  23     Information extractors are the classes that, given a URL, extract
  24     information about the video (or videos) the URL refers to. This
  25     information includes the real video URL, the video title, author and
  26     others. The information is stored in a dictionary which is then
  27     passed to the FileDownloader. The FileDownloader processes this
  28     information possibly downloading the video to the file system, among
  29     other possible outcomes.
  30
  31     The dictionaries must include the following fields:
  32
  33     id:             Video identifier.
  34     url:            Final video URL.
  35     title:          Video title, unescaped.
  36     ext:            Video filename extension.
  37
  38     The following fields are optional:
  39
  40     format:         The video format, defaults to ext (used for --get-format)
  41     thumbnails:     A list of dictionaries (with the entries "resolution" and
  42                     "url") for the varying thumbnails
  43     thumbnail:      Full URL to a video thumbnail image.
  44     description:    One-line video description.
  45     uploader:       Full name of the video uploader.
  46     upload_date:    Video upload date (YYYYMMDD).
  47     uploader_id:    Nickname or id of the video uploader.
  48     location:       Physical location of the video.
  49     player_url:     SWF Player URL (used for rtmpdump).
  50     subtitles:      The subtitle file contents as a dictionary in the format
  51                     {language: subtitles}.
  52     view_count:     How many users have watched the video on the platform.
  53     urlhandle:      [internal] The urlHandle to be used to download the file,
  54                     like returned by urllib.request.urlopen
  55
  56     The fields should all be Unicode strings.
  57
  58     Subclasses of this one should re-define the _real_initialize() and
  59     _real_extract() methods and define a _VALID_URL regexp.
  60     Probably, they should also be added to the list of extractors.
  61
  62     _real_extract() must return a *list* of information dictionaries as
  63     described above.
  64
  65     Finally, the _WORKING attribute should be set to False for broken IEs
  66     in order to warn the users and skip the tests.
  67     """
  68
  69     _ready = False
  70     _downloader = None
  71     _WORKING = True
  72
  73     def __init__(self, downloader=None):
  74         """Constructor. Receives an optional downloader."""
  75         self._ready = False
  76         self.set_downloader(downloader)
  77
  78     @classmethod
  79     def suitable(cls, url):
  80         """Receives a URL and returns True if suitable for this IE."""
  81         return re.match(cls._VALID_URL, url) is not None
  82
  83     @classmethod
  84     def working(cls):
  85         """Getter method for _WORKING."""
  86         return cls._WORKING
  87
  88     def initialize(self):
  89         """Initializes an instance (authentication, etc)."""
  90         if not self._ready:
  91             self._real_initialize()
  92             self._ready = True
  93
  94     def extract(self, url):
  95         """Extracts URL information and returns it in list of dicts."""
  96         self.initialize()
  97         return self._real_extract(url)
  98
  99     def set_downloader(self, downloader):
 100         """Sets the downloader for this IE."""
 101         self._downloader = downloader
 102
 103     def _real_initialize(self):
 104         """Real initialization process. Redefine in subclasses."""
 105         pass
 106
 107     def _real_extract(self, url):
 108         """Real extraction process. Redefine in subclasses."""
 109         pass
 110
 111     @property
 112     def IE_NAME(self):
 113         return type(self).__name__[:-2]
 114
 115     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 116         """ Returns the response handle """
 117         if note is None:
 118             self.report_download_webpage(video_id)
 119         elif note is not False:
 120             self.to_screen(u'%s: %s' % (video_id, note))
 121         try:
 122             return compat_urllib_request.urlopen(url_or_request)
 123         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 124             if errnote is None:
 125                 errnote = u'Unable to download webpage'
 126             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 127
 128     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 129         """ Returns a tuple (page content as string, URL handle) """
 130
 131         # Strip hashes from the URL (#1038)
 132         if isinstance(url_or_request, (compat_str, str)):
 133             url_or_request = url_or_request.partition('#')[0]
 134
 135         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 136         content_type = urlh.headers.get('Content-Type', '')
 137         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 138         if m:
 139             encoding = m.group(1)
 140         else:
 141             encoding = 'utf-8'
 142         webpage_bytes = urlh.read()
 143         if self._downloader.params.get('dump_intermediate_pages', False):
 144             try:
 145                 url = url_or_request.get_full_url()
 146             except AttributeError:
 147                 url = url_or_request
 148             self.to_screen(u'Dumping request to ' + url)
 149             dump = base64.b64encode(webpage_bytes).decode('ascii')
 150             self._downloader.to_screen(dump)
 151         content = webpage_bytes.decode(encoding, 'replace')
 152         return (content, urlh)
 153
 154     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 155         """ Returns the data of the page as a string """
 156         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 157
 158     def to_screen(self, msg):
 159         """Print msg to screen, prefixing it with '[ie_name]'"""
 160         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 161
 162     def report_extraction(self, id_or_name):
 163         """Report information extraction."""
 164         self.to_screen(u'%s: Extracting information' % id_or_name)
 165
 166     def report_download_webpage(self, video_id):
 167         """Report webpage download."""
 168         self.to_screen(u'%s: Downloading webpage' % video_id)
 169
 170     def report_age_confirmation(self):
 171         """Report attempt to confirm age."""
 172         self.to_screen(u'Confirming age')
 173
 174     def report_login(self):
 175         """Report attempt to log in."""
 176         self.to_screen(u'Logging in')
 177
 178     #Methods for following #608
 179     def url_result(self, url, ie=None):
 180         """Returns a url that points to a page that should be processed"""
 181         #TODO: ie should be the class used for getting the info
 182         video_info = {'_type': 'url',
 183                       'url': url,
 184                       'ie_key': ie}
 185         return video_info
 186     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 187         """Returns a playlist"""
 188         video_info = {'_type': 'playlist',
 189                       'entries': entries}
 190         if playlist_id:
 191             video_info['id'] = playlist_id
 192         if playlist_title:
 193             video_info['title'] = playlist_title
 194         return video_info
 195
 196     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 197         """
 198         Perform a regex search on the given string, using a single or a list of
 199         patterns returning the first matching group.
 200         In case of failure return a default value or raise a WARNING or a
 201         ExtractorError, depending on fatal, specifying the field name.
 202         """
 203         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 204             mobj = re.search(pattern, string, flags)
 205         else:
 206             for p in pattern:
 207                 mobj = re.search(p, string, flags)
 208                 if mobj: break
 209
 210         if sys.stderr.isatty() and os.name != 'nt':
 211             _name = u'\033[0;34m%s\033[0m' % name
 212         else:
 213             _name = name
 214
 215         if mobj:
 216             # return the first matching group
 217             return next(g for g in mobj.groups() if g is not None)
 218         elif default is not None:
 219             return default
 220         elif fatal:
 221             raise ExtractorError(u'Unable to extract %s' % _name)
 222         else:
 223             self._downloader.report_warning(u'unable to extract %s; '
 224                 u'please report this issue on http://yt-dl.org/bug' % _name)
 225             return None
 226
 227     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 228         """
 229         Like _search_regex, but strips HTML tags and unescapes entities.
 230         """
 231         res = self._search_regex(pattern, string, name, default, fatal, flags)
 232         if res:
 233             return clean_html(res).strip()
 234         else:
 235             return res
 236
 237     def _get_login_info(self):
 238         """
 239         Get the the login info as (username, password)
 240         It will look in the netrc file using the _NETRC_MACHINE value
 241         If there's no info available, return (None, None)
 242         """
 243         if self._downloader is None:
 244             return (None, None)
 245
 246         username = None
 247         password = None
 248         downloader_params = self._downloader.params
 249
 250         # Attempt to use provided username and password or .netrc data
 251         if downloader_params.get('username', None) is not None:
 252             username = downloader_params['username']
 253             password = downloader_params['password']
 254         elif downloader_params.get('usenetrc', False):
 255             try:
 256                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 257                 if info is not None:
 258                     username = info[0]
 259                     password = info[2]
 260                 else:
 261                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 262             except (IOError, netrc.NetrcParseError) as err:
 263                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 264
 265         return (username, password)
 266
 267     # Helper functions for extracting OpenGraph info
 268     @staticmethod
 269     def _og_regex(prop):
 270         return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
 271
 272     def _og_search_property(self, prop, html, name=None, **kargs):
 273         if name is None:
 274             name = 'OpenGraph %s' % prop
 275         escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
 276         return unescapeHTML(escaped)
 277
 278     def _og_search_thumbnail(self, html, **kargs):
 279         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 280
 281     def _og_search_description(self, html, **kargs):
 282         return self._og_search_property('description', html, fatal=False, **kargs)
 283
 284     def _og_search_title(self, html, **kargs):
 285         return self._og_search_property('title', html, **kargs)
 286
 287     def _og_search_video_url(self, html, name='video url', **kargs):
 288         return self._html_search_regex([self._og_regex('video:secure_url'),
 289                                         self._og_regex('video')],
 290                                        html, name, **kargs)
 291
 292 class SearchInfoExtractor(InfoExtractor):
 293     """
 294     Base class for paged search queries extractors.
 295     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 296     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 297     """
 298
 299     @classmethod
 300     def _make_valid_url(cls):
 301         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 302
 303     @classmethod
 304     def suitable(cls, url):
 305         return re.match(cls._make_valid_url(), url) is not None
 306
 307     def _real_extract(self, query):
 308         mobj = re.match(self._make_valid_url(), query)
 309         if mobj is None:
 310             raise ExtractorError(u'Invalid search query "%s"' % query)
 311
 312         prefix = mobj.group('prefix')
 313         query = mobj.group('query')
 314         if prefix == '':
 315             return self._get_n_results(query, 1)
 316         elif prefix == 'all':
 317             return self._get_n_results(query, self._MAX_RESULTS)
 318         else:
 319             n = int(prefix)
 320             if n <= 0:
 321                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 322             elif n > self._MAX_RESULTS:
 323                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 324                 n = self._MAX_RESULTS
 325             return self._get_n_results(query, n)
 326
 327     def _get_n_results(self, query, n):
 328         """Get a specified number of results for a query"""
 329         raise NotImplementedError("This method must be implemented by sublclasses")
 330
 331     @property
 332     def SEARCH_KEY(self):
 333         return self._SEARCH_KEY