_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7 import xml.etree.ElementTree
   8
   9 from ..utils import (
  10     compat_http_client,
  11     compat_urllib_error,
  12     compat_urllib_request,
  13     compat_str,
  14
  15     clean_html,
  16     compiled_regex_type,
  17     ExtractorError,
  18     RegexNotFoundError,
  19     sanitize_filename,
  20     unescapeHTML,
  21 )
  22
  23 class InfoExtractor(object):
  24     """Information Extractor class.
  25
  26     Information extractors are the classes that, given a URL, extract
  27     information about the video (or videos) the URL refers to. This
  28     information includes the real video URL, the video title, author and
  29     others. The information is stored in a dictionary which is then
  30     passed to the FileDownloader. The FileDownloader processes this
  31     information possibly downloading the video to the file system, among
  32     other possible outcomes.
  33
  34     The dictionaries must include the following fields:
  35
  36     id:             Video identifier.
  37     url:            Final video URL.
  38     title:          Video title, unescaped.
  39     ext:            Video filename extension.
  40
  41     Instead of url and ext, formats can also specified.
  42
  43     The following fields are optional:
  44
  45     format:         The video format, defaults to ext (used for --get-format)
  46     thumbnails:     A list of dictionaries (with the entries "resolution" and
  47                     "url") for the varying thumbnails
  48     thumbnail:      Full URL to a video thumbnail image.
  49     description:    One-line video description.
  50     uploader:       Full name of the video uploader.
  51     upload_date:    Video upload date (YYYYMMDD).
  52     uploader_id:    Nickname or id of the video uploader.
  53     location:       Physical location of the video.
  54     player_url:     SWF Player URL (used for rtmpdump).
  55     subtitles:      The subtitle file contents as a dictionary in the format
  56                     {language: subtitles}.
  57     view_count:     How many users have watched the video on the platform.
  58     urlhandle:      [internal] The urlHandle to be used to download the file,
  59                     like returned by urllib.request.urlopen
  60     age_limit:      Age restriction for the video, as an integer (years)
  61     formats:        A list of dictionaries for each format available, it must
  62                     be ordered from worst to best quality. Potential fields:
  63                     * url       Mandatory. The URL of the video file
  64                     * ext       Will be calculated from url if missing
  65                     * format    A human-readable description of the format
  66                                 ("mp4 container with h264/opus").
  67                                 Calculated from the format_id, width, height.
  68                                 and format_note fields if missing.
  69                     * format_id A short description of the format
  70                                 ("mp4_h264_opus" or "19")
  71                     * format_note Additional info about the format
  72                                 ("3D" or "DASH video")
  73                     * width     Width of the video, if known
  74                     * height    Height of the video, if known
  75                     * abr       Average audio bitrate in KBit/s
  76                     * acodec    Name of the audio codec in use
  77                     * vbr       Average video bitrate in KBit/s
  78                     * vcodec    Name of the video codec in use
  79                     * quality_name Human-readable name of the video quality.
  80                     * filesize  The number of bytes, if known in advance
  81     webpage_url:    The url to the video webpage, if given to youtube-dl it
  82                     should allow to get the same result again. (It will be set
  83                     by YoutubeDL if it's missing)
  84
  85     Unless mentioned otherwise, the fields should be Unicode strings.
  86
  87     Subclasses of this one should re-define the _real_initialize() and
  88     _real_extract() methods and define a _VALID_URL regexp.
  89     Probably, they should also be added to the list of extractors.
  90
  91     _real_extract() must return a *list* of information dictionaries as
  92     described above.
  93
  94     Finally, the _WORKING attribute should be set to False for broken IEs
  95     in order to warn the users and skip the tests.
  96     """
  97
  98     _ready = False
  99     _downloader = None
 100     _WORKING = True
 101
 102     def __init__(self, downloader=None):
 103         """Constructor. Receives an optional downloader."""
 104         self._ready = False
 105         self.set_downloader(downloader)
 106
 107     @classmethod
 108     def suitable(cls, url):
 109         """Receives a URL and returns True if suitable for this IE."""
 110
 111         # This does not use has/getattr intentionally - we want to know whether
 112         # we have cached the regexp for *this* class, whereas getattr would also
 113         # match the superclass
 114         if '_VALID_URL_RE' not in cls.__dict__:
 115             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 116         return cls._VALID_URL_RE.match(url) is not None
 117
 118     @classmethod
 119     def working(cls):
 120         """Getter method for _WORKING."""
 121         return cls._WORKING
 122
 123     def initialize(self):
 124         """Initializes an instance (authentication, etc)."""
 125         if not self._ready:
 126             self._real_initialize()
 127             self._ready = True
 128
 129     def extract(self, url):
 130         """Extracts URL information and returns it in list of dicts."""
 131         self.initialize()
 132         return self._real_extract(url)
 133
 134     def set_downloader(self, downloader):
 135         """Sets the downloader for this IE."""
 136         self._downloader = downloader
 137
 138     def _real_initialize(self):
 139         """Real initialization process. Redefine in subclasses."""
 140         pass
 141
 142     def _real_extract(self, url):
 143         """Real extraction process. Redefine in subclasses."""
 144         pass
 145
 146     @classmethod
 147     def ie_key(cls):
 148         """A string for getting the InfoExtractor with get_info_extractor"""
 149         return cls.__name__[:-2]
 150
 151     @property
 152     def IE_NAME(self):
 153         return type(self).__name__[:-2]
 154
 155     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 156         """ Returns the response handle """
 157         if note is None:
 158             self.report_download_webpage(video_id)
 159         elif note is not False:
 160             self.to_screen(u'%s: %s' % (video_id, note))
 161         try:
 162             return compat_urllib_request.urlopen(url_or_request)
 163         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 164             if errnote is None:
 165                 errnote = u'Unable to download webpage'
 166             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
 167
 168     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 169         """ Returns a tuple (page content as string, URL handle) """
 170
 171         # Strip hashes from the URL (#1038)
 172         if isinstance(url_or_request, (compat_str, str)):
 173             url_or_request = url_or_request.partition('#')[0]
 174
 175         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 176         content_type = urlh.headers.get('Content-Type', '')
 177         webpage_bytes = urlh.read()
 178         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 179         if m:
 180             encoding = m.group(1)
 181         else:
 182             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 183                           webpage_bytes[:1024])
 184             if m:
 185                 encoding = m.group(1).decode('ascii')
 186             else:
 187                 encoding = 'utf-8'
 188         if self._downloader.params.get('dump_intermediate_pages', False):
 189             try:
 190                 url = url_or_request.get_full_url()
 191             except AttributeError:
 192                 url = url_or_request
 193             self.to_screen(u'Dumping request to ' + url)
 194             dump = base64.b64encode(webpage_bytes).decode('ascii')
 195             self._downloader.to_screen(dump)
 196         if self._downloader.params.get('write_pages', False):
 197             try:
 198                 url = url_or_request.get_full_url()
 199             except AttributeError:
 200                 url = url_or_request
 201             raw_filename = ('%s_%s.dump' % (video_id, url))
 202             filename = sanitize_filename(raw_filename, restricted=True)
 203             self.to_screen(u'Saving request to ' + filename)
 204             with open(filename, 'wb') as outf:
 205                 outf.write(webpage_bytes)
 206
 207         content = webpage_bytes.decode(encoding, 'replace')
 208         return (content, urlh)
 209
 210     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 211         """ Returns the data of the page as a string """
 212         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 213
 214     def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to downloand XML'):
 215         """Return the xml as an xml.etree.ElementTree.Element"""
 216         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
 217         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 218
 219     def to_screen(self, msg):
 220         """Print msg to screen, prefixing it with '[ie_name]'"""
 221         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 222
 223     def report_extraction(self, id_or_name):
 224         """Report information extraction."""
 225         self.to_screen(u'%s: Extracting information' % id_or_name)
 226
 227     def report_download_webpage(self, video_id):
 228         """Report webpage download."""
 229         self.to_screen(u'%s: Downloading webpage' % video_id)
 230
 231     def report_age_confirmation(self):
 232         """Report attempt to confirm age."""
 233         self.to_screen(u'Confirming age')
 234
 235     def report_login(self):
 236         """Report attempt to log in."""
 237         self.to_screen(u'Logging in')
 238
 239     #Methods for following #608
 240     def url_result(self, url, ie=None, video_id=None):
 241         """Returns a url that points to a page that should be processed"""
 242         #TODO: ie should be the class used for getting the info
 243         video_info = {'_type': 'url',
 244                       'url': url,
 245                       'ie_key': ie}
 246         if video_id is not None:
 247             video_info['id'] = video_id
 248         return video_info
 249     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 250         """Returns a playlist"""
 251         video_info = {'_type': 'playlist',
 252                       'entries': entries}
 253         if playlist_id:
 254             video_info['id'] = playlist_id
 255         if playlist_title:
 256             video_info['title'] = playlist_title
 257         return video_info
 258
 259     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 260         """
 261         Perform a regex search on the given string, using a single or a list of
 262         patterns returning the first matching group.
 263         In case of failure return a default value or raise a WARNING or a
 264         RegexNotFoundError, depending on fatal, specifying the field name.
 265         """
 266         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 267             mobj = re.search(pattern, string, flags)
 268         else:
 269             for p in pattern:
 270                 mobj = re.search(p, string, flags)
 271                 if mobj: break
 272
 273         if sys.stderr.isatty() and os.name != 'nt':
 274             _name = u'\033[0;34m%s\033[0m' % name
 275         else:
 276             _name = name
 277
 278         if mobj:
 279             # return the first matching group
 280             return next(g for g in mobj.groups() if g is not None)
 281         elif default is not None:
 282             return default
 283         elif fatal:
 284             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 285         else:
 286             self._downloader.report_warning(u'unable to extract %s; '
 287                 u'please report this issue on http://yt-dl.org/bug' % _name)
 288             return None
 289
 290     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 291         """
 292         Like _search_regex, but strips HTML tags and unescapes entities.
 293         """
 294         res = self._search_regex(pattern, string, name, default, fatal, flags)
 295         if res:
 296             return clean_html(res).strip()
 297         else:
 298             return res
 299
 300     def _get_login_info(self):
 301         """
 302         Get the the login info as (username, password)
 303         It will look in the netrc file using the _NETRC_MACHINE value
 304         If there's no info available, return (None, None)
 305         """
 306         if self._downloader is None:
 307             return (None, None)
 308
 309         username = None
 310         password = None
 311         downloader_params = self._downloader.params
 312
 313         # Attempt to use provided username and password or .netrc data
 314         if downloader_params.get('username', None) is not None:
 315             username = downloader_params['username']
 316             password = downloader_params['password']
 317         elif downloader_params.get('usenetrc', False):
 318             try:
 319                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 320                 if info is not None:
 321                     username = info[0]
 322                     password = info[2]
 323                 else:
 324                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 325             except (IOError, netrc.NetrcParseError) as err:
 326                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 327
 328         return (username, password)
 329
 330     # Helper functions for extracting OpenGraph info
 331     @staticmethod
 332     def _og_regexes(prop):
 333         content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
 334         property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
 335         template = r'<meta[^>]+?%s[^>]+?%s'
 336         return [
 337             template % (property_re, content_re),
 338             template % (content_re, property_re),
 339         ]
 340
 341     def _og_search_property(self, prop, html, name=None, **kargs):
 342         if name is None:
 343             name = 'OpenGraph %s' % prop
 344         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 345         if escaped is None:
 346             return None
 347         return unescapeHTML(escaped)
 348
 349     def _og_search_thumbnail(self, html, **kargs):
 350         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 351
 352     def _og_search_description(self, html, **kargs):
 353         return self._og_search_property('description', html, fatal=False, **kargs)
 354
 355     def _og_search_title(self, html, **kargs):
 356         return self._og_search_property('title', html, **kargs)
 357
 358     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 359         regexes = self._og_regexes('video')
 360         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 361         return self._html_search_regex(regexes, html, name, **kargs)
 362
 363     def _html_search_meta(self, name, html, display_name=None):
 364         if display_name is None:
 365             display_name = name
 366         return self._html_search_regex(
 367             r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
 368                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 369             html, display_name, fatal=False)
 370
 371     def _dc_search_uploader(self, html):
 372         return self._html_search_meta('dc.creator', html, 'uploader')
 373
 374     def _rta_search(self, html):
 375         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 376         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 377                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 378                      html):
 379             return 18
 380         return 0
 381
 382     def _media_rating_search(self, html):
 383         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 384         rating = self._html_search_meta('rating', html)
 385
 386         if not rating:
 387             return None
 388
 389         RATING_TABLE = {
 390             'safe for kids': 0,
 391             'general': 8,
 392             '14 years': 14,
 393             'mature': 17,
 394             'restricted': 19,
 395         }
 396         return RATING_TABLE.get(rating.lower(), None)
 397
 398
 399
 400 class SearchInfoExtractor(InfoExtractor):
 401     """
 402     Base class for paged search queries extractors.
 403     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 404     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 405     """
 406
 407     @classmethod
 408     def _make_valid_url(cls):
 409         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 410
 411     @classmethod
 412     def suitable(cls, url):
 413         return re.match(cls._make_valid_url(), url) is not None
 414
 415     def _real_extract(self, query):
 416         mobj = re.match(self._make_valid_url(), query)
 417         if mobj is None:
 418             raise ExtractorError(u'Invalid search query "%s"' % query)
 419
 420         prefix = mobj.group('prefix')
 421         query = mobj.group('query')
 422         if prefix == '':
 423             return self._get_n_results(query, 1)
 424         elif prefix == 'all':
 425             return self._get_n_results(query, self._MAX_RESULTS)
 426         else:
 427             n = int(prefix)
 428             if n <= 0:
 429                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 430             elif n > self._MAX_RESULTS:
 431                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 432                 n = self._MAX_RESULTS
 433             return self._get_n_results(query, n)
 434
 435     def _get_n_results(self, query, n):
 436         """Get a specified number of results for a query"""
 437         raise NotImplementedError("This method must be implemented by subclasses")
 438
 439     @property
 440     def SEARCH_KEY(self):
 441         return self._SEARCH_KEY