_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7 import xml.etree.ElementTree
   8
   9 from ..utils import (
  10     compat_http_client,
  11     compat_urllib_error,
  12     compat_str,
  13
  14     clean_html,
  15     compiled_regex_type,
  16     ExtractorError,
  17     RegexNotFoundError,
  18     sanitize_filename,
  19     unescapeHTML,
  20 )
  21
  22
  23 class InfoExtractor(object):
  24     """Information Extractor class.
  25
  26     Information extractors are the classes that, given a URL, extract
  27     information about the video (or videos) the URL refers to. This
  28     information includes the real video URL, the video title, author and
  29     others. The information is stored in a dictionary which is then
  30     passed to the FileDownloader. The FileDownloader processes this
  31     information possibly downloading the video to the file system, among
  32     other possible outcomes.
  33
  34     The dictionaries must include the following fields:
  35
  36     id:             Video identifier.
  37     url:            Final video URL.
  38     title:          Video title, unescaped.
  39     ext:            Video filename extension.
  40
  41     Instead of url and ext, formats can also specified.
  42
  43     The following fields are optional:
  44
  45     format:         The video format, defaults to ext (used for --get-format)
  46     thumbnails:     A list of dictionaries (with the entries "resolution" and
  47                     "url") for the varying thumbnails
  48     thumbnail:      Full URL to a video thumbnail image.
  49     description:    One-line video description.
  50     uploader:       Full name of the video uploader.
  51     upload_date:    Video upload date (YYYYMMDD).
  52     uploader_id:    Nickname or id of the video uploader.
  53     location:       Physical location of the video.
  54     player_url:     SWF Player URL (used for rtmpdump).
  55     subtitles:      The subtitle file contents as a dictionary in the format
  56                     {language: subtitles}.
  57     view_count:     How many users have watched the video on the platform.
  58     urlhandle:      [internal] The urlHandle to be used to download the file,
  59                     like returned by urllib.request.urlopen
  60     age_limit:      Age restriction for the video, as an integer (years)
  61     formats:        A list of dictionaries for each format available, it must
  62                     be ordered from worst to best quality. Potential fields:
  63                     * url       Mandatory. The URL of the video file
  64                     * ext       Will be calculated from url if missing
  65                     * format    A human-readable description of the format
  66                                 ("mp4 container with h264/opus").
  67                                 Calculated from the format_id, width, height.
  68                                 and format_note fields if missing.
  69                     * format_id A short description of the format
  70                                 ("mp4_h264_opus" or "19")
  71                     * format_note Additional info about the format
  72                                 ("3D" or "DASH video")
  73                     * width     Width of the video, if known
  74                     * height    Height of the video, if known
  75                     * abr       Average audio bitrate in KBit/s
  76                     * acodec    Name of the audio codec in use
  77                     * vbr       Average video bitrate in KBit/s
  78                     * vcodec    Name of the video codec in use
  79     webpage_url:    The url to the video webpage, if given to youtube-dl it
  80                     should allow to get the same result again. (It will be set
  81                     by YoutubeDL if it's missing)
  82
  83     Unless mentioned otherwise, the fields should be Unicode strings.
  84
  85     Subclasses of this one should re-define the _real_initialize() and
  86     _real_extract() methods and define a _VALID_URL regexp.
  87     Probably, they should also be added to the list of extractors.
  88
  89     _real_extract() must return a *list* of information dictionaries as
  90     described above.
  91
  92     Finally, the _WORKING attribute should be set to False for broken IEs
  93     in order to warn the users and skip the tests.
  94     """
  95
  96     _ready = False
  97     _downloader = None
  98     _WORKING = True
  99
 100     def __init__(self, downloader=None):
 101         """Constructor. Receives an optional downloader."""
 102         self._ready = False
 103         self.set_downloader(downloader)
 104
 105     @classmethod
 106     def suitable(cls, url):
 107         """Receives a URL and returns True if suitable for this IE."""
 108
 109         # This does not use has/getattr intentionally - we want to know whether
 110         # we have cached the regexp for *this* class, whereas getattr would also
 111         # match the superclass
 112         if '_VALID_URL_RE' not in cls.__dict__:
 113             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 114         return cls._VALID_URL_RE.match(url) is not None
 115
 116     @classmethod
 117     def working(cls):
 118         """Getter method for _WORKING."""
 119         return cls._WORKING
 120
 121     def initialize(self):
 122         """Initializes an instance (authentication, etc)."""
 123         if not self._ready:
 124             self._real_initialize()
 125             self._ready = True
 126
 127     def extract(self, url):
 128         """Extracts URL information and returns it in list of dicts."""
 129         self.initialize()
 130         return self._real_extract(url)
 131
 132     def set_downloader(self, downloader):
 133         """Sets the downloader for this IE."""
 134         self._downloader = downloader
 135
 136     def _real_initialize(self):
 137         """Real initialization process. Redefine in subclasses."""
 138         pass
 139
 140     def _real_extract(self, url):
 141         """Real extraction process. Redefine in subclasses."""
 142         pass
 143
 144     @classmethod
 145     def ie_key(cls):
 146         """A string for getting the InfoExtractor with get_info_extractor"""
 147         return cls.__name__[:-2]
 148
 149     @property
 150     def IE_NAME(self):
 151         return type(self).__name__[:-2]
 152
 153     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 154         """ Returns the response handle """
 155         if note is None:
 156             self.report_download_webpage(video_id)
 157         elif note is not False:
 158             self.to_screen(u'%s: %s' % (video_id, note))
 159         try:
 160             return self._downloader.urlopen(url_or_request)
 161         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 162             if errnote is None:
 163                 errnote = u'Unable to download webpage'
 164             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
 165
 166     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 167         """ Returns a tuple (page content as string, URL handle) """
 168
 169         # Strip hashes from the URL (#1038)
 170         if isinstance(url_or_request, (compat_str, str)):
 171             url_or_request = url_or_request.partition('#')[0]
 172
 173         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 174         content_type = urlh.headers.get('Content-Type', '')
 175         webpage_bytes = urlh.read()
 176         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 177         if m:
 178             encoding = m.group(1)
 179         else:
 180             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 181                           webpage_bytes[:1024])
 182             if m:
 183                 encoding = m.group(1).decode('ascii')
 184             else:
 185                 encoding = 'utf-8'
 186         if self._downloader.params.get('dump_intermediate_pages', False):
 187             try:
 188                 url = url_or_request.get_full_url()
 189             except AttributeError:
 190                 url = url_or_request
 191             self.to_screen(u'Dumping request to ' + url)
 192             dump = base64.b64encode(webpage_bytes).decode('ascii')
 193             self._downloader.to_screen(dump)
 194         if self._downloader.params.get('write_pages', False):
 195             try:
 196                 url = url_or_request.get_full_url()
 197             except AttributeError:
 198                 url = url_or_request
 199             raw_filename = ('%s_%s.dump' % (video_id, url))
 200             filename = sanitize_filename(raw_filename, restricted=True)
 201             self.to_screen(u'Saving request to ' + filename)
 202             with open(filename, 'wb') as outf:
 203                 outf.write(webpage_bytes)
 204
 205         content = webpage_bytes.decode(encoding, 'replace')
 206         return (content, urlh)
 207
 208     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 209         """ Returns the data of the page as a string """
 210         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 211
 212     def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to downloand XML'):
 213         """Return the xml as an xml.etree.ElementTree.Element"""
 214         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
 215         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 216
 217     def to_screen(self, msg):
 218         """Print msg to screen, prefixing it with '[ie_name]'"""
 219         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 220
 221     def report_extraction(self, id_or_name):
 222         """Report information extraction."""
 223         self.to_screen(u'%s: Extracting information' % id_or_name)
 224
 225     def report_download_webpage(self, video_id):
 226         """Report webpage download."""
 227         self.to_screen(u'%s: Downloading webpage' % video_id)
 228
 229     def report_age_confirmation(self):
 230         """Report attempt to confirm age."""
 231         self.to_screen(u'Confirming age')
 232
 233     def report_login(self):
 234         """Report attempt to log in."""
 235         self.to_screen(u'Logging in')
 236
 237     #Methods for following #608
 238     def url_result(self, url, ie=None, video_id=None):
 239         """Returns a url that points to a page that should be processed"""
 240         #TODO: ie should be the class used for getting the info
 241         video_info = {'_type': 'url',
 242                       'url': url,
 243                       'ie_key': ie}
 244         if video_id is not None:
 245             video_info['id'] = video_id
 246         return video_info
 247     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 248         """Returns a playlist"""
 249         video_info = {'_type': 'playlist',
 250                       'entries': entries}
 251         if playlist_id:
 252             video_info['id'] = playlist_id
 253         if playlist_title:
 254             video_info['title'] = playlist_title
 255         return video_info
 256
 257     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 258         """
 259         Perform a regex search on the given string, using a single or a list of
 260         patterns returning the first matching group.
 261         In case of failure return a default value or raise a WARNING or a
 262         RegexNotFoundError, depending on fatal, specifying the field name.
 263         """
 264         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 265             mobj = re.search(pattern, string, flags)
 266         else:
 267             for p in pattern:
 268                 mobj = re.search(p, string, flags)
 269                 if mobj: break
 270
 271         if sys.stderr.isatty() and os.name != 'nt':
 272             _name = u'\033[0;34m%s\033[0m' % name
 273         else:
 274             _name = name
 275
 276         if mobj:
 277             # return the first matching group
 278             return next(g for g in mobj.groups() if g is not None)
 279         elif default is not None:
 280             return default
 281         elif fatal:
 282             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 283         else:
 284             self._downloader.report_warning(u'unable to extract %s; '
 285                 u'please report this issue on http://yt-dl.org/bug' % _name)
 286             return None
 287
 288     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 289         """
 290         Like _search_regex, but strips HTML tags and unescapes entities.
 291         """
 292         res = self._search_regex(pattern, string, name, default, fatal, flags)
 293         if res:
 294             return clean_html(res).strip()
 295         else:
 296             return res
 297
 298     def _get_login_info(self):
 299         """
 300         Get the the login info as (username, password)
 301         It will look in the netrc file using the _NETRC_MACHINE value
 302         If there's no info available, return (None, None)
 303         """
 304         if self._downloader is None:
 305             return (None, None)
 306
 307         username = None
 308         password = None
 309         downloader_params = self._downloader.params
 310
 311         # Attempt to use provided username and password or .netrc data
 312         if downloader_params.get('username', None) is not None:
 313             username = downloader_params['username']
 314             password = downloader_params['password']
 315         elif downloader_params.get('usenetrc', False):
 316             try:
 317                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 318                 if info is not None:
 319                     username = info[0]
 320                     password = info[2]
 321                 else:
 322                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 323             except (IOError, netrc.NetrcParseError) as err:
 324                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 325
 326         return (username, password)
 327
 328     # Helper functions for extracting OpenGraph info
 329     @staticmethod
 330     def _og_regexes(prop):
 331         content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
 332         property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
 333         template = r'<meta[^>]+?%s[^>]+?%s'
 334         return [
 335             template % (property_re, content_re),
 336             template % (content_re, property_re),
 337         ]
 338
 339     def _og_search_property(self, prop, html, name=None, **kargs):
 340         if name is None:
 341             name = 'OpenGraph %s' % prop
 342         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 343         if escaped is None:
 344             return None
 345         return unescapeHTML(escaped)
 346
 347     def _og_search_thumbnail(self, html, **kargs):
 348         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 349
 350     def _og_search_description(self, html, **kargs):
 351         return self._og_search_property('description', html, fatal=False, **kargs)
 352
 353     def _og_search_title(self, html, **kargs):
 354         return self._og_search_property('title', html, **kargs)
 355
 356     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 357         regexes = self._og_regexes('video')
 358         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 359         return self._html_search_regex(regexes, html, name, **kargs)
 360
 361     def _html_search_meta(self, name, html, display_name=None):
 362         if display_name is None:
 363             display_name = name
 364         return self._html_search_regex(
 365             r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
 366                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 367             html, display_name, fatal=False)
 368
 369     def _dc_search_uploader(self, html):
 370         return self._html_search_meta('dc.creator', html, 'uploader')
 371
 372     def _rta_search(self, html):
 373         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 374         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 375                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 376                      html):
 377             return 18
 378         return 0
 379
 380     def _media_rating_search(self, html):
 381         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 382         rating = self._html_search_meta('rating', html)
 383
 384         if not rating:
 385             return None
 386
 387         RATING_TABLE = {
 388             'safe for kids': 0,
 389             'general': 8,
 390             '14 years': 14,
 391             'mature': 17,
 392             'restricted': 19,
 393         }
 394         return RATING_TABLE.get(rating.lower(), None)
 395
 396
 397
 398 class SearchInfoExtractor(InfoExtractor):
 399     """
 400     Base class for paged search queries extractors.
 401     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 402     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 403     """
 404
 405     @classmethod
 406     def _make_valid_url(cls):
 407         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 408
 409     @classmethod
 410     def suitable(cls, url):
 411         return re.match(cls._make_valid_url(), url) is not None
 412
 413     def _real_extract(self, query):
 414         mobj = re.match(self._make_valid_url(), query)
 415         if mobj is None:
 416             raise ExtractorError(u'Invalid search query "%s"' % query)
 417
 418         prefix = mobj.group('prefix')
 419         query = mobj.group('query')
 420         if prefix == '':
 421             return self._get_n_results(query, 1)
 422         elif prefix == 'all':
 423             return self._get_n_results(query, self._MAX_RESULTS)
 424         else:
 425             n = int(prefix)
 426             if n <= 0:
 427                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 428             elif n > self._MAX_RESULTS:
 429                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 430                 n = self._MAX_RESULTS
 431             return self._get_n_results(query, n)
 432
 433     def _get_n_results(self, query, n):
 434         """Get a specified number of results for a query"""
 435         raise NotImplementedError("This method must be implemented by subclasses")
 436
 437     @property
 438     def SEARCH_KEY(self):
 439         return self._SEARCH_KEY