_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7 import xml.etree.ElementTree
   8
   9 from ..utils import (
  10     compat_http_client,
  11     compat_urllib_error,
  12     compat_urllib_parse_urlparse,
  13     compat_str,
  14
  15     clean_html,
  16     compiled_regex_type,
  17     ExtractorError,
  18     RegexNotFoundError,
  19     sanitize_filename,
  20     unescapeHTML,
  21 )
  22 _NO_DEFAULT = object()
  23
  24
  25 class InfoExtractor(object):
  26     """Information Extractor class.
  27
  28     Information extractors are the classes that, given a URL, extract
  29     information about the video (or videos) the URL refers to. This
  30     information includes the real video URL, the video title, author and
  31     others. The information is stored in a dictionary which is then
  32     passed to the FileDownloader. The FileDownloader processes this
  33     information possibly downloading the video to the file system, among
  34     other possible outcomes.
  35
  36     The dictionaries must include the following fields:
  37
  38     id:             Video identifier.
  39     title:          Video title, unescaped.
  40
  41     Additionally, it must contain either a formats entry or a url one:
  42
  43     formats:        A list of dictionaries for each format available, ordered
  44                     from worst to best quality.
  45
  46                     Potential fields:
  47                     * url        Mandatory. The URL of the video file
  48                     * ext        Will be calculated from url if missing
  49                     * format     A human-readable description of the format
  50                                  ("mp4 container with h264/opus").
  51                                  Calculated from the format_id, width, height.
  52                                  and format_note fields if missing.
  53                     * format_id  A short description of the format
  54                                  ("mp4_h264_opus" or "19")
  55                     * format_note Additional info about the format
  56                                  ("3D" or "DASH video")
  57                     * width      Width of the video, if known
  58                     * height     Height of the video, if known
  59                     * resolution Textual description of width and height
  60                     * abr        Average audio bitrate in KBit/s
  61                     * acodec     Name of the audio codec in use
  62                     * vbr        Average video bitrate in KBit/s
  63                     * vcodec     Name of the video codec in use
  64                     * filesize   The number of bytes, if known in advance
  65                     * player_url SWF Player URL (used for rtmpdump).
  66                     * protocol   The protocol that will be used for the actual
  67                                  download, lower-case.
  68                                  "http", "https", "rtsp", "rtmp" or so.
  69                     * preference Order number of this format. If this field is
  70                                  present, the formats get sorted by this field.
  71                                  -1 for default (order by other properties),
  72                                  -2 or smaller for less than default.
  73     url:            Final video URL.
  74     ext:            Video filename extension.
  75     format:         The video format, defaults to ext (used for --get-format)
  76     player_url:     SWF Player URL (used for rtmpdump).
  77
  78     The following fields are optional:
  79
  80     thumbnails:     A list of dictionaries (with the entries "resolution" and
  81                     "url") for the varying thumbnails
  82     thumbnail:      Full URL to a video thumbnail image.
  83     description:    One-line video description.
  84     uploader:       Full name of the video uploader.
  85     upload_date:    Video upload date (YYYYMMDD).
  86     uploader_id:    Nickname or id of the video uploader.
  87     location:       Physical location of the video.
  88     subtitles:      The subtitle file contents as a dictionary in the format
  89                     {language: subtitles}.
  90     duration:       Length of the video in seconds, as an integer.
  91     view_count:     How many users have watched the video on the platform.
  92     like_count:     Number of positive ratings of the video
  93     dislike_count:  Number of negative ratings of the video
  94     comment_count:  Number of comments on the video
  95     age_limit:      Age restriction for the video, as an integer (years)
  96     webpage_url:    The url to the video webpage, if given to youtube-dl it
  97                     should allow to get the same result again. (It will be set
  98                     by YoutubeDL if it's missing)
  99
 100     Unless mentioned otherwise, the fields should be Unicode strings.
 101
 102     Subclasses of this one should re-define the _real_initialize() and
 103     _real_extract() methods and define a _VALID_URL regexp.
 104     Probably, they should also be added to the list of extractors.
 105
 106     _real_extract() must return a *list* of information dictionaries as
 107     described above.
 108
 109     Finally, the _WORKING attribute should be set to False for broken IEs
 110     in order to warn the users and skip the tests.
 111     """
 112
 113     _ready = False
 114     _downloader = None
 115     _WORKING = True
 116
 117     def __init__(self, downloader=None):
 118         """Constructor. Receives an optional downloader."""
 119         self._ready = False
 120         self.set_downloader(downloader)
 121
 122     @classmethod
 123     def suitable(cls, url):
 124         """Receives a URL and returns True if suitable for this IE."""
 125
 126         # This does not use has/getattr intentionally - we want to know whether
 127         # we have cached the regexp for *this* class, whereas getattr would also
 128         # match the superclass
 129         if '_VALID_URL_RE' not in cls.__dict__:
 130             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 131         return cls._VALID_URL_RE.match(url) is not None
 132
 133     @classmethod
 134     def working(cls):
 135         """Getter method for _WORKING."""
 136         return cls._WORKING
 137
 138     def initialize(self):
 139         """Initializes an instance (authentication, etc)."""
 140         if not self._ready:
 141             self._real_initialize()
 142             self._ready = True
 143
 144     def extract(self, url):
 145         """Extracts URL information and returns it in list of dicts."""
 146         self.initialize()
 147         return self._real_extract(url)
 148
 149     def set_downloader(self, downloader):
 150         """Sets the downloader for this IE."""
 151         self._downloader = downloader
 152
 153     def _real_initialize(self):
 154         """Real initialization process. Redefine in subclasses."""
 155         pass
 156
 157     def _real_extract(self, url):
 158         """Real extraction process. Redefine in subclasses."""
 159         pass
 160
 161     @classmethod
 162     def ie_key(cls):
 163         """A string for getting the InfoExtractor with get_info_extractor"""
 164         return cls.__name__[:-2]
 165
 166     @property
 167     def IE_NAME(self):
 168         return type(self).__name__[:-2]
 169
 170     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 171         """ Returns the response handle """
 172         if note is None:
 173             self.report_download_webpage(video_id)
 174         elif note is not False:
 175             if video_id is None:
 176                 self.to_screen(u'%s' % (note,))
 177             else:
 178                 self.to_screen(u'%s: %s' % (video_id, note))
 179         try:
 180             return self._downloader.urlopen(url_or_request)
 181         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 182             if errnote is False:
 183                 return False
 184             if errnote is None:
 185                 errnote = u'Unable to download webpage'
 186             errmsg = u'%s: %s' % (errnote, compat_str(err))
 187             if fatal:
 188                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 189             else:
 190                 self._downloader.report_warning(errmsg)
 191                 return False
 192
 193     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 194         """ Returns a tuple (page content as string, URL handle) """
 195
 196         # Strip hashes from the URL (#1038)
 197         if isinstance(url_or_request, (compat_str, str)):
 198             url_or_request = url_or_request.partition('#')[0]
 199
 200         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 201         if urlh is False:
 202             assert not fatal
 203             return False
 204         content_type = urlh.headers.get('Content-Type', '')
 205         webpage_bytes = urlh.read()
 206         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 207         if m:
 208             encoding = m.group(1)
 209         else:
 210             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 211                           webpage_bytes[:1024])
 212             if m:
 213                 encoding = m.group(1).decode('ascii')
 214             else:
 215                 encoding = 'utf-8'
 216         if self._downloader.params.get('dump_intermediate_pages', False):
 217             try:
 218                 url = url_or_request.get_full_url()
 219             except AttributeError:
 220                 url = url_or_request
 221             self.to_screen(u'Dumping request to ' + url)
 222             dump = base64.b64encode(webpage_bytes).decode('ascii')
 223             self._downloader.to_screen(dump)
 224         if self._downloader.params.get('write_pages', False):
 225             try:
 226                 url = url_or_request.get_full_url()
 227             except AttributeError:
 228                 url = url_or_request
 229             raw_filename = ('%s_%s.dump' % (video_id, url))
 230             filename = sanitize_filename(raw_filename, restricted=True)
 231             self.to_screen(u'Saving request to ' + filename)
 232             with open(filename, 'wb') as outf:
 233                 outf.write(webpage_bytes)
 234
 235         content = webpage_bytes.decode(encoding, 'replace')
 236         return (content, urlh)
 237
 238     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 239         """ Returns the data of the page as a string """
 240         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 241         if res is False:
 242             return res
 243         else:
 244             content, _ = res
 245             return content
 246
 247     def _download_xml(self, url_or_request, video_id,
 248                       note=u'Downloading XML', errnote=u'Unable to download XML',
 249                       transform_source=None):
 250         """Return the xml as an xml.etree.ElementTree.Element"""
 251         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
 252         if transform_source:
 253             xml_string = transform_source(xml_string)
 254         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 255
 256     def report_warning(self, msg, video_id=None):
 257         idstr = u'' if video_id is None else u'%s: ' % video_id
 258         self._downloader.report_warning(
 259             u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
 260
 261     def to_screen(self, msg):
 262         """Print msg to screen, prefixing it with '[ie_name]'"""
 263         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 264
 265     def report_extraction(self, id_or_name):
 266         """Report information extraction."""
 267         self.to_screen(u'%s: Extracting information' % id_or_name)
 268
 269     def report_download_webpage(self, video_id):
 270         """Report webpage download."""
 271         self.to_screen(u'%s: Downloading webpage' % video_id)
 272
 273     def report_age_confirmation(self):
 274         """Report attempt to confirm age."""
 275         self.to_screen(u'Confirming age')
 276
 277     def report_login(self):
 278         """Report attempt to log in."""
 279         self.to_screen(u'Logging in')
 280
 281     #Methods for following #608
 282     @staticmethod
 283     def url_result(url, ie=None, video_id=None):
 284         """Returns a url that points to a page that should be processed"""
 285         #TODO: ie should be the class used for getting the info
 286         video_info = {'_type': 'url',
 287                       'url': url,
 288                       'ie_key': ie}
 289         if video_id is not None:
 290             video_info['id'] = video_id
 291         return video_info
 292     @staticmethod
 293     def playlist_result(entries, playlist_id=None, playlist_title=None):
 294         """Returns a playlist"""
 295         video_info = {'_type': 'playlist',
 296                       'entries': entries}
 297         if playlist_id:
 298             video_info['id'] = playlist_id
 299         if playlist_title:
 300             video_info['title'] = playlist_title
 301         return video_info
 302
 303     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 304         """
 305         Perform a regex search on the given string, using a single or a list of
 306         patterns returning the first matching group.
 307         In case of failure return a default value or raise a WARNING or a
 308         RegexNotFoundError, depending on fatal, specifying the field name.
 309         """
 310         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 311             mobj = re.search(pattern, string, flags)
 312         else:
 313             for p in pattern:
 314                 mobj = re.search(p, string, flags)
 315                 if mobj: break
 316
 317         if os.name != 'nt' and sys.stderr.isatty():
 318             _name = u'\033[0;34m%s\033[0m' % name
 319         else:
 320             _name = name
 321
 322         if mobj:
 323             # return the first matching group
 324             return next(g for g in mobj.groups() if g is not None)
 325         elif default is not _NO_DEFAULT:
 326             return default
 327         elif fatal:
 328             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 329         else:
 330             self._downloader.report_warning(u'unable to extract %s; '
 331                 u'please report this issue on http://yt-dl.org/bug' % _name)
 332             return None
 333
 334     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 335         """
 336         Like _search_regex, but strips HTML tags and unescapes entities.
 337         """
 338         res = self._search_regex(pattern, string, name, default, fatal, flags)
 339         if res:
 340             return clean_html(res).strip()
 341         else:
 342             return res
 343
 344     def _get_login_info(self):
 345         """
 346         Get the the login info as (username, password)
 347         It will look in the netrc file using the _NETRC_MACHINE value
 348         If there's no info available, return (None, None)
 349         """
 350         if self._downloader is None:
 351             return (None, None)
 352
 353         username = None
 354         password = None
 355         downloader_params = self._downloader.params
 356
 357         # Attempt to use provided username and password or .netrc data
 358         if downloader_params.get('username', None) is not None:
 359             username = downloader_params['username']
 360             password = downloader_params['password']
 361         elif downloader_params.get('usenetrc', False):
 362             try:
 363                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 364                 if info is not None:
 365                     username = info[0]
 366                     password = info[2]
 367                 else:
 368                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 369             except (IOError, netrc.NetrcParseError) as err:
 370                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 371
 372         return (username, password)
 373
 374     # Helper functions for extracting OpenGraph info
 375     @staticmethod
 376     def _og_regexes(prop):
 377         content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
 378         property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
 379         template = r'<meta[^>]+?%s[^>]+?%s'
 380         return [
 381             template % (property_re, content_re),
 382             template % (content_re, property_re),
 383         ]
 384
 385     def _og_search_property(self, prop, html, name=None, **kargs):
 386         if name is None:
 387             name = 'OpenGraph %s' % prop
 388         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 389         if escaped is None:
 390             return None
 391         return unescapeHTML(escaped)
 392
 393     def _og_search_thumbnail(self, html, **kargs):
 394         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 395
 396     def _og_search_description(self, html, **kargs):
 397         return self._og_search_property('description', html, fatal=False, **kargs)
 398
 399     def _og_search_title(self, html, **kargs):
 400         return self._og_search_property('title', html, **kargs)
 401
 402     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 403         regexes = self._og_regexes('video')
 404         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 405         return self._html_search_regex(regexes, html, name, **kargs)
 406
 407     def _html_search_meta(self, name, html, display_name=None):
 408         if display_name is None:
 409             display_name = name
 410         return self._html_search_regex(
 411             r'''(?ix)<meta
 412                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 413                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 414             html, display_name, fatal=False)
 415
 416     def _dc_search_uploader(self, html):
 417         return self._html_search_meta('dc.creator', html, 'uploader')
 418
 419     def _rta_search(self, html):
 420         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 421         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 422                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 423                      html):
 424             return 18
 425         return 0
 426
 427     def _media_rating_search(self, html):
 428         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 429         rating = self._html_search_meta('rating', html)
 430
 431         if not rating:
 432             return None
 433
 434         RATING_TABLE = {
 435             'safe for kids': 0,
 436             'general': 8,
 437             '14 years': 14,
 438             'mature': 17,
 439             'restricted': 19,
 440         }
 441         return RATING_TABLE.get(rating.lower(), None)
 442
 443     def _sort_formats(self, formats):
 444         def _formats_key(f):
 445             # TODO remove the following workaround
 446             from ..utils import determine_ext
 447             if not f.get('ext') and 'url' in f:
 448                 f['ext'] = determine_ext(f['url'])
 449
 450             preference = f.get('preference')
 451             if preference is None:
 452                 proto = f.get('protocol')
 453                 if proto is None:
 454                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 455
 456                 preference = 0 if proto in ['http', 'https'] else -0.1
 457                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 458                     preference -= 0.5
 459
 460             if f.get('vcodec') == 'none':  # audio only
 461                 if self._downloader.params.get('prefer_free_formats'):
 462                     ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
 463                 else:
 464                     ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
 465                 ext_preference = 0
 466                 try:
 467                     audio_ext_preference = ORDER.index(f['ext'])
 468                 except ValueError:
 469                     audio_ext_preference = -1
 470             else:
 471                 if self._downloader.params.get('prefer_free_formats'):
 472                     ORDER = [u'flv', u'mp4', u'webm']
 473                 else:
 474                     ORDER = [u'webm', u'flv', u'mp4']
 475                 try:
 476                     ext_preference = ORDER.index(f['ext'])
 477                 except ValueError:
 478                     ext_preference = -1
 479                 audio_ext_preference = 0
 480
 481             return (
 482                 preference,
 483                 f.get('height') if f.get('height') is not None else -1,
 484                 f.get('width') if f.get('width') is not None else -1,
 485                 ext_preference,
 486                 f.get('vbr') if f.get('vbr') is not None else -1,
 487                 f.get('abr') if f.get('abr') is not None else -1,
 488                 audio_ext_preference,
 489                 f.get('filesize') if f.get('filesize') is not None else -1,
 490                 f.get('format_id'),
 491             )
 492         formats.sort(key=_formats_key)
 493
 494
 495 class SearchInfoExtractor(InfoExtractor):
 496     """
 497     Base class for paged search queries extractors.
 498     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 499     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 500     """
 501
 502     @classmethod
 503     def _make_valid_url(cls):
 504         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 505
 506     @classmethod
 507     def suitable(cls, url):
 508         return re.match(cls._make_valid_url(), url) is not None
 509
 510     def _real_extract(self, query):
 511         mobj = re.match(self._make_valid_url(), query)
 512         if mobj is None:
 513             raise ExtractorError(u'Invalid search query "%s"' % query)
 514
 515         prefix = mobj.group('prefix')
 516         query = mobj.group('query')
 517         if prefix == '':
 518             return self._get_n_results(query, 1)
 519         elif prefix == 'all':
 520             return self._get_n_results(query, self._MAX_RESULTS)
 521         else:
 522             n = int(prefix)
 523             if n <= 0:
 524                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 525             elif n > self._MAX_RESULTS:
 526                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 527                 n = self._MAX_RESULTS
 528             return self._get_n_results(query, n)
 529
 530     def _get_n_results(self, query, n):
 531         """Get a specified number of results for a query"""
 532         raise NotImplementedError("This method must be implemented by subclasses")
 533
 534     @property
 535     def SEARCH_KEY(self):
 536         return self._SEARCH_KEY