_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import json
   3 import os
   4 import re
   5 import socket
   6 import sys
   7 import netrc
   8 import xml.etree.ElementTree
   9
  10 from ..utils import (
  11     compat_http_client,
  12     compat_urllib_error,
  13     compat_urllib_parse_urlparse,
  14     compat_str,
  15
  16     clean_html,
  17     compiled_regex_type,
  18     ExtractorError,
  19     RegexNotFoundError,
  20     sanitize_filename,
  21     unescapeHTML,
  22 )
# Sentinel meaning "the caller supplied no default" in _search_regex() and
# _html_search_regex(); a dedicated object is used so that None can still be
# passed as a real default value.
_NO_DEFAULT = object()
  24
  25
  26 class InfoExtractor(object):
  27     """Information Extractor class.
  28
  29     Information extractors are the classes that, given a URL, extract
  30     information about the video (or videos) the URL refers to. This
  31     information includes the real video URL, the video title, author and
  32     others. The information is stored in a dictionary which is then
  33     passed to the FileDownloader. The FileDownloader processes this
  34     information possibly downloading the video to the file system, among
  35     other possible outcomes.
  36
  37     The dictionaries must include the following fields:
  38
  39     id:             Video identifier.
  40     title:          Video title, unescaped.
  41
  42     Additionally, it must contain either a formats entry or a url one:
  43
  44     formats:        A list of dictionaries for each format available, ordered
  45                     from worst to best quality.
  46
  47                     Potential fields:
  48                     * url        Mandatory. The URL of the video file
  49                     * ext        Will be calculated from url if missing
  50                     * format     A human-readable description of the format
  51                                  ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
  54                     * format_id  A short description of the format
  55                                  ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
  57                     * format_note Additional info about the format
  58                                  ("3D" or "DASH video")
  59                     * width      Width of the video, if known
  60                     * height     Height of the video, if known
  61                     * resolution Textual description of width and height
  62                     * tbr        Average bitrate of audio and video in KBit/s
  63                     * abr        Average audio bitrate in KBit/s
  64                     * acodec     Name of the audio codec in use
  65                     * vbr        Average video bitrate in KBit/s
  66                     * vcodec     Name of the video codec in use
  67                     * filesize   The number of bytes, if known in advance
  68                     * player_url SWF Player URL (used for rtmpdump).
  69                     * protocol   The protocol that will be used for the actual
  70                                  download, lower-case.
  71                                  "http", "https", "rtsp", "rtmp" or so.
  72                     * preference Order number of this format. If this field is
  73                                  present and not None, the formats get sorted
  74                                  by this field.
  75                                  -1 for default (order by other properties),
  76                                  -2 or smaller for less than default.
  77                     * quality    Order number of the video quality of this
  78                                  format, irrespective of the file format.
  79                                  -1 for default (order by other properties),
  80                                  -2 or smaller for less than default.
  81     url:            Final video URL.
  82     ext:            Video filename extension.
  83     format:         The video format, defaults to ext (used for --get-format)
  84     player_url:     SWF Player URL (used for rtmpdump).
  85
  86     The following fields are optional:
  87
  88     thumbnails:     A list of dictionaries (with the entries "resolution" and
  89                     "url") for the varying thumbnails
  90     thumbnail:      Full URL to a video thumbnail image.
  91     description:    One-line video description.
  92     uploader:       Full name of the video uploader.
  93     upload_date:    Video upload date (YYYYMMDD).
  94     uploader_id:    Nickname or id of the video uploader.
  95     location:       Physical location of the video.
  96     subtitles:      The subtitle file contents as a dictionary in the format
  97                     {language: subtitles}.
  98     duration:       Length of the video in seconds, as an integer.
  99     view_count:     How many users have watched the video on the platform.
 100     like_count:     Number of positive ratings of the video
 101     dislike_count:  Number of negative ratings of the video
 102     comment_count:  Number of comments on the video
 103     age_limit:      Age restriction for the video, as an integer (years)
 104     webpage_url:    The url to the video webpage, if given to youtube-dl it
 105                     should allow to get the same result again. (It will be set
 106                     by YoutubeDL if it's missing)
 107
 108     Unless mentioned otherwise, the fields should be Unicode strings.
 109
 110     Subclasses of this one should re-define the _real_initialize() and
 111     _real_extract() methods and define a _VALID_URL regexp.
 112     Probably, they should also be added to the list of extractors.
 113
 114     _real_extract() must return a *list* of information dictionaries as
 115     described above.
 116
 117     Finally, the _WORKING attribute should be set to False for broken IEs
 118     in order to warn the users and skip the tests.
 119     """
 120
    # Becomes True on an instance once initialize() has run _real_initialize().
    _ready = False
    # Downloader object used for network access and output; assigned through
    # set_downloader().
    _downloader = None
    # Value reported by working(); set to False in subclasses for broken IEs.
    _WORKING = True
 124
 125     def __init__(self, downloader=None):
 126         """Constructor. Receives an optional downloader."""
 127         self._ready = False
 128         self.set_downloader(downloader)
 129
 130     @classmethod
 131     def suitable(cls, url):
 132         """Receives a URL and returns True if suitable for this IE."""
 133
 134         # This does not use has/getattr intentionally - we want to know whether
 135         # we have cached the regexp for *this* class, whereas getattr would also
 136         # match the superclass
 137         if '_VALID_URL_RE' not in cls.__dict__:
 138             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 139         return cls._VALID_URL_RE.match(url) is not None
 140
 141     @classmethod
 142     def working(cls):
 143         """Getter method for _WORKING."""
 144         return cls._WORKING
 145
 146     def initialize(self):
 147         """Initializes an instance (authentication, etc)."""
 148         if not self._ready:
 149             self._real_initialize()
 150             self._ready = True
 151
 152     def extract(self, url):
 153         """Extracts URL information and returns it in list of dicts."""
 154         self.initialize()
 155         return self._real_extract(url)
 156
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        The object is stored as self._downloader; the other methods of this
        class call urlopen(), to_screen(), report_warning() on it and read
        its params mapping.
        """
        self._downloader = downloader
 160
 161     def _real_initialize(self):
 162         """Real initialization process. Redefine in subclasses."""
 163         pass
 164
 165     def _real_extract(self, url):
 166         """Real extraction process. Redefine in subclasses."""
 167         pass
 168
 169     @classmethod
 170     def ie_key(cls):
 171         """A string for getting the InfoExtractor with get_info_extractor"""
 172         return cls.__name__[:-2]
 173
 174     @property
 175     def IE_NAME(self):
 176         return type(self).__name__[:-2]
 177
 178     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 179         """ Returns the response handle """
 180         if note is None:
 181             self.report_download_webpage(video_id)
 182         elif note is not False:
 183             if video_id is None:
 184                 self.to_screen(u'%s' % (note,))
 185             else:
 186                 self.to_screen(u'%s: %s' % (video_id, note))
 187         try:
 188             return self._downloader.urlopen(url_or_request)
 189         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 190             if errnote is False:
 191                 return False
 192             if errnote is None:
 193                 errnote = u'Unable to download webpage'
 194             errmsg = u'%s: %s' % (errnote, compat_str(err))
 195             if fatal:
 196                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 197             else:
 198                 self._downloader.report_warning(errmsg)
 199                 return False
 200
 201     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 202         """ Returns a tuple (page content as string, URL handle) """
 203
 204         # Strip hashes from the URL (#1038)
 205         if isinstance(url_or_request, (compat_str, str)):
 206             url_or_request = url_or_request.partition('#')[0]
 207
 208         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 209         if urlh is False:
 210             assert not fatal
 211             return False
 212         content_type = urlh.headers.get('Content-Type', '')
 213         webpage_bytes = urlh.read()
 214         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 215         if m:
 216             encoding = m.group(1)
 217         else:
 218             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 219                           webpage_bytes[:1024])
 220             if m:
 221                 encoding = m.group(1).decode('ascii')
 222             else:
 223                 encoding = 'utf-8'
 224         if self._downloader.params.get('dump_intermediate_pages', False):
 225             try:
 226                 url = url_or_request.get_full_url()
 227             except AttributeError:
 228                 url = url_or_request
 229             self.to_screen(u'Dumping request to ' + url)
 230             dump = base64.b64encode(webpage_bytes).decode('ascii')
 231             self._downloader.to_screen(dump)
 232         if self._downloader.params.get('write_pages', False):
 233             try:
 234                 url = url_or_request.get_full_url()
 235             except AttributeError:
 236                 url = url_or_request
 237             raw_filename = ('%s_%s.dump' % (video_id, url))
 238             filename = sanitize_filename(raw_filename, restricted=True)
 239             self.to_screen(u'Saving request to ' + filename)
 240             with open(filename, 'wb') as outf:
 241                 outf.write(webpage_bytes)
 242
 243         content = webpage_bytes.decode(encoding, 'replace')
 244         return (content, urlh)
 245
 246     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 247         """ Returns the data of the page as a string """
 248         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 249         if res is False:
 250             return res
 251         else:
 252             content, _ = res
 253             return content
 254
 255     def _download_xml(self, url_or_request, video_id,
 256                       note=u'Downloading XML', errnote=u'Unable to download XML',
 257                       transform_source=None):
 258         """Return the xml as an xml.etree.ElementTree.Element"""
 259         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
 260         if transform_source:
 261             xml_string = transform_source(xml_string)
 262         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 263
 264     def _download_json(self, url_or_request, video_id,
 265                        note=u'Downloading JSON metadata',
 266                        errnote=u'Unable to download JSON metadata'):
 267         json_string = self._download_webpage(url_or_request, video_id, note, errnote)
 268         try:
 269             return json.loads(json_string)
 270         except ValueError as ve:
 271             raise ExtractorError('Failed to download JSON', cause=ve)
 272
 273     def report_warning(self, msg, video_id=None):
 274         idstr = u'' if video_id is None else u'%s: ' % video_id
 275         self._downloader.report_warning(
 276             u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
 277
 278     def to_screen(self, msg):
 279         """Print msg to screen, prefixing it with '[ie_name]'"""
 280         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 281
 282     def report_extraction(self, id_or_name):
 283         """Report information extraction."""
 284         self.to_screen(u'%s: Extracting information' % id_or_name)
 285
 286     def report_download_webpage(self, video_id):
 287         """Report webpage download."""
 288         self.to_screen(u'%s: Downloading webpage' % video_id)
 289
 290     def report_age_confirmation(self):
 291         """Report attempt to confirm age."""
 292         self.to_screen(u'Confirming age')
 293
 294     def report_login(self):
 295         """Report attempt to log in."""
 296         self.to_screen(u'Logging in')
 297
 298     #Methods for following #608
 299     @staticmethod
 300     def url_result(url, ie=None, video_id=None):
 301         """Returns a url that points to a page that should be processed"""
 302         #TODO: ie should be the class used for getting the info
 303         video_info = {'_type': 'url',
 304                       'url': url,
 305                       'ie_key': ie}
 306         if video_id is not None:
 307             video_info['id'] = video_id
 308         return video_info
 309     @staticmethod
 310     def playlist_result(entries, playlist_id=None, playlist_title=None):
 311         """Returns a playlist"""
 312         video_info = {'_type': 'playlist',
 313                       'entries': entries}
 314         if playlist_id:
 315             video_info['id'] = playlist_id
 316         if playlist_title:
 317             video_info['title'] = playlist_title
 318         return video_info
 319
 320     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 321         """
 322         Perform a regex search on the given string, using a single or a list of
 323         patterns returning the first matching group.
 324         In case of failure return a default value or raise a WARNING or a
 325         RegexNotFoundError, depending on fatal, specifying the field name.
 326         """
 327         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 328             mobj = re.search(pattern, string, flags)
 329         else:
 330             for p in pattern:
 331                 mobj = re.search(p, string, flags)
 332                 if mobj: break
 333
 334         if os.name != 'nt' and sys.stderr.isatty():
 335             _name = u'\033[0;34m%s\033[0m' % name
 336         else:
 337             _name = name
 338
 339         if mobj:
 340             # return the first matching group
 341             return next(g for g in mobj.groups() if g is not None)
 342         elif default is not _NO_DEFAULT:
 343             return default
 344         elif fatal:
 345             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 346         else:
 347             self._downloader.report_warning(u'unable to extract %s; '
 348                 u'please report this issue on http://yt-dl.org/bug' % _name)
 349             return None
 350
 351     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 352         """
 353         Like _search_regex, but strips HTML tags and unescapes entities.
 354         """
 355         res = self._search_regex(pattern, string, name, default, fatal, flags)
 356         if res:
 357             return clean_html(res).strip()
 358         else:
 359             return res
 360
 361     def _get_login_info(self):
 362         """
 363         Get the the login info as (username, password)
 364         It will look in the netrc file using the _NETRC_MACHINE value
 365         If there's no info available, return (None, None)
 366         """
 367         if self._downloader is None:
 368             return (None, None)
 369
 370         username = None
 371         password = None
 372         downloader_params = self._downloader.params
 373
 374         # Attempt to use provided username and password or .netrc data
 375         if downloader_params.get('username', None) is not None:
 376             username = downloader_params['username']
 377             password = downloader_params['password']
 378         elif downloader_params.get('usenetrc', False):
 379             try:
 380                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 381                 if info is not None:
 382                     username = info[0]
 383                     password = info[2]
 384                 else:
 385                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 386             except (IOError, netrc.NetrcParseError) as err:
 387                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 388
 389         return (username, password)
 390
 391     # Helper functions for extracting OpenGraph info
 392     @staticmethod
 393     def _og_regexes(prop):
 394         content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
 395         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 396         template = r'<meta[^>]+?%s[^>]+?%s'
 397         return [
 398             template % (property_re, content_re),
 399             template % (content_re, property_re),
 400         ]
 401
 402     def _og_search_property(self, prop, html, name=None, **kargs):
 403         if name is None:
 404             name = 'OpenGraph %s' % prop
 405         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 406         if escaped is None:
 407             return None
 408         return unescapeHTML(escaped)
 409
 410     def _og_search_thumbnail(self, html, **kargs):
 411         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 412
 413     def _og_search_description(self, html, **kargs):
 414         return self._og_search_property('description', html, fatal=False, **kargs)
 415
 416     def _og_search_title(self, html, **kargs):
 417         return self._og_search_property('title', html, **kargs)
 418
 419     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 420         regexes = self._og_regexes('video')
 421         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 422         return self._html_search_regex(regexes, html, name, **kargs)
 423
 424     def _html_search_meta(self, name, html, display_name=None):
 425         if display_name is None:
 426             display_name = name
 427         return self._html_search_regex(
 428             r'''(?ix)<meta
 429                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 430                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 431             html, display_name, fatal=False)
 432
 433     def _dc_search_uploader(self, html):
 434         return self._html_search_meta('dc.creator', html, 'uploader')
 435
 436     def _rta_search(self, html):
 437         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 438         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 439                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 440                      html):
 441             return 18
 442         return 0
 443
 444     def _media_rating_search(self, html):
 445         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 446         rating = self._html_search_meta('rating', html)
 447
 448         if not rating:
 449             return None
 450
 451         RATING_TABLE = {
 452             'safe for kids': 0,
 453             'general': 8,
 454             '14 years': 14,
 455             'mature': 17,
 456             'restricted': 19,
 457         }
 458         return RATING_TABLE.get(rating.lower(), None)
 459
 460     def _sort_formats(self, formats):
 461         def _formats_key(f):
 462             # TODO remove the following workaround
 463             from ..utils import determine_ext
 464             if not f.get('ext') and 'url' in f:
 465                 f['ext'] = determine_ext(f['url'])
 466
 467             preference = f.get('preference')
 468             if preference is None:
 469                 proto = f.get('protocol')
 470                 if proto is None:
 471                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 472
 473                 preference = 0 if proto in ['http', 'https'] else -0.1
 474                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 475                     preference -= 0.5
 476
 477             if f.get('vcodec') == 'none':  # audio only
 478                 if self._downloader.params.get('prefer_free_formats'):
 479                     ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
 480                 else:
 481                     ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
 482                 ext_preference = 0
 483                 try:
 484                     audio_ext_preference = ORDER.index(f['ext'])
 485                 except ValueError:
 486                     audio_ext_preference = -1
 487             else:
 488                 if self._downloader.params.get('prefer_free_formats'):
 489                     ORDER = [u'flv', u'mp4', u'webm']
 490                 else:
 491                     ORDER = [u'webm', u'flv', u'mp4']
 492                 try:
 493                     ext_preference = ORDER.index(f['ext'])
 494                 except ValueError:
 495                     ext_preference = -1
 496                 audio_ext_preference = 0
 497
 498             return (
 499                 preference,
 500                 f.get('quality') if f.get('quality') is not None else -1,
 501                 f.get('height') if f.get('height') is not None else -1,
 502                 f.get('width') if f.get('width') is not None else -1,
 503                 ext_preference,
 504                 f.get('tbr') if f.get('tbr') is not None else -1,
 505                 f.get('vbr') if f.get('vbr') is not None else -1,
 506                 f.get('abr') if f.get('abr') is not None else -1,
 507                 audio_ext_preference,
 508                 f.get('filesize') if f.get('filesize') is not None else -1,
 509                 f.get('format_id'),
 510             )
 511         formats.sort(key=_formats_key)
 512
 513
 514 class SearchInfoExtractor(InfoExtractor):
 515     """
 516     Base class for paged search queries extractors.
 517     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 518     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 519     """
 520
 521     @classmethod
 522     def _make_valid_url(cls):
 523         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 524
 525     @classmethod
 526     def suitable(cls, url):
 527         return re.match(cls._make_valid_url(), url) is not None
 528
 529     def _real_extract(self, query):
 530         mobj = re.match(self._make_valid_url(), query)
 531         if mobj is None:
 532             raise ExtractorError(u'Invalid search query "%s"' % query)
 533
 534         prefix = mobj.group('prefix')
 535         query = mobj.group('query')
 536         if prefix == '':
 537             return self._get_n_results(query, 1)
 538         elif prefix == 'all':
 539             return self._get_n_results(query, self._MAX_RESULTS)
 540         else:
 541             n = int(prefix)
 542             if n <= 0:
 543                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 544             elif n > self._MAX_RESULTS:
 545                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 546                 n = self._MAX_RESULTS
 547             return self._get_n_results(query, n)
 548
 549     def _get_n_results(self, query, n):
 550         """Get a specified number of results for a query"""
 551         raise NotImplementedError("This method must be implemented by subclasses")
 552
 553     @property
 554     def SEARCH_KEY(self):
 555         return self._SEARCH_KEY