_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import hashlib
   3 import json
   4 import os
   5 import re
   6 import socket
   7 import sys
   8 import netrc
   9 import xml.etree.ElementTree
  10
  11 from ..utils import (
  12     compat_http_client,
  13     compat_urllib_error,
  14     compat_urllib_parse_urlparse,
  15     compat_str,
  16
  17     clean_html,
  18     compiled_regex_type,
  19     ExtractorError,
  20     RegexNotFoundError,
  21     sanitize_filename,
  22     unescapeHTML,
  23 )
  24 _NO_DEFAULT = object()
  25
  26
  27 class InfoExtractor(object):
  28     """Information Extractor class.
  29
  30     Information extractors are the classes that, given a URL, extract
  31     information about the video (or videos) the URL refers to. This
  32     information includes the real video URL, the video title, author and
  33     others. The information is stored in a dictionary which is then
  34     passed to the FileDownloader. The FileDownloader processes this
  35     information possibly downloading the video to the file system, among
  36     other possible outcomes.
  37
  38     The dictionaries must include the following fields:
  39
  40     id:             Video identifier.
  41     title:          Video title, unescaped.
  42
  43     Additionally, it must contain either a formats entry or a url one:
  44
  45     formats:        A list of dictionaries for each format available, ordered
  46                     from worst to best quality.
  47
  48                     Potential fields:
  49                     * url        Mandatory. The URL of the video file
  50                     * ext        Will be calculated from url if missing
  51                     * format     A human-readable description of the format
  52                                  ("mp4 container with h264/opus").
  53                                  Calculated from the format_id, width, height,
  54                                  and format_note fields if missing.
  55                     * format_id  A short description of the format
  56                                  ("mp4_h264_opus" or "19").
  57                                  Technically optional, but strongly recommended.
  58                     * format_note Additional info about the format
  59                                  ("3D" or "DASH video")
  60                     * width      Width of the video, if known
  61                     * height     Height of the video, if known
  62                     * resolution Textual description of width and height
  63                     * tbr        Average bitrate of audio and video in KBit/s
  64                     * abr        Average audio bitrate in KBit/s
  65                     * acodec     Name of the audio codec in use
  66                     * asr        Audio sampling rate in Hertz
  67                     * vbr        Average video bitrate in KBit/s
  68                     * vcodec     Name of the video codec in use
  69                     * container  Name of the container format
  70                     * filesize   The number of bytes, if known in advance
  71                     * player_url SWF Player URL (used for rtmpdump).
  72                     * protocol   The protocol that will be used for the actual
  73                                  download, lower-case.
  74                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  75                     * preference Order number of this format. If this field is
  76                                  present and not None, the formats get sorted
  77                                  by this field, regardless of all other values.
  78                                  -1 for default (order by other properties),
  79                                  -2 or smaller for less than default.
  80                     * quality    Order number of the video quality of this
  81                                  format, irrespective of the file format.
  82                                  -1 for default (order by other properties),
  83                                  -2 or smaller for less than default.
  84     url:            Final video URL.
  85     ext:            Video filename extension.
  86     format:         The video format, defaults to ext (used for --get-format)
  87     player_url:     SWF Player URL (used for rtmpdump).
  88
  89     The following fields are optional:
  90
  91     display_id:     An alternative identifier for the video, not necessarily
  92                     unique, but available before title. Typically, id is
  93                     something like "4234987", title "Dancing naked mole rats",
  94                     and display_id "dancing-naked-mole-rats"
  95     thumbnails:     A list of dictionaries (with the entries "resolution" and
  96                     "url") for the varying thumbnails
  97     thumbnail:      Full URL to a video thumbnail image.
  98     description:    One-line video description.
  99     uploader:       Full name of the video uploader.
 100     timestamp:      UNIX timestamp of the moment the video became available.
 101     upload_date:    Video upload date (YYYYMMDD).
 102                     If not explicitly set, calculated from timestamp.
 103     uploader_id:    Nickname or id of the video uploader.
 104     location:       Physical location of the video.
 105     subtitles:      The subtitle file contents as a dictionary in the format
 106                     {language: subtitles}.
 107     duration:       Length of the video in seconds, as an integer.
 108     view_count:     How many users have watched the video on the platform.
 109     like_count:     Number of positive ratings of the video
 110     dislike_count:  Number of negative ratings of the video
 111     comment_count:  Number of comments on the video
 112     age_limit:      Age restriction for the video, as an integer (years)
 113     webpage_url:    The URL of the video webpage; passing it to youtube-dl
 114                     should produce the same result again. (It will be set
 115                     by YoutubeDL if it's missing.)
 116     categories:     A list of categories that the video falls in, for example
 117                     ["Sports", "Berlin"]
 118
 119     Unless mentioned otherwise, the fields should be Unicode strings.
 120
 121     Subclasses of this one should re-define the _real_initialize() and
 122     _real_extract() methods and define a _VALID_URL regexp.
 123     Probably, they should also be added to the list of extractors.
 124
 125     Finally, the _WORKING attribute should be set to False for broken IEs
 126     in order to warn the users and skip the tests.
 127     """
 128
 129     _ready = False
 130     _downloader = None
 131     _WORKING = True
 132
 133     def __init__(self, downloader=None):
 134         """Constructor. Receives an optional downloader."""
 135         self._ready = False
 136         self.set_downloader(downloader)
 137
 138     @classmethod
 139     def suitable(cls, url):
 140         """Receives a URL and returns True if suitable for this IE."""
 141
 142         # This does not use has/getattr intentionally - we want to know whether
 143         # we have cached the regexp for *this* class, whereas getattr would also
 144         # match the superclass
 145         if '_VALID_URL_RE' not in cls.__dict__:
 146             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 147         return cls._VALID_URL_RE.match(url) is not None
 148
 149     @classmethod
 150     def working(cls):
 151         """Getter method for _WORKING."""
 152         return cls._WORKING
 153
 154     def initialize(self):
 155         """Initializes an instance (authentication, etc)."""
 156         if not self._ready:
 157             self._real_initialize()
 158             self._ready = True
 159
 160     def extract(self, url):
 161         """Extracts URL information and returns it in list of dicts."""
 162         self.initialize()
 163         return self._real_extract(url)
 164
 165     def set_downloader(self, downloader):
 166         """Sets the downloader for this IE."""
 167         self._downloader = downloader
 168
 169     def _real_initialize(self):
 170         """Real initialization process. Redefine in subclasses."""
 171         pass
 172
 173     def _real_extract(self, url):
 174         """Real extraction process. Redefine in subclasses."""
 175         pass
 176
 177     @classmethod
 178     def ie_key(cls):
 179         """A string for getting the InfoExtractor with get_info_extractor"""
 180         return cls.__name__[:-2]
 181
 182     @property
 183     def IE_NAME(self):
 184         return type(self).__name__[:-2]
 185
 186     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 187         """ Returns the response handle """
 188         if note is None:
 189             self.report_download_webpage(video_id)
 190         elif note is not False:
 191             if video_id is None:
 192                 self.to_screen(u'%s' % (note,))
 193             else:
 194                 self.to_screen(u'%s: %s' % (video_id, note))
 195         try:
 196             return self._downloader.urlopen(url_or_request)
 197         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 198             if errnote is False:
 199                 return False
 200             if errnote is None:
 201                 errnote = u'Unable to download webpage'
 202             errmsg = u'%s: %s' % (errnote, compat_str(err))
 203             if fatal:
 204                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 205             else:
 206                 self._downloader.report_warning(errmsg)
 207                 return False
 208
 209     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 210         """ Returns a tuple (page content as string, URL handle) """
 211
 212         # Strip hashes from the URL (#1038)
 213         if isinstance(url_or_request, (compat_str, str)):
 214             url_or_request = url_or_request.partition('#')[0]
 215
 216         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 217         if urlh is False:
 218             assert not fatal
 219             return False
 220         content_type = urlh.headers.get('Content-Type', '')
 221         webpage_bytes = urlh.read()
 222         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 223         if m:
 224             encoding = m.group(1)
 225         else:
 226             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 227                           webpage_bytes[:1024])
 228             if m:
 229                 encoding = m.group(1).decode('ascii')
 230             elif webpage_bytes.startswith(b'\xff\xfe'):
 231                 encoding = 'utf-16'
 232             else:
 233                 encoding = 'utf-8'
 234         if self._downloader.params.get('dump_intermediate_pages', False):
 235             try:
 236                 url = url_or_request.get_full_url()
 237             except AttributeError:
 238                 url = url_or_request
 239             self.to_screen(u'Dumping request to ' + url)
 240             dump = base64.b64encode(webpage_bytes).decode('ascii')
 241             self._downloader.to_screen(dump)
 242         if self._downloader.params.get('write_pages', False):
 243             try:
 244                 url = url_or_request.get_full_url()
 245             except AttributeError:
 246                 url = url_or_request
 247             basen = '%s_%s' % (video_id, url)
 248             if len(basen) > 240:
 249                 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 250                 basen = basen[:240 - len(h)] + h
 251             raw_filename = basen + '.dump'
 252             filename = sanitize_filename(raw_filename, restricted=True)
 253             self.to_screen(u'Saving request to ' + filename)
 254             with open(filename, 'wb') as outf:
 255                 outf.write(webpage_bytes)
 256
 257         try:
 258             content = webpage_bytes.decode(encoding, 'replace')
 259         except LookupError:
 260             content = webpage_bytes.decode('utf-8', 'replace')
 261
 262         if (u'<title>Access to this site is blocked</title>' in content and
 263                 u'Websense' in content[:512]):
 264             msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
 265             blocked_iframe = self._html_search_regex(
 266                 r'<iframe src="([^"]+)"', content,
 267                 u'Websense information URL', default=None)
 268             if blocked_iframe:
 269                 msg += u' Visit %s for more details' % blocked_iframe
 270             raise ExtractorError(msg, expected=True)
 271
 272         return (content, urlh)
 273
 274     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 275         """ Returns the data of the page as a string """
 276         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 277         if res is False:
 278             return res
 279         else:
 280             content, _ = res
 281             return content
 282
 283     def _download_xml(self, url_or_request, video_id,
 284                       note=u'Downloading XML', errnote=u'Unable to download XML',
 285                       transform_source=None, fatal=True):
 286         """Return the xml as an xml.etree.ElementTree.Element"""
 287         xml_string = self._download_webpage(
 288             url_or_request, video_id, note, errnote, fatal=fatal)
 289         if xml_string is False:
 290             return xml_string
 291         if transform_source:
 292             xml_string = transform_source(xml_string)
 293         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 294
 295     def _download_json(self, url_or_request, video_id,
 296                        note=u'Downloading JSON metadata',
 297                        errnote=u'Unable to download JSON metadata',
 298                        transform_source=None):
 299         json_string = self._download_webpage(url_or_request, video_id, note, errnote)
 300         if transform_source:
 301             json_string = transform_source(json_string)
 302         try:
 303             return json.loads(json_string)
 304         except ValueError as ve:
 305             raise ExtractorError('Failed to download JSON', cause=ve)
 306
 307     def report_warning(self, msg, video_id=None):
 308         idstr = u'' if video_id is None else u'%s: ' % video_id
 309         self._downloader.report_warning(
 310             u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
 311
 312     def to_screen(self, msg):
 313         """Print msg to screen, prefixing it with '[ie_name]'"""
 314         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 315
 316     def report_extraction(self, id_or_name):
 317         """Report information extraction."""
 318         self.to_screen(u'%s: Extracting information' % id_or_name)
 319
 320     def report_download_webpage(self, video_id):
 321         """Report webpage download."""
 322         self.to_screen(u'%s: Downloading webpage' % video_id)
 323
 324     def report_age_confirmation(self):
 325         """Report attempt to confirm age."""
 326         self.to_screen(u'Confirming age')
 327
 328     def report_login(self):
 329         """Report attempt to log in."""
 330         self.to_screen(u'Logging in')
 331
 332     #Methods for following #608
 333     @staticmethod
 334     def url_result(url, ie=None, video_id=None):
 335         """Returns a url that points to a page that should be processed"""
 336         #TODO: ie should be the class used for getting the info
 337         video_info = {'_type': 'url',
 338                       'url': url,
 339                       'ie_key': ie}
 340         if video_id is not None:
 341             video_info['id'] = video_id
 342         return video_info
 343     @staticmethod
 344     def playlist_result(entries, playlist_id=None, playlist_title=None):
 345         """Returns a playlist"""
 346         # Ensure we don't have any duplicates in the playlist
 347         seen = set()
 348         new_list = []
 349         for url in entries:
 350             theurl = tuple(url.items())
 351             if theurl not in seen:
 352              seen.add(theurl)
 353              new_list.append(url)
 354              entries = new_list
 355
 356         video_info = {'_type': 'playlist',
 357                       'entries': entries}
 358         if playlist_id:
 359             video_info['id'] = playlist_id
 360         if playlist_title:
 361             video_info['title'] = playlist_title
 362         return video_info
 363
 364     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 365         """
 366         Perform a regex search on the given string, using a single or a list of
 367         patterns returning the first matching group.
 368         In case of failure return a default value or raise a WARNING or a
 369         RegexNotFoundError, depending on fatal, specifying the field name.
 370         """
 371         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 372             mobj = re.search(pattern, string, flags)
 373         else:
 374             for p in pattern:
 375                 mobj = re.search(p, string, flags)
 376                 if mobj: break
 377
 378         if os.name != 'nt' and sys.stderr.isatty():
 379             _name = u'\033[0;34m%s\033[0m' % name
 380         else:
 381             _name = name
 382
 383         if mobj:
 384             # return the first matching group
 385             return next(g for g in mobj.groups() if g is not None)
 386         elif default is not _NO_DEFAULT:
 387             return default
 388         elif fatal:
 389             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 390         else:
 391             self._downloader.report_warning(u'unable to extract %s; '
 392                 u'please report this issue on http://yt-dl.org/bug' % _name)
 393             return None
 394
 395     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 396         """
 397         Like _search_regex, but strips HTML tags and unescapes entities.
 398         """
 399         res = self._search_regex(pattern, string, name, default, fatal, flags)
 400         if res:
 401             return clean_html(res).strip()
 402         else:
 403             return res
 404
 405     def _get_login_info(self):
 406         """
 407         Get the the login info as (username, password)
 408         It will look in the netrc file using the _NETRC_MACHINE value
 409         If there's no info available, return (None, None)
 410         """
 411         if self._downloader is None:
 412             return (None, None)
 413
 414         username = None
 415         password = None
 416         downloader_params = self._downloader.params
 417
 418         # Attempt to use provided username and password or .netrc data
 419         if downloader_params.get('username', None) is not None:
 420             username = downloader_params['username']
 421             password = downloader_params['password']
 422         elif downloader_params.get('usenetrc', False):
 423             try:
 424                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 425                 if info is not None:
 426                     username = info[0]
 427                     password = info[2]
 428                 else:
 429                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 430             except (IOError, netrc.NetrcParseError) as err:
 431                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 432
 433         return (username, password)
 434
 435     # Helper functions for extracting OpenGraph info
 436     @staticmethod
 437     def _og_regexes(prop):
 438         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 439         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 440         template = r'<meta[^>]+?%s[^>]+?%s'
 441         return [
 442             template % (property_re, content_re),
 443             template % (content_re, property_re),
 444         ]
 445
 446     def _og_search_property(self, prop, html, name=None, **kargs):
 447         if name is None:
 448             name = 'OpenGraph %s' % prop
 449         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 450         if escaped is None:
 451             return None
 452         return unescapeHTML(escaped)
 453
 454     def _og_search_thumbnail(self, html, **kargs):
 455         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 456
 457     def _og_search_description(self, html, **kargs):
 458         return self._og_search_property('description', html, fatal=False, **kargs)
 459
 460     def _og_search_title(self, html, **kargs):
 461         return self._og_search_property('title', html, **kargs)
 462
 463     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 464         regexes = self._og_regexes('video')
 465         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 466         return self._html_search_regex(regexes, html, name, **kargs)
 467
 468     def _html_search_meta(self, name, html, display_name=None, fatal=False):
 469         if display_name is None:
 470             display_name = name
 471         return self._html_search_regex(
 472             r'''(?ix)<meta
 473                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 474                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 475             html, display_name, fatal=fatal)
 476
 477     def _dc_search_uploader(self, html):
 478         return self._html_search_meta('dc.creator', html, 'uploader')
 479
 480     def _rta_search(self, html):
 481         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 482         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 483                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 484                      html):
 485             return 18
 486         return 0
 487
 488     def _media_rating_search(self, html):
 489         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 490         rating = self._html_search_meta('rating', html)
 491
 492         if not rating:
 493             return None
 494
 495         RATING_TABLE = {
 496             'safe for kids': 0,
 497             'general': 8,
 498             '14 years': 14,
 499             'mature': 17,
 500             'restricted': 19,
 501         }
 502         return RATING_TABLE.get(rating.lower(), None)
 503
 504     def _twitter_search_player(self, html):
 505         return self._html_search_meta('twitter:player', html,
 506             'twitter card player')
 507
 508     def _sort_formats(self, formats):
 509         if not formats:
 510             raise ExtractorError(u'No video formats found')
 511
 512         def _formats_key(f):
 513             # TODO remove the following workaround
 514             from ..utils import determine_ext
 515             if not f.get('ext') and 'url' in f:
 516                 f['ext'] = determine_ext(f['url'])
 517
 518             preference = f.get('preference')
 519             if preference is None:
 520                 proto = f.get('protocol')
 521                 if proto is None:
 522                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 523
 524                 preference = 0 if proto in ['http', 'https'] else -0.1
 525                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 526                     preference -= 0.5
 527
 528             if f.get('vcodec') == 'none':  # audio only
 529                 if self._downloader.params.get('prefer_free_formats'):
 530                     ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
 531                 else:
 532                     ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
 533                 ext_preference = 0
 534                 try:
 535                     audio_ext_preference = ORDER.index(f['ext'])
 536                 except ValueError:
 537                     audio_ext_preference = -1
 538             else:
 539                 if self._downloader.params.get('prefer_free_formats'):
 540                     ORDER = [u'flv', u'mp4', u'webm']
 541                 else:
 542                     ORDER = [u'webm', u'flv', u'mp4']
 543                 try:
 544                     ext_preference = ORDER.index(f['ext'])
 545                 except ValueError:
 546                     ext_preference = -1
 547                 audio_ext_preference = 0
 548
 549             return (
 550                 preference,
 551                 f.get('quality') if f.get('quality') is not None else -1,
 552                 f.get('height') if f.get('height') is not None else -1,
 553                 f.get('width') if f.get('width') is not None else -1,
 554                 ext_preference,
 555                 f.get('tbr') if f.get('tbr') is not None else -1,
 556                 f.get('vbr') if f.get('vbr') is not None else -1,
 557                 f.get('abr') if f.get('abr') is not None else -1,
 558                 audio_ext_preference,
 559                 f.get('filesize') if f.get('filesize') is not None else -1,
 560                 f.get('format_id'),
 561             )
 562         formats.sort(key=_formats_key)
 563
 564     def http_scheme(self):
 565         """ Either "https:" or "https:", depending on the user's preferences """
 566         return (
 567             'http:'
 568             if self._downloader.params.get('prefer_insecure', False)
 569             else 'https:')
 570
 571     def _proto_relative_url(self, url, scheme=None):
 572         if url is None:
 573             return url
 574         if url.startswith('//'):
 575             if scheme is None:
 576                 scheme = self.http_scheme()
 577             return scheme + url
 578         else:
 579             return url
 580
 581
 582 class SearchInfoExtractor(InfoExtractor):
 583     """
 584     Base class for paged search queries extractors.
 585     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 586     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 587     """
 588
 589     @classmethod
 590     def _make_valid_url(cls):
 591         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 592
 593     @classmethod
 594     def suitable(cls, url):
 595         return re.match(cls._make_valid_url(), url) is not None
 596
 597     def _real_extract(self, query):
 598         mobj = re.match(self._make_valid_url(), query)
 599         if mobj is None:
 600             raise ExtractorError(u'Invalid search query "%s"' % query)
 601
 602         prefix = mobj.group('prefix')
 603         query = mobj.group('query')
 604         if prefix == '':
 605             return self._get_n_results(query, 1)
 606         elif prefix == 'all':
 607             return self._get_n_results(query, self._MAX_RESULTS)
 608         else:
 609             n = int(prefix)
 610             if n <= 0:
 611                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 612             elif n > self._MAX_RESULTS:
 613                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 614                 n = self._MAX_RESULTS
 615             return self._get_n_results(query, n)
 616
 617     def _get_n_results(self, query, n):
 618         """Get a specified number of results for a query"""
 619         raise NotImplementedError("This method must be implemented by subclasses")
 620
 621     @property
 622     def SEARCH_KEY(self):
 623         return self._SEARCH_KEY
 624