_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import hashlib
   3 import json
   4 import os
   5 import re
   6 import socket
   7 import sys
   8 import netrc
   9 import xml.etree.ElementTree
  10
  11 from ..utils import (
  12     compat_http_client,
  13     compat_urllib_error,
  14     compat_urllib_parse_urlparse,
  15     compat_str,
  16
  17     clean_html,
  18     compiled_regex_type,
  19     ExtractorError,
  20     RegexNotFoundError,
  21     sanitize_filename,
  22     unescapeHTML,
  23 )
# Sentinel meaning "no default value was supplied". It is compared by
# identity, which leaves None usable as an explicit default value.
_NO_DEFAULT = object()
  25
  26
  27 class InfoExtractor(object):
  28     """Information Extractor class.
  29
  30     Information extractors are the classes that, given a URL, extract
  31     information about the video (or videos) the URL refers to. This
  32     information includes the real video URL, the video title, author and
  33     others. The information is stored in a dictionary which is then
  34     passed to the FileDownloader. The FileDownloader processes this
  35     information possibly downloading the video to the file system, among
  36     other possible outcomes.
  37
  38     The dictionaries must include the following fields:
  39
  40     id:             Video identifier.
  41     title:          Video title, unescaped.
  42
  43     Additionally, it must contain either a formats entry or a url one:
  44
  45     formats:        A list of dictionaries for each format available, ordered
  46                     from worst to best quality.
  47
  48                     Potential fields:
  49                     * url        Mandatory. The URL of the video file
  50                     * ext        Will be calculated from url if missing
  51                     * format     A human-readable description of the format
  52                                  ("mp4 container with h264/opus").
  53                                  Calculated from the format_id, width, height.
  54                                  and format_note fields if missing.
  55                     * format_id  A short description of the format
  56                                  ("mp4_h264_opus" or "19").
  57                                 Technically optional, but strongly recommended.
  58                     * format_note Additional info about the format
  59                                  ("3D" or "DASH video")
  60                     * width      Width of the video, if known
  61                     * height     Height of the video, if known
  62                     * resolution Textual description of width and height
  63                     * tbr        Average bitrate of audio and video in KBit/s
  64                     * abr        Average audio bitrate in KBit/s
  65                     * acodec     Name of the audio codec in use
  66                     * asr        Audio sampling rate in Hertz
  67                     * vbr        Average video bitrate in KBit/s
  68                     * vcodec     Name of the video codec in use
  69                     * container  Name of the container format
  70                     * filesize   The number of bytes, if known in advance
  71                     * player_url SWF Player URL (used for rtmpdump).
  72                     * protocol   The protocol that will be used for the actual
  73                                  download, lower-case.
  74                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  75                     * preference Order number of this format. If this field is
  76                                  present and not None, the formats get sorted
  77                                  by this field, regardless of all other values.
  78                                  -1 for default (order by other properties),
  79                                  -2 or smaller for less than default.
  80                     * quality    Order number of the video quality of this
  81                                  format, irrespective of the file format.
  82                                  -1 for default (order by other properties),
  83                                  -2 or smaller for less than default.
  84     url:            Final video URL.
  85     ext:            Video filename extension.
  86     format:         The video format, defaults to ext (used for --get-format)
  87     player_url:     SWF Player URL (used for rtmpdump).
  88
  89     The following fields are optional:
  90
  91     display_id      An alternative identifier for the video, not necessarily
  92                     unique, but available before title. Typically, id is
  93                     something like "4234987", title "Dancing naked mole rats",
  94                     and display_id "dancing-naked-mole-rats"
  95     thumbnails:     A list of dictionaries (with the entries "resolution" and
  96                     "url") for the varying thumbnails
  97     thumbnail:      Full URL to a video thumbnail image.
  98     description:    One-line video description.
  99     uploader:       Full name of the video uploader.
 100     timestamp:      UNIX timestamp of the moment the video became available.
 101     upload_date:    Video upload date (YYYYMMDD).
 102                     If not explicitly set, calculated from timestamp.
 103     uploader_id:    Nickname or id of the video uploader.
 104     location:       Physical location of the video.
 105     subtitles:      The subtitle file contents as a dictionary in the format
 106                     {language: subtitles}.
 107     duration:       Length of the video in seconds, as an integer.
 108     view_count:     How many users have watched the video on the platform.
 109     like_count:     Number of positive ratings of the video
 110     dislike_count:  Number of negative ratings of the video
 111     comment_count:  Number of comments on the video
 112     age_limit:      Age restriction for the video, as an integer (years)
 113     webpage_url:    The url to the video webpage, if given to youtube-dl it
 114                     should allow to get the same result again. (It will be set
 115                     by YoutubeDL if it's missing)
 116     categories:     A list of categories that the video falls in, for example
 117                     ["Sports", "Berlin"]
 118
 119     Unless mentioned otherwise, the fields should be Unicode strings.
 120
 121     Subclasses of this one should re-define the _real_initialize() and
 122     _real_extract() methods and define a _VALID_URL regexp.
 123     Probably, they should also be added to the list of extractors.
 124
 125     Finally, the _WORKING attribute should be set to False for broken IEs
 126     in order to warn the users and skip the tests.
 127     """
 128
 129     _ready = False
 130     _downloader = None
 131     _WORKING = True
 132
 133     def __init__(self, downloader=None):
 134         """Constructor. Receives an optional downloader."""
 135         self._ready = False
 136         self.set_downloader(downloader)
 137
 138     @classmethod
 139     def suitable(cls, url):
 140         """Receives a URL and returns True if suitable for this IE."""
 141
 142         # This does not use has/getattr intentionally - we want to know whether
 143         # we have cached the regexp for *this* class, whereas getattr would also
 144         # match the superclass
 145         if '_VALID_URL_RE' not in cls.__dict__:
 146             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 147         return cls._VALID_URL_RE.match(url) is not None
 148
 149     @classmethod
 150     def working(cls):
 151         """Getter method for _WORKING."""
 152         return cls._WORKING
 153
 154     def initialize(self):
 155         """Initializes an instance (authentication, etc)."""
 156         if not self._ready:
 157             self._real_initialize()
 158             self._ready = True
 159
 160     def extract(self, url):
 161         """Extracts URL information and returns it in list of dicts."""
 162         self.initialize()
 163         return self._real_extract(url)
 164
 165     def set_downloader(self, downloader):
 166         """Sets the downloader for this IE."""
 167         self._downloader = downloader
 168
 169     def _real_initialize(self):
 170         """Real initialization process. Redefine in subclasses."""
 171         pass
 172
 173     def _real_extract(self, url):
 174         """Real extraction process. Redefine in subclasses."""
 175         pass
 176
 177     @classmethod
 178     def ie_key(cls):
 179         """A string for getting the InfoExtractor with get_info_extractor"""
 180         return cls.__name__[:-2]
 181
 182     @property
 183     def IE_NAME(self):
 184         return type(self).__name__[:-2]
 185
 186     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 187         """ Returns the response handle """
 188         if note is None:
 189             self.report_download_webpage(video_id)
 190         elif note is not False:
 191             if video_id is None:
 192                 self.to_screen(u'%s' % (note,))
 193             else:
 194                 self.to_screen(u'%s: %s' % (video_id, note))
 195         try:
 196             return self._downloader.urlopen(url_or_request)
 197         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 198             if errnote is False:
 199                 return False
 200             if errnote is None:
 201                 errnote = u'Unable to download webpage'
 202             errmsg = u'%s: %s' % (errnote, compat_str(err))
 203             if fatal:
 204                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 205             else:
 206                 self._downloader.report_warning(errmsg)
 207                 return False
 208
 209     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 210         """ Returns a tuple (page content as string, URL handle) """
 211
 212         # Strip hashes from the URL (#1038)
 213         if isinstance(url_or_request, (compat_str, str)):
 214             url_or_request = url_or_request.partition('#')[0]
 215
 216         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 217         if urlh is False:
 218             assert not fatal
 219             return False
 220         content_type = urlh.headers.get('Content-Type', '')
 221         webpage_bytes = urlh.read()
 222         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 223         if m:
 224             encoding = m.group(1)
 225         else:
 226             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 227                           webpage_bytes[:1024])
 228             if m:
 229                 encoding = m.group(1).decode('ascii')
 230             elif webpage_bytes.startswith(b'\xff\xfe'):
 231                 encoding = 'utf-16'
 232             else:
 233                 encoding = 'utf-8'
 234         if self._downloader.params.get('dump_intermediate_pages', False):
 235             try:
 236                 url = url_or_request.get_full_url()
 237             except AttributeError:
 238                 url = url_or_request
 239             self.to_screen(u'Dumping request to ' + url)
 240             dump = base64.b64encode(webpage_bytes).decode('ascii')
 241             self._downloader.to_screen(dump)
 242         if self._downloader.params.get('write_pages', False):
 243             try:
 244                 url = url_or_request.get_full_url()
 245             except AttributeError:
 246                 url = url_or_request
 247             basen = '%s_%s' % (video_id, url)
 248             if len(basen) > 240:
 249                 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 250                 basen = basen[:240 - len(h)] + h
 251             raw_filename = basen + '.dump'
 252             filename = sanitize_filename(raw_filename, restricted=True)
 253             self.to_screen(u'Saving request to ' + filename)
 254             with open(filename, 'wb') as outf:
 255                 outf.write(webpage_bytes)
 256
 257         try:
 258             content = webpage_bytes.decode(encoding, 'replace')
 259         except LookupError:
 260             content = webpage_bytes.decode('utf-8', 'replace')
 261
 262         if (u'<title>Access to this site is blocked</title>' in content and
 263                 u'Websense' in content[:512]):
 264             msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
 265             blocked_iframe = self._html_search_regex(
 266                 r'<iframe src="([^"]+)"', content,
 267                 u'Websense information URL', default=None)
 268             if blocked_iframe:
 269                 msg += u' Visit %s for more details' % blocked_iframe
 270             raise ExtractorError(msg, expected=True)
 271
 272         return (content, urlh)
 273
 274     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 275         """ Returns the data of the page as a string """
 276         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 277         if res is False:
 278             return res
 279         else:
 280             content, _ = res
 281             return content
 282
 283     def _download_xml(self, url_or_request, video_id,
 284                       note=u'Downloading XML', errnote=u'Unable to download XML',
 285                       transform_source=None, fatal=True):
 286         """Return the xml as an xml.etree.ElementTree.Element"""
 287         xml_string = self._download_webpage(
 288             url_or_request, video_id, note, errnote, fatal=fatal)
 289         if xml_string is False:
 290             return xml_string
 291         if transform_source:
 292             xml_string = transform_source(xml_string)
 293         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 294
 295     def _download_json(self, url_or_request, video_id,
 296                        note=u'Downloading JSON metadata',
 297                        errnote=u'Unable to download JSON metadata',
 298                        transform_source=None):
 299         json_string = self._download_webpage(url_or_request, video_id, note, errnote)
 300         if transform_source:
 301             json_string = transform_source(json_string)
 302         try:
 303             return json.loads(json_string)
 304         except ValueError as ve:
 305             raise ExtractorError('Failed to download JSON', cause=ve)
 306
 307     def report_warning(self, msg, video_id=None):
 308         idstr = u'' if video_id is None else u'%s: ' % video_id
 309         self._downloader.report_warning(
 310             u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
 311
 312     def to_screen(self, msg):
 313         """Print msg to screen, prefixing it with '[ie_name]'"""
 314         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 315
 316     def report_extraction(self, id_or_name):
 317         """Report information extraction."""
 318         self.to_screen(u'%s: Extracting information' % id_or_name)
 319
 320     def report_download_webpage(self, video_id):
 321         """Report webpage download."""
 322         self.to_screen(u'%s: Downloading webpage' % video_id)
 323
 324     def report_age_confirmation(self):
 325         """Report attempt to confirm age."""
 326         self.to_screen(u'Confirming age')
 327
 328     def report_login(self):
 329         """Report attempt to log in."""
 330         self.to_screen(u'Logging in')
 331
 332     #Methods for following #608
 333     @staticmethod
 334     def url_result(url, ie=None, video_id=None):
 335         """Returns a url that points to a page that should be processed"""
 336         #TODO: ie should be the class used for getting the info
 337         video_info = {'_type': 'url',
 338                       'url': url,
 339                       'ie_key': ie}
 340         if video_id is not None:
 341             video_info['id'] = video_id
 342         return video_info
 343     @staticmethod
 344     def playlist_result(entries, playlist_id=None, playlist_title=None):
 345         """Returns a playlist"""
 346         video_info = {'_type': 'playlist',
 347                       'entries': entries}
 348         if playlist_id:
 349             video_info['id'] = playlist_id
 350         if playlist_title:
 351             video_info['title'] = playlist_title
 352         return video_info
 353
 354     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 355         """
 356         Perform a regex search on the given string, using a single or a list of
 357         patterns returning the first matching group.
 358         In case of failure return a default value or raise a WARNING or a
 359         RegexNotFoundError, depending on fatal, specifying the field name.
 360         """
 361         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 362             mobj = re.search(pattern, string, flags)
 363         else:
 364             for p in pattern:
 365                 mobj = re.search(p, string, flags)
 366                 if mobj: break
 367
 368         if os.name != 'nt' and sys.stderr.isatty():
 369             _name = u'\033[0;34m%s\033[0m' % name
 370         else:
 371             _name = name
 372
 373         if mobj:
 374             # return the first matching group
 375             return next(g for g in mobj.groups() if g is not None)
 376         elif default is not _NO_DEFAULT:
 377             return default
 378         elif fatal:
 379             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 380         else:
 381             self._downloader.report_warning(u'unable to extract %s; '
 382                 u'please report this issue on http://yt-dl.org/bug' % _name)
 383             return None
 384
 385     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 386         """
 387         Like _search_regex, but strips HTML tags and unescapes entities.
 388         """
 389         res = self._search_regex(pattern, string, name, default, fatal, flags)
 390         if res:
 391             return clean_html(res).strip()
 392         else:
 393             return res
 394
 395     def _get_login_info(self):
 396         """
 397         Get the the login info as (username, password)
 398         It will look in the netrc file using the _NETRC_MACHINE value
 399         If there's no info available, return (None, None)
 400         """
 401         if self._downloader is None:
 402             return (None, None)
 403
 404         username = None
 405         password = None
 406         downloader_params = self._downloader.params
 407
 408         # Attempt to use provided username and password or .netrc data
 409         if downloader_params.get('username', None) is not None:
 410             username = downloader_params['username']
 411             password = downloader_params['password']
 412         elif downloader_params.get('usenetrc', False):
 413             try:
 414                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 415                 if info is not None:
 416                     username = info[0]
 417                     password = info[2]
 418                 else:
 419                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 420             except (IOError, netrc.NetrcParseError) as err:
 421                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 422
 423         return (username, password)
 424
 425     # Helper functions for extracting OpenGraph info
 426     @staticmethod
 427     def _og_regexes(prop):
 428         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 429         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 430         template = r'<meta[^>]+?%s[^>]+?%s'
 431         return [
 432             template % (property_re, content_re),
 433             template % (content_re, property_re),
 434         ]
 435
 436     def _og_search_property(self, prop, html, name=None, **kargs):
 437         if name is None:
 438             name = 'OpenGraph %s' % prop
 439         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 440         if escaped is None:
 441             return None
 442         return unescapeHTML(escaped)
 443
 444     def _og_search_thumbnail(self, html, **kargs):
 445         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 446
 447     def _og_search_description(self, html, **kargs):
 448         return self._og_search_property('description', html, fatal=False, **kargs)
 449
 450     def _og_search_title(self, html, **kargs):
 451         return self._og_search_property('title', html, **kargs)
 452
 453     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 454         regexes = self._og_regexes('video')
 455         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 456         return self._html_search_regex(regexes, html, name, **kargs)
 457
 458     def _html_search_meta(self, name, html, display_name=None, fatal=False):
 459         if display_name is None:
 460             display_name = name
 461         return self._html_search_regex(
 462             r'''(?ix)<meta
 463                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 464                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 465             html, display_name, fatal=fatal)
 466
 467     def _dc_search_uploader(self, html):
 468         return self._html_search_meta('dc.creator', html, 'uploader')
 469
 470     def _rta_search(self, html):
 471         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 472         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 473                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 474                      html):
 475             return 18
 476         return 0
 477
 478     def _media_rating_search(self, html):
 479         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 480         rating = self._html_search_meta('rating', html)
 481
 482         if not rating:
 483             return None
 484
 485         RATING_TABLE = {
 486             'safe for kids': 0,
 487             'general': 8,
 488             '14 years': 14,
 489             'mature': 17,
 490             'restricted': 19,
 491         }
 492         return RATING_TABLE.get(rating.lower(), None)
 493
 494     def _twitter_search_player(self, html):
 495         return self._html_search_meta('twitter:player', html,
 496             'twitter card player')
 497
 498     def _sort_formats(self, formats):
 499         if not formats:
 500             raise ExtractorError(u'No video formats found')
 501
 502         def _formats_key(f):
 503             # TODO remove the following workaround
 504             from ..utils import determine_ext
 505             if not f.get('ext') and 'url' in f:
 506                 f['ext'] = determine_ext(f['url'])
 507
 508             preference = f.get('preference')
 509             if preference is None:
 510                 proto = f.get('protocol')
 511                 if proto is None:
 512                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 513
 514                 preference = 0 if proto in ['http', 'https'] else -0.1
 515                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 516                     preference -= 0.5
 517
 518             if f.get('vcodec') == 'none':  # audio only
 519                 if self._downloader.params.get('prefer_free_formats'):
 520                     ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
 521                 else:
 522                     ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
 523                 ext_preference = 0
 524                 try:
 525                     audio_ext_preference = ORDER.index(f['ext'])
 526                 except ValueError:
 527                     audio_ext_preference = -1
 528             else:
 529                 if self._downloader.params.get('prefer_free_formats'):
 530                     ORDER = [u'flv', u'mp4', u'webm']
 531                 else:
 532                     ORDER = [u'webm', u'flv', u'mp4']
 533                 try:
 534                     ext_preference = ORDER.index(f['ext'])
 535                 except ValueError:
 536                     ext_preference = -1
 537                 audio_ext_preference = 0
 538
 539             return (
 540                 preference,
 541                 f.get('quality') if f.get('quality') is not None else -1,
 542                 f.get('height') if f.get('height') is not None else -1,
 543                 f.get('width') if f.get('width') is not None else -1,
 544                 ext_preference,
 545                 f.get('tbr') if f.get('tbr') is not None else -1,
 546                 f.get('vbr') if f.get('vbr') is not None else -1,
 547                 f.get('abr') if f.get('abr') is not None else -1,
 548                 audio_ext_preference,
 549                 f.get('filesize') if f.get('filesize') is not None else -1,
 550                 f.get('format_id'),
 551             )
 552         formats.sort(key=_formats_key)
 553
 554     def http_scheme(self):
 555         """ Either "https:" or "https:", depending on the user's preferences """
 556         return (
 557             'http:'
 558             if self._downloader.params.get('prefer_insecure', False)
 559             else 'https:')
 560
 561     def _proto_relative_url(self, url, scheme=None):
 562         if url is None:
 563             return url
 564         if url.startswith('//'):
 565             if scheme is None:
 566                 scheme = self.http_scheme()
 567             return scheme + url
 568         else:
 569             return url
 570
 571
 572 class SearchInfoExtractor(InfoExtractor):
 573     """
 574     Base class for paged search queries extractors.
 575     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 576     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 577     """
 578
 579     @classmethod
 580     def _make_valid_url(cls):
 581         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 582
 583     @classmethod
 584     def suitable(cls, url):
 585         return re.match(cls._make_valid_url(), url) is not None
 586
 587     def _real_extract(self, query):
 588         mobj = re.match(self._make_valid_url(), query)
 589         if mobj is None:
 590             raise ExtractorError(u'Invalid search query "%s"' % query)
 591
 592         prefix = mobj.group('prefix')
 593         query = mobj.group('query')
 594         if prefix == '':
 595             return self._get_n_results(query, 1)
 596         elif prefix == 'all':
 597             return self._get_n_results(query, self._MAX_RESULTS)
 598         else:
 599             n = int(prefix)
 600             if n <= 0:
 601                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 602             elif n > self._MAX_RESULTS:
 603                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 604                 n = self._MAX_RESULTS
 605             return self._get_n_results(query, n)
 606
 607     def _get_n_results(self, query, n):
 608         """Get a specified number of results for a query"""
 609         raise NotImplementedError("This method must be implemented by subclasses")
 610
 611     @property
 612     def SEARCH_KEY(self):
 613         return self._SEARCH_KEY
 614