_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import hashlib
   3 import json
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import sys
   9 import time
  10 import xml.etree.ElementTree
  11
  12 from ..utils import (
  13     compat_http_client,
  14     compat_urllib_error,
  15     compat_urllib_parse_urlparse,
  16     compat_str,
  17
  18     clean_html,
  19     compiled_regex_type,
  20     ExtractorError,
  21     RegexNotFoundError,
  22     sanitize_filename,
  23     unescapeHTML,
  24 )
# Unique sentinel used as the "no default supplied" marker for the `default`
# parameter of the regex search helpers, so that None remains usable as a
# real default value (compared with `is`, never with `==`).
_NO_DEFAULT = object()
  26
  27
  28 class InfoExtractor(object):
  29     """Information Extractor class.
  30
  31     Information extractors are the classes that, given a URL, extract
  32     information about the video (or videos) the URL refers to. This
  33     information includes the real video URL, the video title, author and
  34     others. The information is stored in a dictionary which is then
  35     passed to the FileDownloader. The FileDownloader processes this
  36     information possibly downloading the video to the file system, among
  37     other possible outcomes.
  38
  39     The dictionaries must include the following fields:
  40
  41     id:             Video identifier.
  42     title:          Video title, unescaped.
  43
  44     Additionally, it must contain either a formats entry or a url one:
  45
  46     formats:        A list of dictionaries for each format available, ordered
  47                     from worst to best quality.
  48
  49                     Potential fields:
  50                     * url        Mandatory. The URL of the video file
  51                     * ext        Will be calculated from url if missing
  52                     * format     A human-readable description of the format
  53                                  ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
  56                     * format_id  A short description of the format
  57                                  ("mp4_h264_opus" or "19").
  58                                 Technically optional, but strongly recommended.
  59                     * format_note Additional info about the format
  60                                  ("3D" or "DASH video")
  61                     * width      Width of the video, if known
  62                     * height     Height of the video, if known
  63                     * resolution Textual description of width and height
  64                     * tbr        Average bitrate of audio and video in KBit/s
  65                     * abr        Average audio bitrate in KBit/s
  66                     * acodec     Name of the audio codec in use
  67                     * asr        Audio sampling rate in Hertz
  68                     * vbr        Average video bitrate in KBit/s
  69                     * vcodec     Name of the video codec in use
  70                     * container  Name of the container format
  71                     * filesize   The number of bytes, if known in advance
  72                     * player_url SWF Player URL (used for rtmpdump).
  73                     * protocol   The protocol that will be used for the actual
  74                                  download, lower-case.
  75                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  76                     * preference Order number of this format. If this field is
  77                                  present and not None, the formats get sorted
  78                                  by this field, regardless of all other values.
  79                                  -1 for default (order by other properties),
  80                                  -2 or smaller for less than default.
  81                     * quality    Order number of the video quality of this
  82                                  format, irrespective of the file format.
  83                                  -1 for default (order by other properties),
  84                                  -2 or smaller for less than default.
  85     url:            Final video URL.
  86     ext:            Video filename extension.
  87     format:         The video format, defaults to ext (used for --get-format)
  88     player_url:     SWF Player URL (used for rtmpdump).
  89
  90     The following fields are optional:
  91
  92     display_id      An alternative identifier for the video, not necessarily
  93                     unique, but available before title. Typically, id is
  94                     something like "4234987", title "Dancing naked mole rats",
  95                     and display_id "dancing-naked-mole-rats"
  96     thumbnails:     A list of dictionaries, with the following entries:
  97                         * "url"
  98                         * "width" (optional, int)
  99                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
 102     thumbnail:      Full URL to a video thumbnail image.
 103     description:    One-line video description.
 104     uploader:       Full name of the video uploader.
 105     timestamp:      UNIX timestamp of the moment the video became available.
 106     upload_date:    Video upload date (YYYYMMDD).
 107                     If not explicitly set, calculated from timestamp.
 108     uploader_id:    Nickname or id of the video uploader.
 109     location:       Physical location of the video.
 110     subtitles:      The subtitle file contents as a dictionary in the format
 111                     {language: subtitles}.
 112     duration:       Length of the video in seconds, as an integer.
 113     view_count:     How many users have watched the video on the platform.
 114     like_count:     Number of positive ratings of the video
 115     dislike_count:  Number of negative ratings of the video
 116     comment_count:  Number of comments on the video
 117     age_limit:      Age restriction for the video, as an integer (years)
 118     webpage_url:    The url to the video webpage, if given to youtube-dl it
 119                     should allow to get the same result again. (It will be set
 120                     by YoutubeDL if it's missing)
 121     categories:     A list of categories that the video falls in, for example
 122                     ["Sports", "Berlin"]
 123
 124     Unless mentioned otherwise, the fields should be Unicode strings.
 125
 126     Subclasses of this one should re-define the _real_initialize() and
 127     _real_extract() methods and define a _VALID_URL regexp.
 128     Probably, they should also be added to the list of extractors.
 129
 130     Finally, the _WORKING attribute should be set to False for broken IEs
 131     in order to warn the users and skip the tests.
 132     """
 133
 134     _ready = False
 135     _downloader = None
 136     _WORKING = True
 137
 138     def __init__(self, downloader=None):
 139         """Constructor. Receives an optional downloader."""
 140         self._ready = False
 141         self.set_downloader(downloader)
 142
 143     @classmethod
 144     def suitable(cls, url):
 145         """Receives a URL and returns True if suitable for this IE."""
 146
 147         # This does not use has/getattr intentionally - we want to know whether
 148         # we have cached the regexp for *this* class, whereas getattr would also
 149         # match the superclass
 150         if '_VALID_URL_RE' not in cls.__dict__:
 151             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 152         return cls._VALID_URL_RE.match(url) is not None
 153
 154     @classmethod
 155     def working(cls):
 156         """Getter method for _WORKING."""
 157         return cls._WORKING
 158
 159     def initialize(self):
 160         """Initializes an instance (authentication, etc)."""
 161         if not self._ready:
 162             self._real_initialize()
 163             self._ready = True
 164
 165     def extract(self, url):
 166         """Extracts URL information and returns it in list of dicts."""
 167         self.initialize()
 168         return self._real_extract(url)
 169
 170     def set_downloader(self, downloader):
 171         """Sets the downloader for this IE."""
 172         self._downloader = downloader
 173
 174     def _real_initialize(self):
 175         """Real initialization process. Redefine in subclasses."""
 176         pass
 177
 178     def _real_extract(self, url):
 179         """Real extraction process. Redefine in subclasses."""
 180         pass
 181
 182     @classmethod
 183     def ie_key(cls):
 184         """A string for getting the InfoExtractor with get_info_extractor"""
 185         return cls.__name__[:-2]
 186
 187     @property
 188     def IE_NAME(self):
 189         return type(self).__name__[:-2]
 190
 191     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 192         """ Returns the response handle """
 193         if note is None:
 194             self.report_download_webpage(video_id)
 195         elif note is not False:
 196             if video_id is None:
 197                 self.to_screen(u'%s' % (note,))
 198             else:
 199                 self.to_screen(u'%s: %s' % (video_id, note))
 200         try:
 201             return self._downloader.urlopen(url_or_request)
 202         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 203             if errnote is False:
 204                 return False
 205             if errnote is None:
 206                 errnote = u'Unable to download webpage'
 207             errmsg = u'%s: %s' % (errnote, compat_str(err))
 208             if fatal:
 209                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 210             else:
 211                 self._downloader.report_warning(errmsg)
 212                 return False
 213
 214     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 215         """ Returns a tuple (page content as string, URL handle) """
 216
 217         # Strip hashes from the URL (#1038)
 218         if isinstance(url_or_request, (compat_str, str)):
 219             url_or_request = url_or_request.partition('#')[0]
 220
 221         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 222         if urlh is False:
 223             assert not fatal
 224             return False
 225         content_type = urlh.headers.get('Content-Type', '')
 226         webpage_bytes = urlh.read()
 227         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 228         if m:
 229             encoding = m.group(1)
 230         else:
 231             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 232                           webpage_bytes[:1024])
 233             if m:
 234                 encoding = m.group(1).decode('ascii')
 235             elif webpage_bytes.startswith(b'\xff\xfe'):
 236                 encoding = 'utf-16'
 237             else:
 238                 encoding = 'utf-8'
 239         if self._downloader.params.get('dump_intermediate_pages', False):
 240             try:
 241                 url = url_or_request.get_full_url()
 242             except AttributeError:
 243                 url = url_or_request
 244             self.to_screen(u'Dumping request to ' + url)
 245             dump = base64.b64encode(webpage_bytes).decode('ascii')
 246             self._downloader.to_screen(dump)
 247         if self._downloader.params.get('write_pages', False):
 248             try:
 249                 url = url_or_request.get_full_url()
 250             except AttributeError:
 251                 url = url_or_request
 252             basen = '%s_%s' % (video_id, url)
 253             if len(basen) > 240:
 254                 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 255                 basen = basen[:240 - len(h)] + h
 256             raw_filename = basen + '.dump'
 257             filename = sanitize_filename(raw_filename, restricted=True)
 258             self.to_screen(u'Saving request to ' + filename)
 259             with open(filename, 'wb') as outf:
 260                 outf.write(webpage_bytes)
 261
 262         try:
 263             content = webpage_bytes.decode(encoding, 'replace')
 264         except LookupError:
 265             content = webpage_bytes.decode('utf-8', 'replace')
 266
 267         if (u'<title>Access to this site is blocked</title>' in content and
 268                 u'Websense' in content[:512]):
 269             msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
 270             blocked_iframe = self._html_search_regex(
 271                 r'<iframe src="([^"]+)"', content,
 272                 u'Websense information URL', default=None)
 273             if blocked_iframe:
 274                 msg += u' Visit %s for more details' % blocked_iframe
 275             raise ExtractorError(msg, expected=True)
 276
 277         return (content, urlh)
 278
 279     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 280         """ Returns the data of the page as a string """
 281         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 282         if res is False:
 283             return res
 284         else:
 285             content, _ = res
 286             return content
 287
 288     def _download_xml(self, url_or_request, video_id,
 289                       note=u'Downloading XML', errnote=u'Unable to download XML',
 290                       transform_source=None, fatal=True):
 291         """Return the xml as an xml.etree.ElementTree.Element"""
 292         xml_string = self._download_webpage(
 293             url_or_request, video_id, note, errnote, fatal=fatal)
 294         if xml_string is False:
 295             return xml_string
 296         if transform_source:
 297             xml_string = transform_source(xml_string)
 298         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 299
 300     def _download_json(self, url_or_request, video_id,
 301                        note=u'Downloading JSON metadata',
 302                        errnote=u'Unable to download JSON metadata',
 303                        transform_source=None):
 304         json_string = self._download_webpage(url_or_request, video_id, note, errnote)
 305         if transform_source:
 306             json_string = transform_source(json_string)
 307         try:
 308             return json.loads(json_string)
 309         except ValueError as ve:
 310             raise ExtractorError('Failed to download JSON', cause=ve)
 311
 312     def report_warning(self, msg, video_id=None):
 313         idstr = u'' if video_id is None else u'%s: ' % video_id
 314         self._downloader.report_warning(
 315             u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
 316
 317     def to_screen(self, msg):
 318         """Print msg to screen, prefixing it with '[ie_name]'"""
 319         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 320
 321     def report_extraction(self, id_or_name):
 322         """Report information extraction."""
 323         self.to_screen(u'%s: Extracting information' % id_or_name)
 324
 325     def report_download_webpage(self, video_id):
 326         """Report webpage download."""
 327         self.to_screen(u'%s: Downloading webpage' % video_id)
 328
 329     def report_age_confirmation(self):
 330         """Report attempt to confirm age."""
 331         self.to_screen(u'Confirming age')
 332
 333     def report_login(self):
 334         """Report attempt to log in."""
 335         self.to_screen(u'Logging in')
 336
 337     #Methods for following #608
 338     @staticmethod
 339     def url_result(url, ie=None, video_id=None):
 340         """Returns a url that points to a page that should be processed"""
 341         #TODO: ie should be the class used for getting the info
 342         video_info = {'_type': 'url',
 343                       'url': url,
 344                       'ie_key': ie}
 345         if video_id is not None:
 346             video_info['id'] = video_id
 347         return video_info
 348     @staticmethod
 349     def playlist_result(entries, playlist_id=None, playlist_title=None):
 350         """Returns a playlist"""
 351         video_info = {'_type': 'playlist',
 352                       'entries': entries}
 353         if playlist_id:
 354             video_info['id'] = playlist_id
 355         if playlist_title:
 356             video_info['title'] = playlist_title
 357         return video_info
 358
 359     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 360         """
 361         Perform a regex search on the given string, using a single or a list of
 362         patterns returning the first matching group.
 363         In case of failure return a default value or raise a WARNING or a
 364         RegexNotFoundError, depending on fatal, specifying the field name.
 365         """
 366         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 367             mobj = re.search(pattern, string, flags)
 368         else:
 369             for p in pattern:
 370                 mobj = re.search(p, string, flags)
 371                 if mobj: break
 372
 373         if os.name != 'nt' and sys.stderr.isatty():
 374             _name = u'\033[0;34m%s\033[0m' % name
 375         else:
 376             _name = name
 377
 378         if mobj:
 379             # return the first matching group
 380             return next(g for g in mobj.groups() if g is not None)
 381         elif default is not _NO_DEFAULT:
 382             return default
 383         elif fatal:
 384             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 385         else:
 386             self._downloader.report_warning(u'unable to extract %s; '
 387                 u'please report this issue on http://yt-dl.org/bug' % _name)
 388             return None
 389
 390     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 391         """
 392         Like _search_regex, but strips HTML tags and unescapes entities.
 393         """
 394         res = self._search_regex(pattern, string, name, default, fatal, flags)
 395         if res:
 396             return clean_html(res).strip()
 397         else:
 398             return res
 399
 400     def _get_login_info(self):
 401         """
 402         Get the the login info as (username, password)
 403         It will look in the netrc file using the _NETRC_MACHINE value
 404         If there's no info available, return (None, None)
 405         """
 406         if self._downloader is None:
 407             return (None, None)
 408
 409         username = None
 410         password = None
 411         downloader_params = self._downloader.params
 412
 413         # Attempt to use provided username and password or .netrc data
 414         if downloader_params.get('username', None) is not None:
 415             username = downloader_params['username']
 416             password = downloader_params['password']
 417         elif downloader_params.get('usenetrc', False):
 418             try:
 419                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 420                 if info is not None:
 421                     username = info[0]
 422                     password = info[2]
 423                 else:
 424                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 425             except (IOError, netrc.NetrcParseError) as err:
 426                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 427
 428         return (username, password)
 429
 430     # Helper functions for extracting OpenGraph info
 431     @staticmethod
 432     def _og_regexes(prop):
 433         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 434         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 435         template = r'<meta[^>]+?%s[^>]+?%s'
 436         return [
 437             template % (property_re, content_re),
 438             template % (content_re, property_re),
 439         ]
 440
 441     def _og_search_property(self, prop, html, name=None, **kargs):
 442         if name is None:
 443             name = 'OpenGraph %s' % prop
 444         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 445         if escaped is None:
 446             return None
 447         return unescapeHTML(escaped)
 448
 449     def _og_search_thumbnail(self, html, **kargs):
 450         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 451
 452     def _og_search_description(self, html, **kargs):
 453         return self._og_search_property('description', html, fatal=False, **kargs)
 454
 455     def _og_search_title(self, html, **kargs):
 456         return self._og_search_property('title', html, **kargs)
 457
 458     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 459         regexes = self._og_regexes('video')
 460         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 461         return self._html_search_regex(regexes, html, name, **kargs)
 462
 463     def _og_search_url(self, html, **kargs):
 464         return self._og_search_property('url', html, **kargs)
 465
 466     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 467         if display_name is None:
 468             display_name = name
 469         return self._html_search_regex(
 470             r'''(?ix)<meta
 471                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 472                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 473             html, display_name, fatal=fatal, **kwargs)
 474
 475     def _dc_search_uploader(self, html):
 476         return self._html_search_meta('dc.creator', html, 'uploader')
 477
 478     def _rta_search(self, html):
 479         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 480         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 481                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 482                      html):
 483             return 18
 484         return 0
 485
 486     def _media_rating_search(self, html):
 487         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 488         rating = self._html_search_meta('rating', html)
 489
 490         if not rating:
 491             return None
 492
 493         RATING_TABLE = {
 494             'safe for kids': 0,
 495             'general': 8,
 496             '14 years': 14,
 497             'mature': 17,
 498             'restricted': 19,
 499         }
 500         return RATING_TABLE.get(rating.lower(), None)
 501
 502     def _twitter_search_player(self, html):
 503         return self._html_search_meta('twitter:player', html,
 504             'twitter card player')
 505
 506     def _sort_formats(self, formats):
 507         if not formats:
 508             raise ExtractorError(u'No video formats found')
 509
 510         def _formats_key(f):
 511             # TODO remove the following workaround
 512             from ..utils import determine_ext
 513             if not f.get('ext') and 'url' in f:
 514                 f['ext'] = determine_ext(f['url'])
 515
 516             preference = f.get('preference')
 517             if preference is None:
 518                 proto = f.get('protocol')
 519                 if proto is None:
 520                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 521
 522                 preference = 0 if proto in ['http', 'https'] else -0.1
 523                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 524                     preference -= 0.5
 525
 526             if f.get('vcodec') == 'none':  # audio only
 527                 if self._downloader.params.get('prefer_free_formats'):
 528                     ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
 529                 else:
 530                     ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
 531                 ext_preference = 0
 532                 try:
 533                     audio_ext_preference = ORDER.index(f['ext'])
 534                 except ValueError:
 535                     audio_ext_preference = -1
 536             else:
 537                 if self._downloader.params.get('prefer_free_formats'):
 538                     ORDER = [u'flv', u'mp4', u'webm']
 539                 else:
 540                     ORDER = [u'webm', u'flv', u'mp4']
 541                 try:
 542                     ext_preference = ORDER.index(f['ext'])
 543                 except ValueError:
 544                     ext_preference = -1
 545                 audio_ext_preference = 0
 546
 547             return (
 548                 preference,
 549                 f.get('quality') if f.get('quality') is not None else -1,
 550                 f.get('height') if f.get('height') is not None else -1,
 551                 f.get('width') if f.get('width') is not None else -1,
 552                 ext_preference,
 553                 f.get('tbr') if f.get('tbr') is not None else -1,
 554                 f.get('vbr') if f.get('vbr') is not None else -1,
 555                 f.get('abr') if f.get('abr') is not None else -1,
 556                 audio_ext_preference,
 557                 f.get('filesize') if f.get('filesize') is not None else -1,
 558                 f.get('format_id'),
 559             )
 560         formats.sort(key=_formats_key)
 561
 562     def http_scheme(self):
 563         """ Either "https:" or "https:", depending on the user's preferences """
 564         return (
 565             'http:'
 566             if self._downloader.params.get('prefer_insecure', False)
 567             else 'https:')
 568
 569     def _proto_relative_url(self, url, scheme=None):
 570         if url is None:
 571             return url
 572         if url.startswith('//'):
 573             if scheme is None:
 574                 scheme = self.http_scheme()
 575             return scheme + url
 576         else:
 577             return url
 578
 579     def _sleep(self, timeout, video_id, msg_template=None):
 580         if msg_template is None:
 581             msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
 582         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 583         self.to_screen(msg)
 584         time.sleep(timeout)
 585
 586
 587 class SearchInfoExtractor(InfoExtractor):
 588     """
 589     Base class for paged search queries extractors.
 590     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 591     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 592     """
 593
 594     @classmethod
 595     def _make_valid_url(cls):
 596         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 597
 598     @classmethod
 599     def suitable(cls, url):
 600         return re.match(cls._make_valid_url(), url) is not None
 601
 602     def _real_extract(self, query):
 603         mobj = re.match(self._make_valid_url(), query)
 604         if mobj is None:
 605             raise ExtractorError(u'Invalid search query "%s"' % query)
 606
 607         prefix = mobj.group('prefix')
 608         query = mobj.group('query')
 609         if prefix == '':
 610             return self._get_n_results(query, 1)
 611         elif prefix == 'all':
 612             return self._get_n_results(query, self._MAX_RESULTS)
 613         else:
 614             n = int(prefix)
 615             if n <= 0:
 616                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 617             elif n > self._MAX_RESULTS:
 618                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 619                 n = self._MAX_RESULTS
 620             return self._get_n_results(query, n)
 621
 622     def _get_n_results(self, query, n):
 623         """Get a specified number of results for a query"""
 624         raise NotImplementedError("This method must be implemented by subclasses")
 625
 626     @property
 627     def SEARCH_KEY(self):
 628         return self._SEARCH_KEY