7 import xml.etree.ElementTree
12 compat_urllib_parse_urlparse,
# Unique sentinel meaning "no default was supplied"; lets callers of
# _search_regex/_html_search_regex pass None as a legitimate default value.
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download:
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.

    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): one line is elided from this excerpt before the call
        # (presumably resetting a "not yet initialized" flag) — confirm
        # against the full source.
        self.set_downloader(downloader)
130 def suitable(cls, url):
131 """Receives a URL and returns True if suitable for this IE."""
133 # This does not use has/getattr intentionally - we want to know whether
134 # we have cached the regexp for *this* class, whereas getattr would also
135 # match the superclass
136 if '_VALID_URL_RE' not in cls.__dict__:
137 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
138 return cls._VALID_URL_RE.match(url) is not None
        """Getter method for _WORKING."""
        # NOTE(review): the enclosing def line (and its decorator, if any) for
        # the _WORKING getter is elided from this excerpt.

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): a guard line is elided here (initialization appears
        # intended to run only once) — confirm against the full source.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a line is elided before the call (likely ensuring
        # initialize() has run) — confirm against the full source.
        return self._real_extract(url)
156 def set_downloader(self, downloader):
157 """Sets the downloader for this IE."""
158 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # NOTE(review): the (presumably no-op) body line is elided from this
        # excerpt.

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(review): the body line is elided from this excerpt.
        """A string for getting the InfoExtractor with get_info_extractor"""
        # NOTE(review): the def lines for this method and the IE_NAME
        # accessor below (and any decorators) are elided from this excerpt.
        # Both strip a trailing two-character suffix from the class name.
        return cls.__name__[:-2]

        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # NOTE(review): several control-flow lines (the note-selection
        # if/else chain, the try statement, and the fatal/non-fatal branch)
        # are elided from this excerpt; the indentation below is
        # reconstructed — confirm against the full source.
        self.report_download_webpage(video_id)
        elif note is not False:
            # A note without a video id is printed bare; with one, prefixed.
            self.to_screen(u'%s' % (note,))
            self.to_screen(u'%s: %s' % (video_id, note))
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Default error note when the caller did not supply one.
            errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            # Fatal path: re-raise as ExtractorError carrying the traceback
            # and the original exception as cause...
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            # ...non-fatal path: just warn.
            self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        # NOTE(review): a failure/early-return guard appears to be elided
        # here — confirm against the full source.
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # Fallback: probe the first 1 KiB of the body for a <meta charset>
            # declaration (the conditional lines around these are elided).
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            encoding = m.group(1).decode('ascii')
        # Debug aid: dump the raw page, base64-encoded, to the screen.
        if self._downloader.params.get('dump_intermediate_pages', False):
            # NOTE(review): the try statement and the AttributeError fallback
            # assignment are elided (plain-string URLs have no get_full_url).
            url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # Debug aid: save the raw page to a sanitized dump file on disk.
        if self._downloader.params.get('write_pages', False):
            url = url_or_request.get_full_url()
            except AttributeError:
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # 'replace' keeps extraction alive on mis-declared encodings.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        # NOTE(review): the unpacking of res into (content, handle) and the
        # return statement are elided from this excerpt.
    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        # NOTE(review): a guard line is elided here — transform_source is
        # presumably applied only when provided; confirm upstream.
        xml_string = transform_source(xml_string)
        # Encode to UTF-8 bytes so the parser handles non-ASCII content.
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
263 def report_warning(self, msg, video_id=None):
264 idstr = u'' if video_id is None else u'%s: ' % video_id
265 self._downloader.report_warning(
266 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
268 def to_screen(self, msg):
269 """Print msg to screen, prefixing it with '[ie_name]'"""
270 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
272 def report_extraction(self, id_or_name):
273 """Report information extraction."""
274 self.to_screen(u'%s: Extracting information' % id_or_name)
276 def report_download_webpage(self, video_id):
277 """Report webpage download."""
278 self.to_screen(u'%s: Downloading webpage' % video_id)
280 def report_age_confirmation(self):
281 """Report attempt to confirm age."""
282 self.to_screen(u'Confirming age')
284 def report_login(self):
285 """Report attempt to log in."""
286 self.to_screen(u'Logging in')
    # Methods for producing url/playlist result dictionaries (see issue #608)
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        # NOTE(review): the 'url'/'ie_key' dict entries, the closing brace and
        # the return statement are elided from this excerpt.
        video_info = {'_type': 'url',
        if video_id is not None:
            video_info['id'] = video_id
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        # NOTE(review): the 'entries' dict entry, the closing brace, the two
        # None-guards and the return statement are elided from this excerpt.
        video_info = {'_type': 'playlist',
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
            # NOTE(review): the else branch that iterates a list of patterns
            # (stopping at the first match) is partly elided here.
            mobj = re.search(p, string, flags)

        # Colorize the field name on terminals that support ANSI escapes.
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name

        # NOTE(review): the mobj success-guard and the default-return line are
        # elided from this excerpt.
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        # NOTE(review): a None-guard (and its else return) is elided from
        # this excerpt.
            return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        # NOTE(review): several lines are elided from this excerpt: the
        # no-downloader early return, the username/password defaults, the try
        # statement, and the netrc success branch — confirm upstream.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # A missing netrc entry is surfaced as a parse error so the
                # warning path below reports it.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
381 # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        # Builds regexes matching <meta ... og:PROP ... content=...> tags;
        # content may be double- or single-quoted, and the og property can
        # appear in either a name= or property= attribute.
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        # NOTE(review): the surrounding return-list brackets are elided from
        # this excerpt; both attribute orders are generated.
        template % (property_re, content_re),
        template % (content_re, property_re),
    def _og_search_property(self, prop, html, name=None, **kargs):
        # NOTE(review): two guard lines are elided from this excerpt: the
        # name-is-None check before the default below, and an
        # escaped-is-None early return — confirm upstream.
        name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        # The regex captures raw attribute text; unescape HTML entities.
        return unescapeHTML(escaped)
400 def _og_search_thumbnail(self, html, **kargs):
401 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
403 def _og_search_description(self, html, **kargs):
404 return self._og_search_property('description', html, fatal=False, **kargs)
406 def _og_search_title(self, html, **kargs):
407 return self._og_search_property('title', html, **kargs)
409 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
410 regexes = self._og_regexes('video')
411 if secure: regexes = self._og_regexes('video:secure_url') + regexes
412 return self._html_search_regex(regexes, html, name, **kargs)
    def _html_search_meta(self, name, html, display_name=None):
        # Finds a <meta> tag whose itemprop/name/property attribute equals
        # `name` and returns its content attribute (non-fatal).
        if display_name is None:
            # NOTE(review): the display_name fallback assignment is elided
            # from this excerpt, as is the raw-string opening of the pattern
            # passed below — confirm upstream.
        return self._html_search_regex(
            (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
            [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)
423 def _dc_search_uploader(self, html):
424 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # NOTE(review): the search target argument and both return statements
        # (restricted vs. unrestricted age value) are elided from this
        # excerpt — confirm upstream.
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # NOTE(review): the RATING_TABLE mapping definition (and any
        # rating-is-None guard) is elided from this excerpt — confirm
        # upstream before relying on the lookup below.
        return RATING_TABLE.get(rating.lower(), None)
    def _sort_formats(self, formats):
        # Sorts formats in place from worst to best using a composite key.
        # NOTE(review): the inner key-function def line and a number of
        # branch/assignment lines are elided from this excerpt; the
        # indentation below is reconstructed — confirm against the full
        # source.
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                # Fall back to the URL scheme when no protocol is recorded.
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
                # Plain HTTP(S) downloads are slightly preferred.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']: # Not yet supported

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                    # (else branch — non-free-preferring order — elided)
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    # Unknown extensions rank below all listed ones.
                    audio_ext_preference = -1
                # Video branch: analogous container-extension preference.
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                    ORDER = [u'webm', u'flv', u'mp4']
                    ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = 0

            # Components of the composite sort key (the surrounding return
            # tuple lines are elided from this excerpt):
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,

        formats.sort(key=_formats_key)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
511 def _make_valid_url(cls):
512 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
515 def suitable(cls, url):
516 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parses the "<key><prefix>:<query>" search URL and dispatches to
        # _get_n_results with the requested result count.
        mobj = re.match(self._make_valid_url(), query)
        # NOTE(review): the mobj-is-None guard line is elided here.
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix => a single result; 'all' => up to _MAX_RESULTS;
        # numeric => that many results.
        # NOTE(review): the empty-prefix check, the numeric-prefix parse
        # (n = int(prefix)) and the n <= 0 guard are elided from this
        # excerpt — confirm upstream.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp over-large requests to the extractor's maximum.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
538 def _get_n_results(self, query, n):
539 """Get a specified number of results for a query"""
540 raise NotImplementedError("This method must be implemented by subclasses")
543 def SEARCH_KEY(self):
544 return self._SEARCH_KEY