9 import xml.etree.ElementTree
14 compat_urllib_parse_urlparse,
# Unique sentinel: lets helpers such as _search_regex() distinguish
# "caller passed no default" from an explicit default of None.
_NO_DEFAULT = object()
27 class InfoExtractor(object):
28 """Information Extractor class.
30 Information extractors are the classes that, given a URL, extract
31 information about the video (or videos) the URL refers to. This
32 information includes the real video URL, the video title, author and
33 others. The information is stored in a dictionary which is then
34 passed to the FileDownloader. The FileDownloader processes this
35 information possibly downloading the video to the file system, among
36 other possible outcomes.
38 The dictionaries must include the following fields:
41 title: Video title, unescaped.
43 Additionally, it must contain either a formats entry or a url one:
45 formats: A list of dictionaries for each format available, ordered
46 from worst to best quality.
49 * url Mandatory. The URL of the video file
50 * ext Will be calculated from url if missing
51 * format A human-readable description of the format
52 ("mp4 container with h264/opus").
Calculated from the format_id, width, height,
and format_note fields if missing.
55 * format_id A short description of the format
56 ("mp4_h264_opus" or "19").
57 Technically optional, but strongly recommended.
58 * format_note Additional info about the format
59 ("3D" or "DASH video")
60 * width Width of the video, if known
61 * height Height of the video, if known
62 * resolution Textual description of width and height
63 * tbr Average bitrate of audio and video in KBit/s
64 * abr Average audio bitrate in KBit/s
65 * acodec Name of the audio codec in use
66 * asr Audio sampling rate in Hertz
67 * vbr Average video bitrate in KBit/s
68 * vcodec Name of the video codec in use
69 * container Name of the container format
70 * filesize The number of bytes, if known in advance
71 * player_url SWF Player URL (used for rtmpdump).
72 * protocol The protocol that will be used for the actual
74 "http", "https", "rtsp", "rtmp", "m3u8" or so.
75 * preference Order number of this format. If this field is
76 present and not None, the formats get sorted
78 -1 for default (order by other properties),
79 -2 or smaller for less than default.
80 * quality Order number of the video quality of this
81 format, irrespective of the file format.
82 -1 for default (order by other properties),
83 -2 or smaller for less than default.
85 ext: Video filename extension.
86 format: The video format, defaults to ext (used for --get-format)
87 player_url: SWF Player URL (used for rtmpdump).
89 The following fields are optional:
91 display_id An alternative identifier for the video, not necessarily
92 unique, but available before title. Typically, id is
93 something like "4234987", title "Dancing naked mole rats",
94 and display_id "dancing-naked-mole-rats"
95 thumbnails: A list of dictionaries (with the entries "resolution" and
96 "url") for the varying thumbnails
97 thumbnail: Full URL to a video thumbnail image.
98 description: One-line video description.
99 uploader: Full name of the video uploader.
100 upload_date: Video upload date (YYYYMMDD).
101 uploader_id: Nickname or id of the video uploader.
102 location: Physical location of the video.
103 subtitles: The subtitle file contents as a dictionary in the format
104 {language: subtitles}.
105 duration: Length of the video in seconds, as an integer.
106 view_count: How many users have watched the video on the platform.
107 like_count: Number of positive ratings of the video
108 dislike_count: Number of negative ratings of the video
109 comment_count: Number of comments on the video
110 age_limit: Age restriction for the video, as an integer (years)
111 webpage_url: The url to the video webpage, if given to youtube-dl it
112 should allow to get the same result again. (It will be set
113 by YoutubeDL if it's missing)
115 Unless mentioned otherwise, the fields should be Unicode strings.
117 Subclasses of this one should re-define the _real_initialize() and
118 _real_extract() methods and define a _VALID_URL regexp.
119 Probably, they should also be added to the list of extractors.
121 _real_extract() must return a *list* of information dictionaries as
124 Finally, the _WORKING attribute should be set to False for broken IEs
125 in order to warn the users and skip the tests.
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader.

    downloader: the FileDownloader-style object that performs network
        operations; may also be attached later via set_downloader().
    """
    self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""

    # Deliberately probe cls.__dict__ instead of hasattr/getattr: the
    # compiled pattern must be cached on *this* class, and getattr would
    # also pick up a regexp cached on a superclass.
    try:
        compiled = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return compiled.match(url) is not None
150 """Getter method for _WORKING."""
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # Delegates to the subclass hook _real_initialize().
    self._real_initialize()
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # Run initialization (authentication, etc. -- see initialize()) before
    # handing off to the subclass extraction hook.
    self.initialize()
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach *downloader* (a FileDownloader-like object) to this IE."""
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    # Intentionally a no-op in the base class.
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    # No base implementation; every concrete IE must override this.
178 """A string for getting the InfoExtractor with get_info_extractor"""
179 return cls.__name__[:-2]
183 return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the response handle """
    # note semantics: None -> default "Downloading webpage" message,
    # False -> stay silent, any string -> printed (with video id if known).
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen(u'%s' % (note,))
        else:
            self.to_screen(u'%s: %s' % (video_id, note))
    try:
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # errnote=False means the caller handles failures entirely itself
        if errnote is False:
            return False
        if errnote is None:
            errnote = u'Unable to download webpage'
        errmsg = u'%s: %s' % (errnote, compat_str(err))
        if fatal:
            # Preserve the original traceback of the network error
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        else:
            self._downloader.report_warning(errmsg)
            return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle) """

    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage only returns False in non-fatal mode
        assert not fatal
        return False
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    # Pick the text encoding: HTTP Content-Type header first, then a
    # <meta charset> tag, then a UTF-16 BOM check, finally UTF-8.
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
            # UTF-16 little-endian byte order mark
            encoding = 'utf-16'
        else:
            encoding = 'utf-8'
    if self._downloader.params.get('dump_intermediate_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        self.to_screen(u'Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        # Keep the dump filename a manageable length
        if len(url) > 200:
            h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest()
            url = url[:200 - len(h)] + h
        raw_filename = ('%s_%s.dump' % (video_id, url))
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen(u'Saving request to ' + filename)
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    content = webpage_bytes.decode(encoding, 'replace')
    return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the data of the page as a string """
    res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
    if res is False:
        # _download_webpage_handle already reported the non-fatal failure
        return res
    else:
        # res is a (content, URL handle) tuple; only the content is wanted
        content, _ = res
        return content
def _download_xml(self, url_or_request, video_id,
                  note=u'Downloading XML', errnote=u'Unable to download XML',
                  transform_source=None):
    """Return the xml as an xml.etree.ElementTree.Element

    transform_source: optional callable applied to the downloaded text
        before it is parsed (default None -> no transformation).
    """
    xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
    # Only call transform_source when one was actually supplied
    if transform_source:
        xml_string = transform_source(xml_string)
    return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
                   note=u'Downloading JSON metadata',
                   errnote=u'Unable to download JSON metadata',
                   transform_source=None):
    """Download a page and parse it as JSON.

    transform_source: optional callable applied to the downloaded text
        before parsing (default None -> no transformation).
    Raises ExtractorError when the text is not valid JSON.
    """
    json_string = self._download_webpage(url_or_request, video_id, note, errnote)
    if transform_source:
        json_string = transform_source(json_string)
    try:
        return json.loads(json_string)
    except ValueError as ve:
        raise ExtractorError('Failed to download JSON', cause=ve)
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, tagged with the IE name and,
    when given, the video id."""
    if video_id is None:
        idstr = u''
    else:
        idstr = u'%s: ' % video_id
    self._downloader.report_warning(u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Write msg to the screen via the downloader, prefixed '[ie_name]'."""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the video's webpage is being downloaded."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce an attempt to confirm the user's age."""
    self.to_screen(u'Confirming age')
def report_login(self):
    """Announce a login attempt."""
    self.to_screen(u'Logging in')
# Methods for following #608
def url_result(url, ie=None, video_id=None):
    """Returns a url that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url',
                  'url': url,
                  'ie_key': ie}
    # The id is optional; only include it when known
    if video_id is not None:
        video_info['id'] = video_id
    return video_info
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Returns a playlist"""
    video_info = {'_type': 'playlist',
                  'entries': entries}
    # id and title are optional; only include them when truthy
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    return video_info
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # A list of patterns: the first one that matches wins
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Colorize the field name on terminals that support ANSI escapes
    if os.name != 'nt' and sys.stderr.isatty():
        _name = u'\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError(u'Unable to extract %s' % _name)
    else:
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags)
    if res:
        return clean_html(res).strip()
    else:
        # Propagate falsy results (e.g. None from a non-fatal miss) as-is
        return res
def _get_login_info(self):
    """
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)
    """
    # Without a downloader there is nowhere to read credentials from
    if self._downloader is None:
        return (None, None)

    username = None
    password = None
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        try:
            # authenticators() returns a (login, account, password) triple
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                username = info[0]
                password = info[2]
            else:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            # .netrc problems are non-fatal: warn and fall through
            self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

    return (username, password)
406 # Helper functions for extracting OpenGraph info
408 def _og_regexes(prop):
409 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
410 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
411 template = r'<meta[^>]+?%s[^>]+?%s'
413 template % (property_re, content_re),
414 template % (content_re, property_re),
def _og_search_property(self, prop, html, name=None, **kargs):
    """Extract the value of the og:<prop> OpenGraph meta tag from html."""
    if name is None:
        name = 'OpenGraph %s' % prop
    escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
    # A non-fatal miss yields None; don't try to unescape it
    if escaped is None:
        return None
    return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    """Extract og:image as the thumbnail URL; a missing tag is not fatal."""
    return self._og_search_property(
        'image', html, u'thumbnail url', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
    """Extract og:description; a missing tag is not fatal."""
    return self._og_search_property(
        'description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
    """Extract the og:title value from html."""
    return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """Find the OpenGraph video URL, trying og:video:secure_url first
    when secure is True."""
    regexes = self._og_regexes('video')
    if secure:
        regexes = self._og_regexes('video:secure_url') + regexes
    return self._html_search_regex(regexes, html, name, **kargs)
def _html_search_meta(self, name, html, display_name=None):
    """Extract the content attribute of the <meta> tag whose
    itemprop/name/property equals *name*; non-fatal (returns None)."""
    if display_name is None:
        display_name = name
    return self._html_search_regex(
        r'''(?ix)<meta
            (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
            [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
        html, display_name, fatal=False)
def _dc_search_uploader(self, html):
    """Look up the Dublin Core creator meta tag as the uploader name."""
    return self._html_search_meta('dc.creator', html, 'uploader')
def _rta_search(self, html):
    """Return age limit 18 if the page carries the RTA adult label, else 0."""
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    if re.search(r'(?ix)<meta\s+name="rating"\s+'
                 r' content="RTA-5042-1996-1400-1577-RTA"',
                 html):
        return 18

    # No RTA label present
    return 0
def _media_rating_search(self, html):
    """Map a <meta name="rating"> value to an age limit (None if absent
    or unrecognized)."""
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)

    if not rating:
        return None

    # NOTE(review): table values reconstructed -- confirm against the
    # rating vocabulary actually used by supported sites.
    RATING_TABLE = {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }
    return RATING_TABLE.get(rating.lower(), None)
def _twitter_search_player(self, html):
    """Return the Twitter-card player URL from the page's meta tags."""
    return self._html_search_meta(
        'twitter:player', html, 'twitter card player')
def _sort_formats(self, formats):
    """Sort the formats list in place from worst to best quality."""
    if not formats:
        raise ExtractorError(u'No video formats found')

    def _formats_key(f):
        # Sort key for one format dict; larger tuples sort later (= better).
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            # Plain HTTP(S) downloads are preferred over other protocols
            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
                preference -= 0.5

        if f.get('vcodec') == 'none': # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
            else:
                ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(f['ext'])
            except ValueError:
                audio_ext_preference = -1
        else:
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = [u'flv', u'mp4', u'webm']
            else:
                ORDER = [u'webm', u'flv', u'mp4']
            try:
                ext_preference = ORDER.index(f['ext'])
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        return (
            preference,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            ext_preference,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('format_id'),
        )
    formats.sort(key=_formats_key)
536 class SearchInfoExtractor(InfoExtractor):
538 Base class for paged search queries extractors.
539 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
540 Instances should define _SEARCH_KEY and _MAX_RESULTS.
def _make_valid_url(cls):
    """Build the search-"URL" pattern from this extractor's _SEARCH_KEY."""
    prefix_and_query = r'(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
    return r'%s%s' % (cls._SEARCH_KEY, prefix_and_query)
def suitable(cls, url):
    """A URL is suitable iff it matches this extractor's search pattern."""
    mobj = re.match(cls._make_valid_url(), url)
    return mobj is not None
def _real_extract(self, query):
    """Parse the search "URL" (SEARCH_KEY + count prefix + query) and
    dispatch to _get_n_results with the requested result count."""
    mobj = re.match(self._make_valid_url(), query)
    if mobj is None:
        raise ExtractorError(u'Invalid search query "%s"' % query)

    prefix = mobj.group('prefix')
    query = mobj.group('query')
    if prefix == '':
        # No count given: fetch a single result
        return self._get_n_results(query, 1)
    elif prefix == 'all':
        return self._get_n_results(query, self._MAX_RESULTS)
    else:
        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp oversized requests to the extractor's maximum
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
def _get_n_results(self, query, n):
    """Fetch up to n results for the search query (subclass hook)."""
    raise NotImplementedError("This method must be implemented by subclasses")
def SEARCH_KEY(self):
    # Read-only accessor for the class-level search key.
    # NOTE(review): presumably decorated with @property in the full
    # source -- the decorator line is not visible here; confirm.
    return self._SEARCH_KEY