_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_http_client,
  18     compat_urllib_error,
  19     compat_urllib_parse_urlparse,
  20     compat_urlparse,
  21     compat_str,
  22 )
  23 from ..utils import (
  24     age_restricted,
  25     clean_html,
  26     compiled_regex_type,
  27     ExtractorError,
  28     float_or_none,
  29     int_or_none,
  30     RegexNotFoundError,
  31     sanitize_filename,
  32     unescapeHTML,
  33 )
  34 _NO_DEFAULT = object()
  35
  36
  37 class InfoExtractor(object):
  38     """Information Extractor class.
  39
  40     Information extractors are the classes that, given a URL, extract
  41     information about the video (or videos) the URL refers to. This
  42     information includes the real video URL, the video title, author and
  43     others. The information is stored in a dictionary which is then
  44     passed to the YoutubeDL. The YoutubeDL processes this
  45     information possibly downloading the video to the file system, among
  46     other possible outcomes.
  47
  48     The type field determines the type of the result.
  49     By far the most common value (and the default if _type is missing) is
  50     "video", which indicates a single video.
  51
  52     For a video, the dictionaries must include the following fields:
  53
  54     id:             Video identifier.
  55     title:          Video title, unescaped.
  56
  57     Additionally, it must contain either a formats entry or a url one:
  58
  59     formats:        A list of dictionaries for each format available, ordered
  60                     from worst to best quality.
  61
  62                     Potential fields:
  63                     * url        Mandatory. The URL of the video file
  64                     * ext        Will be calculated from url if missing
  65                     * format     A human-readable description of the format
  66                                  ("mp4 container with h264/opus").
  67                                  Calculated from the format_id, width, height
  68                                  and format_note fields if missing.
  69                     * format_id  A short description of the format
  70                                  ("mp4_h264_opus" or "19").
  71                                 Technically optional, but strongly recommended.
  72                     * format_note Additional info about the format
  73                                  ("3D" or "DASH video")
  74                     * width      Width of the video, if known
  75                     * height     Height of the video, if known
  76                     * resolution Textual description of width and height
  77                     * tbr        Average bitrate of audio and video in KBit/s
  78                     * abr        Average audio bitrate in KBit/s
  79                     * acodec     Name of the audio codec in use
  80                     * asr        Audio sampling rate in Hertz
  81                     * vbr        Average video bitrate in KBit/s
  82                     * fps        Frame rate
  83                     * vcodec     Name of the video codec in use
  84                     * container  Name of the container format
  85                     * filesize   The number of bytes, if known in advance
  86                     * filesize_approx  An estimate for the number of bytes
  87                     * player_url SWF Player URL (used for rtmpdump).
  88                     * protocol   The protocol that will be used for the actual
  89                                  download, lower-case.
  90                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  91                     * preference Order number of this format. If this field is
  92                                  present and not None, the formats get sorted
  93                                  by this field, regardless of all other values.
  94                                  -1 for default (order by other properties),
  95                                  -2 or smaller for less than default.
  96                                  < -1000 to hide the format (if there is
  97                                     another one which is strictly better)
  98                     * language_preference  Is this in the correct requested
  99                                  language?
 100                                  10 if it's what the URL is about,
 101                                  -1 for default (don't know),
 102                                  -10 otherwise, other values reserved for now.
 103                     * quality    Order number of the video quality of this
 104                                  format, irrespective of the file format.
 105                                  -1 for default (order by other properties),
 106                                  -2 or smaller for less than default.
 107                     * source_preference  Order number for this video source
 108                                   (quality takes higher priority)
 109                                  -1 for default (order by other properties),
 110                                  -2 or smaller for less than default.
 111                     * http_referer  HTTP Referer header value to set.
 112                     * http_method  HTTP method to use for the download.
 113                     * http_headers  A dictionary of additional HTTP headers
 114                                  to add to the request.
 115                     * http_post_data  Additional data to send with a POST
 116                                  request.
 117     url:            Final video URL.
 118     ext:            Video filename extension.
 119     format:         The video format, defaults to ext (used for --get-format)
 120     player_url:     SWF Player URL (used for rtmpdump).
 121
 122     The following fields are optional:
 123
 124     alt_title:      A secondary title of the video.
 125     display_id      An alternative identifier for the video, not necessarily
 126                     unique, but available before title. Typically, id is
 127                     something like "4234987", title "Dancing naked mole rats",
 128                     and display_id "dancing-naked-mole-rats"
 129     thumbnails:     A list of dictionaries, with the following entries:
 130                         * "url"
 131                         * "width" (optional, int)
 132                         * "height" (optional, int)
 133                         * "resolution" (optional, string "{width}x{height}",
 134                                         deprecated)
 135     thumbnail:      Full URL to a video thumbnail image.
 136     description:    Full video description.
 137     uploader:       Full name of the video uploader.
 138     timestamp:      UNIX timestamp of the moment the video became available.
 139     upload_date:    Video upload date (YYYYMMDD).
 140                     If not explicitly set, calculated from timestamp.
 141     uploader_id:    Nickname or id of the video uploader.
 142     location:       Physical location where the video was filmed.
 143     subtitles:      The subtitle file contents as a dictionary in the format
 144                     {language: subtitles}.
 145     duration:       Length of the video in seconds, as an integer.
 146     view_count:     How many users have watched the video on the platform.
 147     like_count:     Number of positive ratings of the video
 148     dislike_count:  Number of negative ratings of the video
 149     comment_count:  Number of comments on the video
 150     comments:       A list of comments, each with one or more of the following
 151                     properties (all but one of text or html optional):
 152                         * "author" - human-readable name of the comment author
 153                         * "author_id" - user ID of the comment author
 154                         * "id" - Comment ID
 155                         * "html" - Comment as HTML
 156                         * "text" - Plain text of the comment
 157                         * "timestamp" - UNIX timestamp of comment
 158                         * "parent" - ID of the comment this one is replying to.
 159                                      Set to "root" to indicate that this is a
 160                                      comment to the original video.
 161     age_limit:      Age restriction for the video, as an integer (years)
 162     webpage_url:    The url to the video webpage, if given to youtube-dl it
 163                     should allow to get the same result again. (It will be set
 164                     by YoutubeDL if it's missing)
 165     categories:     A list of categories that the video falls in, for example
 166                     ["Sports", "Berlin"]
 167     is_live:        True, False, or None (=unknown). Whether this video is a
 168                     live stream that goes on instead of a fixed-length video.
 169
 170     Unless mentioned otherwise, the fields should be Unicode strings.
 171
 172     Unless mentioned otherwise, None is equivalent to absence of information.
 173
 174
 175     _type "playlist" indicates multiple videos.
 176     There must be a key "entries", which is a list, an iterable, or a PagedList
 177     object, each element of which is a valid dictionary by this specification.
 178
 179     Additionally, playlists can have "title" and "id" attributes with the same
 180     semantics as videos (see above).
 181
 182
 183     _type "multi_video" indicates that there are multiple videos that
 184     form a single show, for example multiple acts of an opera or TV episode.
 185     It must have an entries key like a playlist and contain all the keys
 186     required for a video at the same time.
 187
 188
 189     _type "url" indicates that the video must be extracted from another
 190     location, possibly by a different extractor. Its only required key is:
 191     "url" - the next URL to extract.
 192     The key "ie_key" can be set to the class name (minus the trailing "IE",
 193     e.g. "Youtube") if the extractor class is known in advance.
 194     Additionally, the dictionary may have any properties of the resolved entity
 195     known in advance, for example "title" if the title of the referred video is
 196     known ahead of time.
 197
 198
 199     _type "url_transparent" entities have the same specification as "url", but
 200     indicate that the given additional information is more precise than the one
 201     associated with the resolved URL.
 202     This is useful when a site employs a video service that hosts the video and
 203     its technical metadata, but that video service does not embed a useful
 204     title, description etc.
 205
 206
 207     Subclasses of this one should re-define the _real_initialize() and
 208     _real_extract() methods and define a _VALID_URL regexp.
 209     Probably, they should also be added to the list of extractors.
 210
 211     Finally, the _WORKING attribute should be set to False for broken IEs
 212     in order to warn the users and skip the tests.
 213     """
 214
 215     _ready = False
 216     _downloader = None
 217     _WORKING = True
 218
 219     def __init__(self, downloader=None):
 220         """Constructor. Receives an optional downloader."""
 221         self._ready = False
 222         self.set_downloader(downloader)
 223
 224     @classmethod
 225     def suitable(cls, url):
 226         """Receives a URL and returns True if suitable for this IE."""
 227
 228         # This does not use has/getattr intentionally - we want to know whether
 229         # we have cached the regexp for *this* class, whereas getattr would also
 230         # match the superclass
 231         if '_VALID_URL_RE' not in cls.__dict__:
 232             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 233         return cls._VALID_URL_RE.match(url) is not None
 234
 235     @classmethod
 236     def _match_id(cls, url):
 237         if '_VALID_URL_RE' not in cls.__dict__:
 238             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 239         m = cls._VALID_URL_RE.match(url)
 240         assert m
 241         return m.group('id')
 242
 243     @classmethod
 244     def working(cls):
 245         """Getter method for _WORKING."""
 246         return cls._WORKING
 247
 248     def initialize(self):
 249         """Initializes an instance (authentication, etc)."""
 250         if not self._ready:
 251             self._real_initialize()
 252             self._ready = True
 253
 254     def extract(self, url):
 255         """Extracts URL information and returns it in list of dicts."""
 256         self.initialize()
 257         return self._real_extract(url)
 258
 259     def set_downloader(self, downloader):
 260         """Sets the downloader for this IE."""
 261         self._downloader = downloader
 262
 263     def _real_initialize(self):
 264         """Real initialization process. Redefine in subclasses."""
 265         pass
 266
 267     def _real_extract(self, url):
 268         """Real extraction process. Redefine in subclasses."""
 269         pass
 270
 271     @classmethod
 272     def ie_key(cls):
 273         """A string for getting the InfoExtractor with get_info_extractor"""
 274         return cls.__name__[:-2]
 275
 276     @property
 277     def IE_NAME(self):
 278         return type(self).__name__[:-2]
 279
 280     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 281         """ Returns the response handle """
 282         if note is None:
 283             self.report_download_webpage(video_id)
 284         elif note is not False:
 285             if video_id is None:
 286                 self.to_screen('%s' % (note,))
 287             else:
 288                 self.to_screen('%s: %s' % (video_id, note))
 289         try:
 290             return self._downloader.urlopen(url_or_request)
 291         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 292             if errnote is False:
 293                 return False
 294             if errnote is None:
 295                 errnote = 'Unable to download webpage'
 296             errmsg = '%s: %s' % (errnote, compat_str(err))
 297             if fatal:
 298                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 299             else:
 300                 self._downloader.report_warning(errmsg)
 301                 return False
 302
 303     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 304         """ Returns a tuple (page content as string, URL handle) """
 305         # Strip hashes from the URL (#1038)
 306         if isinstance(url_or_request, (compat_str, str)):
 307             url_or_request = url_or_request.partition('#')[0]
 308
 309         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 310         if urlh is False:
 311             assert not fatal
 312             return False
 313         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 314         return (content, urlh)
 315
 316     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
 317         content_type = urlh.headers.get('Content-Type', '')
 318         webpage_bytes = urlh.read()
 319         if prefix is not None:
 320             webpage_bytes = prefix + webpage_bytes
 321         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 322         if m:
 323             encoding = m.group(1)
 324         else:
 325             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 326                           webpage_bytes[:1024])
 327             if m:
 328                 encoding = m.group(1).decode('ascii')
 329             elif webpage_bytes.startswith(b'\xff\xfe'):
 330                 encoding = 'utf-16'
 331             else:
 332                 encoding = 'utf-8'
 333         if self._downloader.params.get('dump_intermediate_pages', False):
 334             try:
 335                 url = url_or_request.get_full_url()
 336             except AttributeError:
 337                 url = url_or_request
 338             self.to_screen('Dumping request to ' + url)
 339             dump = base64.b64encode(webpage_bytes).decode('ascii')
 340             self._downloader.to_screen(dump)
 341         if self._downloader.params.get('write_pages', False):
 342             try:
 343                 url = url_or_request.get_full_url()
 344             except AttributeError:
 345                 url = url_or_request
 346             basen = '%s_%s' % (video_id, url)
 347             if len(basen) > 240:
 348                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 349                 basen = basen[:240 - len(h)] + h
 350             raw_filename = basen + '.dump'
 351             filename = sanitize_filename(raw_filename, restricted=True)
 352             self.to_screen('Saving request to ' + filename)
 353             # Working around MAX_PATH limitation on Windows (see
 354             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 355             if os.name == 'nt':
 356                 absfilepath = os.path.abspath(filename)
 357                 if len(absfilepath) > 259:
 358                     filename = '\\\\?\\' + absfilepath
 359             with open(filename, 'wb') as outf:
 360                 outf.write(webpage_bytes)
 361
 362         try:
 363             content = webpage_bytes.decode(encoding, 'replace')
 364         except LookupError:
 365             content = webpage_bytes.decode('utf-8', 'replace')
 366
 367         if ('<title>Access to this site is blocked</title>' in content and
 368                 'Websense' in content[:512]):
 369             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 370             blocked_iframe = self._html_search_regex(
 371                 r'<iframe src="([^"]+)"', content,
 372                 'Websense information URL', default=None)
 373             if blocked_iframe:
 374                 msg += ' Visit %s for more details' % blocked_iframe
 375             raise ExtractorError(msg, expected=True)
 376
 377         return content
 378
 379     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
 380         """ Returns the data of the page as a string """
 381         success = False
 382         try_count = 0
 383         while success is False:
 384             try:
 385                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 386                 success = True
 387             except compat_http_client.IncompleteRead as e:
 388                 try_count += 1
 389                 if try_count >= tries:
 390                     raise e
 391                 self._sleep(timeout, video_id)
 392         if res is False:
 393             return res
 394         else:
 395             content, _ = res
 396             return content
 397
 398     def _download_xml(self, url_or_request, video_id,
 399                       note='Downloading XML', errnote='Unable to download XML',
 400                       transform_source=None, fatal=True):
 401         """Return the xml as an xml.etree.ElementTree.Element"""
 402         xml_string = self._download_webpage(
 403             url_or_request, video_id, note, errnote, fatal=fatal)
 404         if xml_string is False:
 405             return xml_string
 406         if transform_source:
 407             xml_string = transform_source(xml_string)
 408         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 409
 410     def _download_json(self, url_or_request, video_id,
 411                        note='Downloading JSON metadata',
 412                        errnote='Unable to download JSON metadata',
 413                        transform_source=None,
 414                        fatal=True):
 415         json_string = self._download_webpage(
 416             url_or_request, video_id, note, errnote, fatal=fatal)
 417         if (not fatal) and json_string is False:
 418             return None
 419         return self._parse_json(
 420             json_string, video_id, transform_source=transform_source, fatal=fatal)
 421
 422     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 423         if transform_source:
 424             json_string = transform_source(json_string)
 425         try:
 426             return json.loads(json_string)
 427         except ValueError as ve:
 428             errmsg = '%s: Failed to parse JSON ' % video_id
 429             if fatal:
 430                 raise ExtractorError(errmsg, cause=ve)
 431             else:
 432                 self.report_warning(errmsg + str(ve))
 433
 434     def report_warning(self, msg, video_id=None):
 435         idstr = '' if video_id is None else '%s: ' % video_id
 436         self._downloader.report_warning(
 437             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 438
 439     def to_screen(self, msg):
 440         """Print msg to screen, prefixing it with '[ie_name]'"""
 441         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 442
 443     def report_extraction(self, id_or_name):
 444         """Report information extraction."""
 445         self.to_screen('%s: Extracting information' % id_or_name)
 446
 447     def report_download_webpage(self, video_id):
 448         """Report webpage download."""
 449         self.to_screen('%s: Downloading webpage' % video_id)
 450
 451     def report_age_confirmation(self):
 452         """Report attempt to confirm age."""
 453         self.to_screen('Confirming age')
 454
 455     def report_login(self):
 456         """Report attempt to log in."""
 457         self.to_screen('Logging in')
 458
 459     # Methods for following #608
 460     @staticmethod
 461     def url_result(url, ie=None, video_id=None):
 462         """Returns a url that points to a page that should be processed"""
 463         # TODO: ie should be the class used for getting the info
 464         video_info = {'_type': 'url',
 465                       'url': url,
 466                       'ie_key': ie}
 467         if video_id is not None:
 468             video_info['id'] = video_id
 469         return video_info
 470
 471     @staticmethod
 472     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 473         """Returns a playlist"""
 474         video_info = {'_type': 'playlist',
 475                       'entries': entries}
 476         if playlist_id:
 477             video_info['id'] = playlist_id
 478         if playlist_title:
 479             video_info['title'] = playlist_title
 480         if playlist_description:
 481             video_info['description'] = playlist_description
 482         return video_info
 483
 484     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 485         """
 486         Perform a regex search on the given string, using a single or a list of
 487         patterns returning the first matching group.
 488         In case of failure return a default value or raise a WARNING or a
 489         RegexNotFoundError, depending on fatal, specifying the field name.
 490         """
 491         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 492             mobj = re.search(pattern, string, flags)
 493         else:
 494             for p in pattern:
 495                 mobj = re.search(p, string, flags)
 496                 if mobj:
 497                     break
 498
 499         if os.name != 'nt' and sys.stderr.isatty():
 500             _name = '\033[0;34m%s\033[0m' % name
 501         else:
 502             _name = name
 503
 504         if mobj:
 505             if group is None:
 506                 # return the first matching group
 507                 return next(g for g in mobj.groups() if g is not None)
 508             else:
 509                 return mobj.group(group)
 510         elif default is not _NO_DEFAULT:
 511             return default
 512         elif fatal:
 513             raise RegexNotFoundError('Unable to extract %s' % _name)
 514         else:
 515             self._downloader.report_warning('unable to extract %s; '
 516                                             'please report this issue on http://yt-dl.org/bug' % _name)
 517             return None
 518
 519     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 520         """
 521         Like _search_regex, but strips HTML tags and unescapes entities.
 522         """
 523         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 524         if res:
 525             return clean_html(res).strip()
 526         else:
 527             return res
 528
 529     def _get_login_info(self):
 530         """
 531         Get the the login info as (username, password)
 532         It will look in the netrc file using the _NETRC_MACHINE value
 533         If there's no info available, return (None, None)
 534         """
 535         if self._downloader is None:
 536             return (None, None)
 537
 538         username = None
 539         password = None
 540         downloader_params = self._downloader.params
 541
 542         # Attempt to use provided username and password or .netrc data
 543         if downloader_params.get('username', None) is not None:
 544             username = downloader_params['username']
 545             password = downloader_params['password']
 546         elif downloader_params.get('usenetrc', False):
 547             try:
 548                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 549                 if info is not None:
 550                     username = info[0]
 551                     password = info[2]
 552                 else:
 553                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 554             except (IOError, netrc.NetrcParseError) as err:
 555                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 556
 557         return (username, password)
 558
 559     def _get_tfa_info(self):
 560         """
 561         Get the two-factor authentication info
 562         TODO - asking the user will be required for sms/phone verify
 563         currently just uses the command line option
 564         If there's no info available, return None
 565         """
 566         if self._downloader is None:
 567             return None
 568         downloader_params = self._downloader.params
 569
 570         if downloader_params.get('twofactor', None) is not None:
 571             return downloader_params['twofactor']
 572
 573         return None
 574
 575     # Helper functions for extracting OpenGraph info
 576     @staticmethod
 577     def _og_regexes(prop):
 578         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 579         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 580         template = r'<meta[^>]+?%s[^>]+?%s'
 581         return [
 582             template % (property_re, content_re),
 583             template % (content_re, property_re),
 584         ]
 585
 586     def _og_search_property(self, prop, html, name=None, **kargs):
 587         if name is None:
 588             name = 'OpenGraph %s' % prop
 589         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 590         if escaped is None:
 591             return None
 592         return unescapeHTML(escaped)
 593
 594     def _og_search_thumbnail(self, html, **kargs):
 595         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 596
 597     def _og_search_description(self, html, **kargs):
 598         return self._og_search_property('description', html, fatal=False, **kargs)
 599
 600     def _og_search_title(self, html, **kargs):
 601         return self._og_search_property('title', html, **kargs)
 602
 603     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 604         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 605         if secure:
 606             regexes = self._og_regexes('video:secure_url') + regexes
 607         return self._html_search_regex(regexes, html, name, **kargs)
 608
 609     def _og_search_url(self, html, **kargs):
 610         return self._og_search_property('url', html, **kargs)
 611
 612     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 613         if display_name is None:
 614             display_name = name
 615         return self._html_search_regex(
 616             r'''(?isx)<meta
 617                     (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
 618                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
 619             html, display_name, fatal=fatal, group='content', **kwargs)
 620
 621     def _dc_search_uploader(self, html):
 622         return self._html_search_meta('dc.creator', html, 'uploader')
 623
 624     def _rta_search(self, html):
 625         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 626         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 627                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 628                      html):
 629             return 18
 630         return 0
 631
 632     def _media_rating_search(self, html):
 633         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 634         rating = self._html_search_meta('rating', html)
 635
 636         if not rating:
 637             return None
 638
 639         RATING_TABLE = {
 640             'safe for kids': 0,
 641             'general': 8,
 642             '14 years': 14,
 643             'mature': 17,
 644             'restricted': 19,
 645         }
 646         return RATING_TABLE.get(rating.lower(), None)
 647
 648     def _twitter_search_player(self, html):
 649         return self._html_search_meta('twitter:player', html,
 650                                       'twitter card player')
 651
 652     def _sort_formats(self, formats):
 653         if not formats:
 654             raise ExtractorError('No video formats found')
 655
 656         def _formats_key(f):
 657             # TODO remove the following workaround
 658             from ..utils import determine_ext
 659             if not f.get('ext') and 'url' in f:
 660                 f['ext'] = determine_ext(f['url'])
 661
 662             preference = f.get('preference')
 663             if preference is None:
 664                 proto = f.get('protocol')
 665                 if proto is None:
 666                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 667
 668                 preference = 0 if proto in ['http', 'https'] else -0.1
 669                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 670                     preference -= 0.5
 671
 672             if f.get('vcodec') == 'none':  # audio only
 673                 if self._downloader.params.get('prefer_free_formats'):
 674                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 675                 else:
 676                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 677                 ext_preference = 0
 678                 try:
 679                     audio_ext_preference = ORDER.index(f['ext'])
 680                 except ValueError:
 681                     audio_ext_preference = -1
 682             else:
 683                 if self._downloader.params.get('prefer_free_formats'):
 684                     ORDER = ['flv', 'mp4', 'webm']
 685                 else:
 686                     ORDER = ['webm', 'flv', 'mp4']
 687                 try:
 688                     ext_preference = ORDER.index(f['ext'])
 689                 except ValueError:
 690                     ext_preference = -1
 691                 audio_ext_preference = 0
 692
 693             return (
 694                 preference,
 695                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 696                 f.get('quality') if f.get('quality') is not None else -1,
 697                 f.get('height') if f.get('height') is not None else -1,
 698                 f.get('width') if f.get('width') is not None else -1,
 699                 ext_preference,
 700                 f.get('tbr') if f.get('tbr') is not None else -1,
 701                 f.get('vbr') if f.get('vbr') is not None else -1,
 702                 f.get('abr') if f.get('abr') is not None else -1,
 703                 audio_ext_preference,
 704                 f.get('fps') if f.get('fps') is not None else -1,
 705                 f.get('filesize') if f.get('filesize') is not None else -1,
 706                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 707                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 708                 f.get('format_id'),
 709             )
 710         formats.sort(key=_formats_key)
 711
 712     def http_scheme(self):
 713         """ Either "http:" or "https:", depending on the user's preferences """
 714         return (
 715             'http:'
 716             if self._downloader.params.get('prefer_insecure', False)
 717             else 'https:')
 718
 719     def _proto_relative_url(self, url, scheme=None):
 720         if url is None:
 721             return url
 722         if url.startswith('//'):
 723             if scheme is None:
 724                 scheme = self.http_scheme()
 725             return scheme + url
 726         else:
 727             return url
 728
 729     def _sleep(self, timeout, video_id, msg_template=None):
 730         if msg_template is None:
 731             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 732         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 733         self.to_screen(msg)
 734         time.sleep(timeout)
 735
 736     def _extract_f4m_formats(self, manifest_url, video_id):
 737         manifest = self._download_xml(
 738             manifest_url, video_id, 'Downloading f4m manifest',
 739             'Unable to download f4m manifest')
 740
 741         formats = []
 742         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 743         for i, media_el in enumerate(media_nodes):
 744             tbr = int_or_none(media_el.attrib.get('bitrate'))
 745             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 746             formats.append({
 747                 'format_id': format_id,
 748                 'url': manifest_url,
 749                 'ext': 'flv',
 750                 'tbr': tbr,
 751                 'width': int_or_none(media_el.attrib.get('width')),
 752                 'height': int_or_none(media_el.attrib.get('height')),
 753             })
 754         self._sort_formats(formats)
 755
 756         return formats
 757
 758     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 759                               entry_protocol='m3u8', preference=None):
 760
 761         formats = [{
 762             'format_id': 'm3u8-meta',
 763             'url': m3u8_url,
 764             'ext': ext,
 765             'protocol': 'm3u8',
 766             'preference': -1,
 767             'resolution': 'multiple',
 768             'format_note': 'Quality selection URL',
 769         }]
 770
 771         format_url = lambda u: (
 772             u
 773             if re.match(r'^https?://', u)
 774             else compat_urlparse.urljoin(m3u8_url, u))
 775
 776         m3u8_doc = self._download_webpage(
 777             m3u8_url, video_id,
 778             note='Downloading m3u8 information',
 779             errnote='Failed to download m3u8 information')
 780         last_info = None
 781         kv_rex = re.compile(
 782             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 783         for line in m3u8_doc.splitlines():
 784             if line.startswith('#EXT-X-STREAM-INF:'):
 785                 last_info = {}
 786                 for m in kv_rex.finditer(line):
 787                     v = m.group('val')
 788                     if v.startswith('"'):
 789                         v = v[1:-1]
 790                     last_info[m.group('key')] = v
 791             elif line.startswith('#') or not line.strip():
 792                 continue
 793             else:
 794                 if last_info is None:
 795                     formats.append({'url': format_url(line)})
 796                     continue
 797                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 798
 799                 f = {
 800                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 801                     'url': format_url(line.strip()),
 802                     'tbr': tbr,
 803                     'ext': ext,
 804                     'protocol': entry_protocol,
 805                     'preference': preference,
 806                 }
 807                 codecs = last_info.get('CODECS')
 808                 if codecs:
 809                     # TODO: looks like video codec is not always necessarily goes first
 810                     va_codecs = codecs.split(',')
 811                     if va_codecs[0]:
 812                         f['vcodec'] = va_codecs[0].partition('.')[0]
 813                     if len(va_codecs) > 1 and va_codecs[1]:
 814                         f['acodec'] = va_codecs[1].partition('.')[0]
 815                 resolution = last_info.get('RESOLUTION')
 816                 if resolution:
 817                     width_str, height_str = resolution.split('x')
 818                     f['width'] = int(width_str)
 819                     f['height'] = int(height_str)
 820                 formats.append(f)
 821                 last_info = {}
 822         self._sort_formats(formats)
 823         return formats
 824
 825     # TODO: improve extraction
 826     def _extract_smil_formats(self, smil_url, video_id):
 827         smil = self._download_xml(
 828             smil_url, video_id, 'Downloading SMIL file',
 829             'Unable to download SMIL file')
 830
 831         base = smil.find('./head/meta').get('base')
 832
 833         formats = []
 834         rtmp_count = 0
 835         for video in smil.findall('./body/switch/video'):
 836             src = video.get('src')
 837             if not src:
 838                 continue
 839             bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
 840             width = int_or_none(video.get('width'))
 841             height = int_or_none(video.get('height'))
 842             proto = video.get('proto')
 843             if not proto:
 844                 if base:
 845                     if base.startswith('rtmp'):
 846                         proto = 'rtmp'
 847                     elif base.startswith('http'):
 848                         proto = 'http'
 849             ext = video.get('ext')
 850             if proto == 'm3u8':
 851                 formats.extend(self._extract_m3u8_formats(src, video_id, ext))
 852             elif proto == 'rtmp':
 853                 rtmp_count += 1
 854                 streamer = video.get('streamer') or base
 855                 formats.append({
 856                     'url': streamer,
 857                     'play_path': src,
 858                     'ext': 'flv',
 859                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
 860                     'tbr': bitrate,
 861                     'width': width,
 862                     'height': height,
 863                 })
 864         self._sort_formats(formats)
 865
 866         return formats
 867
 868     def _live_title(self, name):
 869         """ Generate the title for a live video """
 870         now = datetime.datetime.now()
 871         now_str = now.strftime("%Y-%m-%d %H:%M")
 872         return name + ' ' + now_str
 873
 874     def _int(self, v, name, fatal=False, **kwargs):
 875         res = int_or_none(v, **kwargs)
 876         if 'get_attr' in kwargs:
 877             print(getattr(v, kwargs['get_attr']))
 878         if res is None:
 879             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 880             if fatal:
 881                 raise ExtractorError(msg)
 882             else:
 883                 self._downloader.report_warning(msg)
 884         return res
 885
 886     def _float(self, v, name, fatal=False, **kwargs):
 887         res = float_or_none(v, **kwargs)
 888         if res is None:
 889             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 890             if fatal:
 891                 raise ExtractorError(msg)
 892             else:
 893                 self._downloader.report_warning(msg)
 894         return res
 895
 896     def _set_cookie(self, domain, name, value, expire_time=None):
 897         cookie = compat_cookiejar.Cookie(
 898             0, name, value, None, None, domain, None,
 899             None, '/', True, False, expire_time, '', None, None, None)
 900         self._downloader.cookiejar.set_cookie(cookie)
 901
 902     def get_testcases(self, include_onlymatching=False):
 903         t = getattr(self, '_TEST', None)
 904         if t:
 905             assert not hasattr(self, '_TESTS'), \
 906                 '%s has _TEST and _TESTS' % type(self).__name__
 907             tests = [t]
 908         else:
 909             tests = getattr(self, '_TESTS', [])
 910         for t in tests:
 911             if not include_onlymatching and t.get('only_matching', False):
 912                 continue
 913             t['name'] = type(self).__name__[:-len('IE')]
 914             yield t
 915
 916     def is_suitable(self, age_limit):
 917         """ Test whether the extractor is generally suitable for the given
 918         age limit (i.e. pornographic sites are not, all others usually are) """
 919
 920         any_restricted = False
 921         for tc in self.get_testcases(include_onlymatching=False):
 922             if 'playlist' in tc:
 923                 tc = tc['playlist'][0]
 924             is_restricted = age_restricted(
 925                 tc.get('info_dict', {}).get('age_limit'), age_limit)
 926             if not is_restricted:
 927                 return True
 928             any_restricted = any_restricted or is_restricted
 929         return not any_restricted
 930
 931
 932 class SearchInfoExtractor(InfoExtractor):
 933     """
 934     Base class for paged search queries extractors.
 935     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 936     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 937     """
 938
 939     @classmethod
 940     def _make_valid_url(cls):
 941         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 942
 943     @classmethod
 944     def suitable(cls, url):
 945         return re.match(cls._make_valid_url(), url) is not None
 946
 947     def _real_extract(self, query):
 948         mobj = re.match(self._make_valid_url(), query)
 949         if mobj is None:
 950             raise ExtractorError('Invalid search query "%s"' % query)
 951
 952         prefix = mobj.group('prefix')
 953         query = mobj.group('query')
 954         if prefix == '':
 955             return self._get_n_results(query, 1)
 956         elif prefix == 'all':
 957             return self._get_n_results(query, self._MAX_RESULTS)
 958         else:
 959             n = int(prefix)
 960             if n <= 0:
 961                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 962             elif n > self._MAX_RESULTS:
 963                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 964                 n = self._MAX_RESULTS
 965             return self._get_n_results(query, n)
 966
 967     def _get_n_results(self, query, n):
 968         """Get a specified number of results for a query"""
 969         raise NotImplementedError("This method must be implemented by subclasses")
 970
 971     @property
 972     def SEARCH_KEY(self):
 973         return self._SEARCH_KEY