_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_http_client,
  18     compat_urllib_error,
  19     compat_urllib_parse_urlparse,
  20     compat_urlparse,
  21     compat_str,
  22 )
  23 from ..utils import (
  24     age_restricted,
  25     clean_html,
  26     compiled_regex_type,
  27     ExtractorError,
  28     float_or_none,
  29     int_or_none,
  30     RegexNotFoundError,
  31     sanitize_filename,
  32     unescapeHTML,
  33 )
# Sentinel used as the default value of the ``default`` parameter of
# _search_regex() and _html_search_regex(), so that an explicitly passed
# ``default=None`` can be told apart from "no default supplied".
_NO_DEFAULT = object()
  35
  36
  37 class InfoExtractor(object):
  38     """Information Extractor class.
  39
  40     Information extractors are the classes that, given a URL, extract
  41     information about the video (or videos) the URL refers to. This
  42     information includes the real video URL, the video title, author and
  43     others. The information is stored in a dictionary which is then
  44     passed to the YoutubeDL. The YoutubeDL processes this
  45     information possibly downloading the video to the file system, among
  46     other possible outcomes.
  47
  48     The _type field determines the type of the result.
  49     By far the most common value (and the default if _type is missing) is
  50     "video", which indicates a single video.
  51
  52     For a video, the dictionaries must include the following fields:
  53
  54     id:             Video identifier.
  55     title:          Video title, unescaped.
  56
  57     Additionally, it must contain either a formats entry or a url one:
  58
  59     formats:        A list of dictionaries for each format available, ordered
  60                     from worst to best quality.
  61
  62                     Potential fields:
  63                     * url        Mandatory. The URL of the video file
  64                     * ext        Will be calculated from url if missing
  65                     * format     A human-readable description of the format
  66                                  ("mp4 container with h264/opus").
  67                                  Calculated from the format_id, width, height.
  68                                  and format_note fields if missing.
  69                     * format_id  A short description of the format
  70                                  ("mp4_h264_opus" or "19").
  71                                 Technically optional, but strongly recommended.
  72                     * format_note Additional info about the format
  73                                  ("3D" or "DASH video")
  74                     * width      Width of the video, if known
  75                     * height     Height of the video, if known
  76                     * resolution Textual description of width and height
  77                     * tbr        Average bitrate of audio and video in KBit/s
  78                     * abr        Average audio bitrate in KBit/s
  79                     * acodec     Name of the audio codec in use
  80                     * asr        Audio sampling rate in Hertz
  81                     * vbr        Average video bitrate in KBit/s
  82                     * fps        Frame rate
  83                     * vcodec     Name of the video codec in use
  84                     * container  Name of the container format
  85                     * filesize   The number of bytes, if known in advance
  86                     * filesize_approx  An estimate for the number of bytes
  87                     * player_url SWF Player URL (used for rtmpdump).
  88                     * protocol   The protocol that will be used for the actual
  89                                  download, lower-case.
  90                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  91                     * preference Order number of this format. If this field is
  92                                  present and not None, the formats get sorted
  93                                  by this field, regardless of all other values.
  94                                  -1 for default (order by other properties),
  95                                  -2 or smaller for less than default.
  96                                  < -1000 to hide the format (if there is
  97                                     another one which is strictly better)
  98                     * language_preference  Is this in the correct requested
  99                                  language?
 100                                  10 if it's what the URL is about,
 101                                  -1 for default (don't know),
 102                                  -10 otherwise, other values reserved for now.
 103                     * quality    Order number of the video quality of this
 104                                  format, irrespective of the file format.
 105                                  -1 for default (order by other properties),
 106                                  -2 or smaller for less than default.
 107                     * source_preference  Order number for this video source
 108                                   (quality takes higher priority)
 109                                  -1 for default (order by other properties),
 110                                  -2 or smaller for less than default.
 111                     * http_method  HTTP method to use for the download.
 112                     * http_headers  A dictionary of additional HTTP headers
 113                                  to add to the request.
 114                     * http_post_data  Additional data to send with a POST
 115                                  request.
 116                     * stretched_ratio  If given and not 1, indicates that the
 117                                  video's pixels are not square.
 118                                  width : height ratio as float.
 119                     * no_resume  The server does not support resuming the
 120                                  (HTTP or RTMP) download. Boolean.
 121
 122     url:            Final video URL.
 123     ext:            Video filename extension.
 124     format:         The video format, defaults to ext (used for --get-format)
 125     player_url:     SWF Player URL (used for rtmpdump).
 126
 127     The following fields are optional:
 128
 129     alt_title:      A secondary title of the video.
 130     display_id      An alternative identifier for the video, not necessarily
 131                     unique, but available before title. Typically, id is
 132                     something like "4234987", title "Dancing naked mole rats",
 133                     and display_id "dancing-naked-mole-rats"
 134     thumbnails:     A list of dictionaries, with the following entries:
 135                         * "id" (optional, string) - Thumbnail format ID
 136                         * "url"
 137                         * "preference" (optional, int) - quality of the image
 138                         * "width" (optional, int)
 139                         * "height" (optional, int)
 140                         * "resolution" (optional, string "{width}x{height}",
 141                                         deprecated)
 142     thumbnail:      Full URL to a video thumbnail image.
 143     description:    Full video description.
 144     uploader:       Full name of the video uploader.
 145     timestamp:      UNIX timestamp of the moment the video became available.
 146     upload_date:    Video upload date (YYYYMMDD).
 147                     If not explicitly set, calculated from timestamp.
 148     uploader_id:    Nickname or id of the video uploader.
 149     location:       Physical location where the video was filmed.
 150     subtitles:      The subtitle file contents as a dictionary in the format
 151                     {language: subtitles}.
 152     duration:       Length of the video in seconds, as an integer.
 153     view_count:     How many users have watched the video on the platform.
 154     like_count:     Number of positive ratings of the video
 155     dislike_count:  Number of negative ratings of the video
 156     comment_count:  Number of comments on the video
 157     comments:       A list of comments, each with one or more of the following
 158                     properties (all but one of text or html optional):
 159                         * "author" - human-readable name of the comment author
 160                         * "author_id" - user ID of the comment author
 161                         * "id" - Comment ID
 162                         * "html" - Comment as HTML
 163                         * "text" - Plain text of the comment
 164                         * "timestamp" - UNIX timestamp of comment
 165                         * "parent" - ID of the comment this one is replying to.
 166                                      Set to "root" to indicate that this is a
 167                                      comment to the original video.
 168     age_limit:      Age restriction for the video, as an integer (years)
 169     webpage_url:    The url to the video webpage, if given to youtube-dl it
 170                     should allow to get the same result again. (It will be set
 171                     by YoutubeDL if it's missing)
 172     categories:     A list of categories that the video falls in, for example
 173                     ["Sports", "Berlin"]
 174     is_live:        True, False, or None (=unknown). Whether this video is a
 175                     live stream that goes on instead of a fixed-length video.
 176
 177     Unless mentioned otherwise, the fields should be Unicode strings.
 178
 179     Unless mentioned otherwise, None is equivalent to absence of information.
 180
 181
 182     _type "playlist" indicates multiple videos.
 183     There must be a key "entries", which is a list, an iterable, or a PagedList
 184     object, each element of which is a valid dictionary by this specification.
 185
 186     Additionally, playlists can have "title" and "id" attributes with the same
 187     semantics as videos (see above).
 188
 189
 190     _type "multi_video" indicates that there are multiple videos that
 191     form a single show, for example multiple acts of an opera or TV episode.
 192     It must have an entries key like a playlist and contain all the keys
 193     required for a video at the same time.
 194
 195
 196     _type "url" indicates that the video must be extracted from another
 197     location, possibly by a different extractor. Its only required key is:
 198     "url" - the next URL to extract.
 199     The key "ie_key" can be set to the class name (minus the trailing "IE",
 200     e.g. "Youtube") if the extractor class is known in advance.
 201     Additionally, the dictionary may have any properties of the resolved entity
 202     known in advance, for example "title" if the title of the referred video is
 203     known ahead of time.
 204
 205
 206     _type "url_transparent" entities have the same specification as "url", but
 207     indicate that the given additional information is more precise than the one
 208     associated with the resolved URL.
 209     This is useful when a site employs a video service that hosts the video and
 210     its technical metadata, but that video service does not embed a useful
 211     title, description etc.
 212
 213
 214     Subclasses of this one should re-define the _real_initialize() and
 215     _real_extract() methods and define a _VALID_URL regexp.
 216     Probably, they should also be added to the list of extractors.
 217
 218     Finally, the _WORKING attribute should be set to False for broken IEs
 219     in order to warn the users and skip the tests.
 220     """
 221
    # Set to True by initialize() once _real_initialize() has been run.
    _ready = False
    # Downloader object used for requests and output; see set_downloader().
    _downloader = None
    # Value returned by working(); set to False in broken extractors.
    _WORKING = True
 225
 226     def __init__(self, downloader=None):
 227         """Constructor. Receives an optional downloader."""
 228         self._ready = False
 229         self.set_downloader(downloader)
 230
 231     @classmethod
 232     def suitable(cls, url):
 233         """Receives a URL and returns True if suitable for this IE."""
 234
 235         # This does not use has/getattr intentionally - we want to know whether
 236         # we have cached the regexp for *this* class, whereas getattr would also
 237         # match the superclass
 238         if '_VALID_URL_RE' not in cls.__dict__:
 239             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 240         return cls._VALID_URL_RE.match(url) is not None
 241
 242     @classmethod
 243     def _match_id(cls, url):
 244         if '_VALID_URL_RE' not in cls.__dict__:
 245             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 246         m = cls._VALID_URL_RE.match(url)
 247         assert m
 248         return m.group('id')
 249
 250     @classmethod
 251     def working(cls):
 252         """Getter method for _WORKING."""
 253         return cls._WORKING
 254
 255     def initialize(self):
 256         """Initializes an instance (authentication, etc)."""
 257         if not self._ready:
 258             self._real_initialize()
 259             self._ready = True
 260
 261     def extract(self, url):
 262         """Extracts URL information and returns it in list of dicts."""
 263         self.initialize()
 264         return self._real_extract(url)
 265
 266     def set_downloader(self, downloader):
 267         """Sets the downloader for this IE."""
 268         self._downloader = downloader
 269
 270     def _real_initialize(self):
 271         """Real initialization process. Redefine in subclasses."""
 272         pass
 273
 274     def _real_extract(self, url):
 275         """Real extraction process. Redefine in subclasses."""
 276         pass
 277
 278     @classmethod
 279     def ie_key(cls):
 280         """A string for getting the InfoExtractor with get_info_extractor"""
 281         return cls.__name__[:-2]
 282
 283     @property
 284     def IE_NAME(self):
 285         return type(self).__name__[:-2]
 286
 287     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 288         """ Returns the response handle """
 289         if note is None:
 290             self.report_download_webpage(video_id)
 291         elif note is not False:
 292             if video_id is None:
 293                 self.to_screen('%s' % (note,))
 294             else:
 295                 self.to_screen('%s: %s' % (video_id, note))
 296         try:
 297             return self._downloader.urlopen(url_or_request)
 298         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 299             if errnote is False:
 300                 return False
 301             if errnote is None:
 302                 errnote = 'Unable to download webpage'
 303             errmsg = '%s: %s' % (errnote, compat_str(err))
 304             if fatal:
 305                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 306             else:
 307                 self._downloader.report_warning(errmsg)
 308                 return False
 309
 310     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 311         """ Returns a tuple (page content as string, URL handle) """
 312         # Strip hashes from the URL (#1038)
 313         if isinstance(url_or_request, (compat_str, str)):
 314             url_or_request = url_or_request.partition('#')[0]
 315
 316         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 317         if urlh is False:
 318             assert not fatal
 319             return False
 320         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 321         return (content, urlh)
 322
 323     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
 324         content_type = urlh.headers.get('Content-Type', '')
 325         webpage_bytes = urlh.read()
 326         if prefix is not None:
 327             webpage_bytes = prefix + webpage_bytes
 328         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 329         if m:
 330             encoding = m.group(1)
 331         else:
 332             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 333                           webpage_bytes[:1024])
 334             if m:
 335                 encoding = m.group(1).decode('ascii')
 336             elif webpage_bytes.startswith(b'\xff\xfe'):
 337                 encoding = 'utf-16'
 338             else:
 339                 encoding = 'utf-8'
 340         if self._downloader.params.get('dump_intermediate_pages', False):
 341             try:
 342                 url = url_or_request.get_full_url()
 343             except AttributeError:
 344                 url = url_or_request
 345             self.to_screen('Dumping request to ' + url)
 346             dump = base64.b64encode(webpage_bytes).decode('ascii')
 347             self._downloader.to_screen(dump)
 348         if self._downloader.params.get('write_pages', False):
 349             try:
 350                 url = url_or_request.get_full_url()
 351             except AttributeError:
 352                 url = url_or_request
 353             basen = '%s_%s' % (video_id, url)
 354             if len(basen) > 240:
 355                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 356                 basen = basen[:240 - len(h)] + h
 357             raw_filename = basen + '.dump'
 358             filename = sanitize_filename(raw_filename, restricted=True)
 359             self.to_screen('Saving request to ' + filename)
 360             # Working around MAX_PATH limitation on Windows (see
 361             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 362             if os.name == 'nt':
 363                 absfilepath = os.path.abspath(filename)
 364                 if len(absfilepath) > 259:
 365                     filename = '\\\\?\\' + absfilepath
 366             with open(filename, 'wb') as outf:
 367                 outf.write(webpage_bytes)
 368
 369         try:
 370             content = webpage_bytes.decode(encoding, 'replace')
 371         except LookupError:
 372             content = webpage_bytes.decode('utf-8', 'replace')
 373
 374         if ('<title>Access to this site is blocked</title>' in content and
 375                 'Websense' in content[:512]):
 376             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 377             blocked_iframe = self._html_search_regex(
 378                 r'<iframe src="([^"]+)"', content,
 379                 'Websense information URL', default=None)
 380             if blocked_iframe:
 381                 msg += ' Visit %s for more details' % blocked_iframe
 382             raise ExtractorError(msg, expected=True)
 383
 384         return content
 385
 386     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
 387         """ Returns the data of the page as a string """
 388         success = False
 389         try_count = 0
 390         while success is False:
 391             try:
 392                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 393                 success = True
 394             except compat_http_client.IncompleteRead as e:
 395                 try_count += 1
 396                 if try_count >= tries:
 397                     raise e
 398                 self._sleep(timeout, video_id)
 399         if res is False:
 400             return res
 401         else:
 402             content, _ = res
 403             return content
 404
 405     def _download_xml(self, url_or_request, video_id,
 406                       note='Downloading XML', errnote='Unable to download XML',
 407                       transform_source=None, fatal=True):
 408         """Return the xml as an xml.etree.ElementTree.Element"""
 409         xml_string = self._download_webpage(
 410             url_or_request, video_id, note, errnote, fatal=fatal)
 411         if xml_string is False:
 412             return xml_string
 413         if transform_source:
 414             xml_string = transform_source(xml_string)
 415         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 416
 417     def _download_json(self, url_or_request, video_id,
 418                        note='Downloading JSON metadata',
 419                        errnote='Unable to download JSON metadata',
 420                        transform_source=None,
 421                        fatal=True):
 422         json_string = self._download_webpage(
 423             url_or_request, video_id, note, errnote, fatal=fatal)
 424         if (not fatal) and json_string is False:
 425             return None
 426         return self._parse_json(
 427             json_string, video_id, transform_source=transform_source, fatal=fatal)
 428
 429     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 430         if transform_source:
 431             json_string = transform_source(json_string)
 432         try:
 433             return json.loads(json_string)
 434         except ValueError as ve:
 435             errmsg = '%s: Failed to parse JSON ' % video_id
 436             if fatal:
 437                 raise ExtractorError(errmsg, cause=ve)
 438             else:
 439                 self.report_warning(errmsg + str(ve))
 440
 441     def report_warning(self, msg, video_id=None):
 442         idstr = '' if video_id is None else '%s: ' % video_id
 443         self._downloader.report_warning(
 444             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 445
 446     def to_screen(self, msg):
 447         """Print msg to screen, prefixing it with '[ie_name]'"""
 448         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 449
 450     def report_extraction(self, id_or_name):
 451         """Report information extraction."""
 452         self.to_screen('%s: Extracting information' % id_or_name)
 453
 454     def report_download_webpage(self, video_id):
 455         """Report webpage download."""
 456         self.to_screen('%s: Downloading webpage' % video_id)
 457
 458     def report_age_confirmation(self):
 459         """Report attempt to confirm age."""
 460         self.to_screen('Confirming age')
 461
 462     def report_login(self):
 463         """Report attempt to log in."""
 464         self.to_screen('Logging in')
 465
 466     # Methods for following #608
 467     @staticmethod
 468     def url_result(url, ie=None, video_id=None):
 469         """Returns a url that points to a page that should be processed"""
 470         # TODO: ie should be the class used for getting the info
 471         video_info = {'_type': 'url',
 472                       'url': url,
 473                       'ie_key': ie}
 474         if video_id is not None:
 475             video_info['id'] = video_id
 476         return video_info
 477
 478     @staticmethod
 479     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 480         """Returns a playlist"""
 481         video_info = {'_type': 'playlist',
 482                       'entries': entries}
 483         if playlist_id:
 484             video_info['id'] = playlist_id
 485         if playlist_title:
 486             video_info['title'] = playlist_title
 487         if playlist_description:
 488             video_info['description'] = playlist_description
 489         return video_info
 490
 491     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 492         """
 493         Perform a regex search on the given string, using a single or a list of
 494         patterns returning the first matching group.
 495         In case of failure return a default value or raise a WARNING or a
 496         RegexNotFoundError, depending on fatal, specifying the field name.
 497         """
 498         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 499             mobj = re.search(pattern, string, flags)
 500         else:
 501             for p in pattern:
 502                 mobj = re.search(p, string, flags)
 503                 if mobj:
 504                     break
 505
 506         if os.name != 'nt' and sys.stderr.isatty():
 507             _name = '\033[0;34m%s\033[0m' % name
 508         else:
 509             _name = name
 510
 511         if mobj:
 512             if group is None:
 513                 # return the first matching group
 514                 return next(g for g in mobj.groups() if g is not None)
 515             else:
 516                 return mobj.group(group)
 517         elif default is not _NO_DEFAULT:
 518             return default
 519         elif fatal:
 520             raise RegexNotFoundError('Unable to extract %s' % _name)
 521         else:
 522             self._downloader.report_warning('unable to extract %s; '
 523                                             'please report this issue on http://yt-dl.org/bug' % _name)
 524             return None
 525
 526     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 527         """
 528         Like _search_regex, but strips HTML tags and unescapes entities.
 529         """
 530         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 531         if res:
 532             return clean_html(res).strip()
 533         else:
 534             return res
 535
 536     def _get_login_info(self):
 537         """
 538         Get the the login info as (username, password)
 539         It will look in the netrc file using the _NETRC_MACHINE value
 540         If there's no info available, return (None, None)
 541         """
 542         if self._downloader is None:
 543             return (None, None)
 544
 545         username = None
 546         password = None
 547         downloader_params = self._downloader.params
 548
 549         # Attempt to use provided username and password or .netrc data
 550         if downloader_params.get('username', None) is not None:
 551             username = downloader_params['username']
 552             password = downloader_params['password']
 553         elif downloader_params.get('usenetrc', False):
 554             try:
 555                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 556                 if info is not None:
 557                     username = info[0]
 558                     password = info[2]
 559                 else:
 560                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 561             except (IOError, netrc.NetrcParseError) as err:
 562                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 563
 564         return (username, password)
 565
 566     def _get_tfa_info(self):
 567         """
 568         Get the two-factor authentication info
 569         TODO - asking the user will be required for sms/phone verify
 570         currently just uses the command line option
 571         If there's no info available, return None
 572         """
 573         if self._downloader is None:
 574             return None
 575         downloader_params = self._downloader.params
 576
 577         if downloader_params.get('twofactor', None) is not None:
 578             return downloader_params['twofactor']
 579
 580         return None
 581
 582     # Helper functions for extracting OpenGraph info
 583     @staticmethod
 584     def _og_regexes(prop):
 585         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 586         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 587         template = r'<meta[^>]+?%s[^>]+?%s'
 588         return [
 589             template % (property_re, content_re),
 590             template % (content_re, property_re),
 591         ]
 592
 593     def _og_search_property(self, prop, html, name=None, **kargs):
 594         if name is None:
 595             name = 'OpenGraph %s' % prop
 596         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 597         if escaped is None:
 598             return None
 599         return unescapeHTML(escaped)
 600
 601     def _og_search_thumbnail(self, html, **kargs):
 602         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 603
 604     def _og_search_description(self, html, **kargs):
 605         return self._og_search_property('description', html, fatal=False, **kargs)
 606
 607     def _og_search_title(self, html, **kargs):
 608         return self._og_search_property('title', html, **kargs)
 609
 610     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 611         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 612         if secure:
 613             regexes = self._og_regexes('video:secure_url') + regexes
 614         return self._html_search_regex(regexes, html, name, **kargs)
 615
 616     def _og_search_url(self, html, **kargs):
 617         return self._og_search_property('url', html, **kargs)
 618
 619     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 620         if display_name is None:
 621             display_name = name
 622         return self._html_search_regex(
 623             r'''(?isx)<meta
 624                     (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
 625                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
 626             html, display_name, fatal=fatal, group='content', **kwargs)
 627
 628     def _dc_search_uploader(self, html):
 629         return self._html_search_meta('dc.creator', html, 'uploader')
 630
 631     def _rta_search(self, html):
 632         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 633         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 634                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 635                      html):
 636             return 18
 637         return 0
 638
 639     def _media_rating_search(self, html):
 640         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 641         rating = self._html_search_meta('rating', html)
 642
 643         if not rating:
 644             return None
 645
 646         RATING_TABLE = {
 647             'safe for kids': 0,
 648             'general': 8,
 649             '14 years': 14,
 650             'mature': 17,
 651             'restricted': 19,
 652         }
 653         return RATING_TABLE.get(rating.lower(), None)
 654
 655     def _twitter_search_player(self, html):
 656         return self._html_search_meta('twitter:player', html,
 657                                       'twitter card player')
 658
 659     def _sort_formats(self, formats):
 660         if not formats:
 661             raise ExtractorError('No video formats found')
 662
 663         def _formats_key(f):
 664             # TODO remove the following workaround
 665             from ..utils import determine_ext
 666             if not f.get('ext') and 'url' in f:
 667                 f['ext'] = determine_ext(f['url'])
 668
 669             preference = f.get('preference')
 670             if preference is None:
 671                 proto = f.get('protocol')
 672                 if proto is None:
 673                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 674
 675                 preference = 0 if proto in ['http', 'https'] else -0.1
 676                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 677                     preference -= 0.5
 678
 679             if f.get('vcodec') == 'none':  # audio only
 680                 if self._downloader.params.get('prefer_free_formats'):
 681                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 682                 else:
 683                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 684                 ext_preference = 0
 685                 try:
 686                     audio_ext_preference = ORDER.index(f['ext'])
 687                 except ValueError:
 688                     audio_ext_preference = -1
 689             else:
 690                 if self._downloader.params.get('prefer_free_formats'):
 691                     ORDER = ['flv', 'mp4', 'webm']
 692                 else:
 693                     ORDER = ['webm', 'flv', 'mp4']
 694                 try:
 695                     ext_preference = ORDER.index(f['ext'])
 696                 except ValueError:
 697                     ext_preference = -1
 698                 audio_ext_preference = 0
 699
 700             return (
 701                 preference,
 702                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 703                 f.get('quality') if f.get('quality') is not None else -1,
 704                 f.get('height') if f.get('height') is not None else -1,
 705                 f.get('width') if f.get('width') is not None else -1,
 706                 ext_preference,
 707                 f.get('tbr') if f.get('tbr') is not None else -1,
 708                 f.get('vbr') if f.get('vbr') is not None else -1,
 709                 f.get('abr') if f.get('abr') is not None else -1,
 710                 audio_ext_preference,
 711                 f.get('fps') if f.get('fps') is not None else -1,
 712                 f.get('filesize') if f.get('filesize') is not None else -1,
 713                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 714                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 715                 f.get('format_id'),
 716             )
 717         formats.sort(key=_formats_key)
 718
 719     def http_scheme(self):
 720         """ Either "http:" or "https:", depending on the user's preferences """
 721         return (
 722             'http:'
 723             if self._downloader.params.get('prefer_insecure', False)
 724             else 'https:')
 725
 726     def _proto_relative_url(self, url, scheme=None):
 727         if url is None:
 728             return url
 729         if url.startswith('//'):
 730             if scheme is None:
 731                 scheme = self.http_scheme()
 732             return scheme + url
 733         else:
 734             return url
 735
 736     def _sleep(self, timeout, video_id, msg_template=None):
 737         if msg_template is None:
 738             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 739         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 740         self.to_screen(msg)
 741         time.sleep(timeout)
 742
 743     def _extract_f4m_formats(self, manifest_url, video_id):
 744         manifest = self._download_xml(
 745             manifest_url, video_id, 'Downloading f4m manifest',
 746             'Unable to download f4m manifest')
 747
 748         formats = []
 749         manifest_version = '1.0'
 750         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 751         if not media_nodes:
 752             manifest_version = '2.0'
 753             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 754         for i, media_el in enumerate(media_nodes):
 755             if manifest_version == '2.0':
 756                 manifest_url = '/'.join(manifest_url.split('/')[:-1]) + '/' + media_el.attrib.get('href')
 757             tbr = int_or_none(media_el.attrib.get('bitrate'))
 758             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 759             formats.append({
 760                 'format_id': format_id,
 761                 'url': manifest_url,
 762                 'ext': 'flv',
 763                 'tbr': tbr,
 764                 'width': int_or_none(media_el.attrib.get('width')),
 765                 'height': int_or_none(media_el.attrib.get('height')),
 766             })
 767         self._sort_formats(formats)
 768
 769         return formats
 770
 771     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 772                               entry_protocol='m3u8', preference=None):
 773
 774         formats = [{
 775             'format_id': 'm3u8-meta',
 776             'url': m3u8_url,
 777             'ext': ext,
 778             'protocol': 'm3u8',
 779             'preference': -1,
 780             'resolution': 'multiple',
 781             'format_note': 'Quality selection URL',
 782         }]
 783
 784         format_url = lambda u: (
 785             u
 786             if re.match(r'^https?://', u)
 787             else compat_urlparse.urljoin(m3u8_url, u))
 788
 789         m3u8_doc = self._download_webpage(
 790             m3u8_url, video_id,
 791             note='Downloading m3u8 information',
 792             errnote='Failed to download m3u8 information')
 793         last_info = None
 794         kv_rex = re.compile(
 795             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 796         for line in m3u8_doc.splitlines():
 797             if line.startswith('#EXT-X-STREAM-INF:'):
 798                 last_info = {}
 799                 for m in kv_rex.finditer(line):
 800                     v = m.group('val')
 801                     if v.startswith('"'):
 802                         v = v[1:-1]
 803                     last_info[m.group('key')] = v
 804             elif line.startswith('#') or not line.strip():
 805                 continue
 806             else:
 807                 if last_info is None:
 808                     formats.append({'url': format_url(line)})
 809                     continue
 810                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 811
 812                 f = {
 813                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 814                     'url': format_url(line.strip()),
 815                     'tbr': tbr,
 816                     'ext': ext,
 817                     'protocol': entry_protocol,
 818                     'preference': preference,
 819                 }
 820                 codecs = last_info.get('CODECS')
 821                 if codecs:
 822                     # TODO: looks like video codec is not always necessarily goes first
 823                     va_codecs = codecs.split(',')
 824                     if va_codecs[0]:
 825                         f['vcodec'] = va_codecs[0].partition('.')[0]
 826                     if len(va_codecs) > 1 and va_codecs[1]:
 827                         f['acodec'] = va_codecs[1].partition('.')[0]
 828                 resolution = last_info.get('RESOLUTION')
 829                 if resolution:
 830                     width_str, height_str = resolution.split('x')
 831                     f['width'] = int(width_str)
 832                     f['height'] = int(height_str)
 833                 formats.append(f)
 834                 last_info = {}
 835         self._sort_formats(formats)
 836         return formats
 837
 838     # TODO: improve extraction
 839     def _extract_smil_formats(self, smil_url, video_id):
 840         smil = self._download_xml(
 841             smil_url, video_id, 'Downloading SMIL file',
 842             'Unable to download SMIL file')
 843
 844         base = smil.find('./head/meta').get('base')
 845
 846         formats = []
 847         rtmp_count = 0
 848         for video in smil.findall('./body/switch/video'):
 849             src = video.get('src')
 850             if not src:
 851                 continue
 852             bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
 853             width = int_or_none(video.get('width'))
 854             height = int_or_none(video.get('height'))
 855             proto = video.get('proto')
 856             if not proto:
 857                 if base:
 858                     if base.startswith('rtmp'):
 859                         proto = 'rtmp'
 860                     elif base.startswith('http'):
 861                         proto = 'http'
 862             ext = video.get('ext')
 863             if proto == 'm3u8':
 864                 formats.extend(self._extract_m3u8_formats(src, video_id, ext))
 865             elif proto == 'rtmp':
 866                 rtmp_count += 1
 867                 streamer = video.get('streamer') or base
 868                 formats.append({
 869                     'url': streamer,
 870                     'play_path': src,
 871                     'ext': 'flv',
 872                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
 873                     'tbr': bitrate,
 874                     'width': width,
 875                     'height': height,
 876                 })
 877         self._sort_formats(formats)
 878
 879         return formats
 880
 881     def _live_title(self, name):
 882         """ Generate the title for a live video """
 883         now = datetime.datetime.now()
 884         now_str = now.strftime("%Y-%m-%d %H:%M")
 885         return name + ' ' + now_str
 886
 887     def _int(self, v, name, fatal=False, **kwargs):
 888         res = int_or_none(v, **kwargs)
 889         if 'get_attr' in kwargs:
 890             print(getattr(v, kwargs['get_attr']))
 891         if res is None:
 892             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 893             if fatal:
 894                 raise ExtractorError(msg)
 895             else:
 896                 self._downloader.report_warning(msg)
 897         return res
 898
 899     def _float(self, v, name, fatal=False, **kwargs):
 900         res = float_or_none(v, **kwargs)
 901         if res is None:
 902             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 903             if fatal:
 904                 raise ExtractorError(msg)
 905             else:
 906                 self._downloader.report_warning(msg)
 907         return res
 908
 909     def _set_cookie(self, domain, name, value, expire_time=None):
 910         cookie = compat_cookiejar.Cookie(
 911             0, name, value, None, None, domain, None,
 912             None, '/', True, False, expire_time, '', None, None, None)
 913         self._downloader.cookiejar.set_cookie(cookie)
 914
 915     def get_testcases(self, include_onlymatching=False):
 916         t = getattr(self, '_TEST', None)
 917         if t:
 918             assert not hasattr(self, '_TESTS'), \
 919                 '%s has _TEST and _TESTS' % type(self).__name__
 920             tests = [t]
 921         else:
 922             tests = getattr(self, '_TESTS', [])
 923         for t in tests:
 924             if not include_onlymatching and t.get('only_matching', False):
 925                 continue
 926             t['name'] = type(self).__name__[:-len('IE')]
 927             yield t
 928
 929     def is_suitable(self, age_limit):
 930         """ Test whether the extractor is generally suitable for the given
 931         age limit (i.e. pornographic sites are not, all others usually are) """
 932
 933         any_restricted = False
 934         for tc in self.get_testcases(include_onlymatching=False):
 935             if 'playlist' in tc:
 936                 tc = tc['playlist'][0]
 937             is_restricted = age_restricted(
 938                 tc.get('info_dict', {}).get('age_limit'), age_limit)
 939             if not is_restricted:
 940                 return True
 941             any_restricted = any_restricted or is_restricted
 942         return not any_restricted
 943
 944
 945 class SearchInfoExtractor(InfoExtractor):
 946     """
 947     Base class for paged search queries extractors.
 948     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 949     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 950     """
 951
 952     @classmethod
 953     def _make_valid_url(cls):
 954         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 955
 956     @classmethod
 957     def suitable(cls, url):
 958         return re.match(cls._make_valid_url(), url) is not None
 959
 960     def _real_extract(self, query):
 961         mobj = re.match(self._make_valid_url(), query)
 962         if mobj is None:
 963             raise ExtractorError('Invalid search query "%s"' % query)
 964
 965         prefix = mobj.group('prefix')
 966         query = mobj.group('query')
 967         if prefix == '':
 968             return self._get_n_results(query, 1)
 969         elif prefix == 'all':
 970             return self._get_n_results(query, self._MAX_RESULTS)
 971         else:
 972             n = int(prefix)
 973             if n <= 0:
 974                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 975             elif n > self._MAX_RESULTS:
 976                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 977                 n = self._MAX_RESULTS
 978             return self._get_n_results(query, n)
 979
 980     def _get_n_results(self, query, n):
 981         """Get a specified number of results for a query"""
 982         raise NotImplementedError("This method must be implemented by subclasses")
 983
 984     @property
 985     def SEARCH_KEY(self):
 986         return self._SEARCH_KEY