1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
15 from ..compat import (
22 compat_urllib_parse_urlparse,
23 compat_urllib_request,
47 class InfoExtractor(object):
48 """Information Extractor class.
50 Information extractors are the classes that, given a URL, extract
51 information about the video (or videos) the URL refers to. This
52 information includes the real video URL, the video title, author and
53 others. The information is stored in a dictionary which is then
54 passed to the YoutubeDL. The YoutubeDL processes this
55 information possibly downloading the video to the file system, among
56 other possible outcomes.
58 The type field determines the type of the result.
59 By far the most common value (and the default if _type is missing) is
60 "video", which indicates a single video.
62 For a video, the dictionaries must include the following fields:
65 title: Video title, unescaped.
67 Additionally, it must contain either a formats entry or a url one:
69 formats: A list of dictionaries for each format available, ordered
70 from worst to best quality.
73 * url Mandatory. The URL of the video file
74 * ext Will be calculated from URL if missing
75 * format A human-readable description of the format
76 ("mp4 container with h264/opus").
77 Calculated from the format_id, width, height.
78 and format_note fields if missing.
79 * format_id A short description of the format
80 ("mp4_h264_opus" or "19").
81 Technically optional, but strongly recommended.
82 * format_note Additional info about the format
83 ("3D" or "DASH video")
84 * width Width of the video, if known
85 * height Height of the video, if known
86 * resolution Textual description of width and height
87 * tbr Average bitrate of audio and video in KBit/s
88 * abr Average audio bitrate in KBit/s
89 * acodec Name of the audio codec in use
90 * asr Audio sampling rate in Hertz
91 * vbr Average video bitrate in KBit/s
93 * vcodec Name of the video codec in use
94 * container Name of the container format
95 * filesize The number of bytes, if known in advance
96 * filesize_approx An estimate for the number of bytes
97 * player_url SWF Player URL (used for rtmpdump).
98 * protocol The protocol that will be used for the actual
100 "http", "https", "rtsp", "rtmp", "rtmpe",
101 "m3u8", or "m3u8_native".
102 * preference Order number of this format. If this field is
103 present and not None, the formats get sorted
104 by this field, regardless of all other values.
105 -1 for default (order by other properties),
106 -2 or smaller for less than default.
107 < -1000 to hide the format (if there is
108 another one which is strictly better)
109 * language_preference Is this in the correct requested
111 10 if it's what the URL is about,
112 -1 for default (don't know),
113 -10 otherwise, other values reserved for now.
114 * quality Order number of the video quality of this
115 format, irrespective of the file format.
116 -1 for default (order by other properties),
117 -2 or smaller for less than default.
118 * source_preference Order number for this video source
119 (quality takes higher priority)
120 -1 for default (order by other properties),
121 -2 or smaller for less than default.
122 * http_headers A dictionary of additional HTTP headers
123 to add to the request.
124 * stretched_ratio If given and not 1, indicates that the
125 video's pixels are not square.
126 width : height ratio as float.
127 * no_resume The server does not support resuming the
128 (HTTP or RTMP) download. Boolean.
130 url: Final video URL.
131 ext: Video filename extension.
132 format: The video format, defaults to ext (used for --get-format)
133 player_url: SWF Player URL (used for rtmpdump).
135 The following fields are optional:
137 alt_title: A secondary title of the video.
138 display_id An alternative identifier for the video, not necessarily
139 unique, but available before title. Typically, id is
140 something like "4234987", title "Dancing naked mole rats",
141 and display_id "dancing-naked-mole-rats"
142 thumbnails: A list of dictionaries, with the following entries:
143 * "id" (optional, string) - Thumbnail format ID
145 * "preference" (optional, int) - quality of the image
146 * "width" (optional, int)
147 * "height" (optional, int)
148                    * "resolution" (optional, string "{width}x{height}",
150 thumbnail: Full URL to a video thumbnail image.
151 description: Full video description.
152 uploader: Full name of the video uploader.
153 creator: The main artist who created the video.
154 timestamp: UNIX timestamp of the moment the video became available.
155 upload_date: Video upload date (YYYYMMDD).
156 If not explicitly set, calculated from timestamp.
157 uploader_id: Nickname or id of the video uploader.
158 location: Physical location where the video was filmed.
159 subtitles: The available subtitles as a dictionary in the format
160 {language: subformats}. "subformats" is a list sorted from
161 lower to higher preference, each element is a dictionary
162 with the "ext" entry and one of:
163 * "data": The subtitles file contents
164 * "url": A URL pointing to the subtitles file
165 automatic_captions: Like 'subtitles', used by the YoutubeIE for
166 automatically generated captions
167 duration: Length of the video in seconds, as an integer.
168 view_count: How many users have watched the video on the platform.
169 like_count: Number of positive ratings of the video
170 dislike_count: Number of negative ratings of the video
171 average_rating: Average rating given by users, the scale used depends on the webpage
172 comment_count: Number of comments on the video
173 comments: A list of comments, each with one or more of the following
174 properties (all but one of text or html optional):
175 * "author" - human-readable name of the comment author
176 * "author_id" - user ID of the comment author
178 * "html" - Comment as HTML
179 * "text" - Plain text of the comment
180 * "timestamp" - UNIX timestamp of comment
181 * "parent" - ID of the comment this one is replying to.
182 Set to "root" to indicate that this is a
183 comment to the original video.
184 age_limit: Age restriction for the video, as an integer (years)
185 webpage_url: The URL to the video webpage, if given to youtube-dl it
186 should allow to get the same result again. (It will be set
187 by YoutubeDL if it's missing)
188 categories: A list of categories that the video falls in, for example
190 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
191 is_live: True, False, or None (=unknown). Whether this video is a
192 live stream that goes on instead of a fixed-length video.
193 start_time: Time in seconds where the reproduction should start, as
194 specified in the URL.
195 end_time: Time in seconds where the reproduction should end, as
196 specified in the URL.
198 Unless mentioned otherwise, the fields should be Unicode strings.
200 Unless mentioned otherwise, None is equivalent to absence of information.
203 _type "playlist" indicates multiple videos.
204 There must be a key "entries", which is a list, an iterable, or a PagedList
205 object, each element of which is a valid dictionary by this specification.
207 Additionally, playlists can have "title", "description" and "id" attributes
208 with the same semantics as videos (see above).
211 _type "multi_video" indicates that there are multiple videos that
212 form a single show, for example, multiple acts of an opera or TV episode.
213 It must have an entries key like a playlist and contain all the keys
214 required for a video at the same time.
217 _type "url" indicates that the video must be extracted from another
218 location, possibly by a different extractor. Its only required key is:
219 "url" - the next URL to extract.
220 The key "ie_key" can be set to the class name (minus the trailing "IE",
221 e.g. "Youtube") if the extractor class is known in advance.
222 Additionally, the dictionary may have any properties of the resolved entity
223 known in advance, for example "title" if the title of the referred video is
227 _type "url_transparent" entities have the same specification as "url", but
228 indicate that the given additional information is more precise than the one
229 associated with the resolved URL.
230 This is useful when a site employs a video service that hosts the video and
231 its technical metadata, but that video service does not embed a useful
232 title, description etc.
235 Subclasses of this one should re-define the _real_initialize() and
236 _real_extract() methods and define a _VALID_URL regexp.
237 Probably, they should also be added to the list of extractors.
239 Finally, the _WORKING attribute should be set to False for broken IEs
240 in order to warn the users and skip the tests.
247 def __init__(self, downloader=None):
248 """Constructor. Receives an optional downloader."""
250 self.set_downloader(downloader)
253 def suitable(cls, url):
254 """Receives a URL and returns True if suitable for this IE."""
256 # This does not use has/getattr intentionally - we want to know whether
257 # we have cached the regexp for *this* class, whereas getattr would also
258 # match the superclass
259 if '_VALID_URL_RE' not in cls.__dict__:
260 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
261 return cls._VALID_URL_RE.match(url) is not None
264 def _match_id(cls, url):
265 if '_VALID_URL_RE' not in cls.__dict__:
266 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
267 m = cls._VALID_URL_RE.match(url)
273 """Getter method for _WORKING."""
276 def initialize(self):
277 """Initializes an instance (authentication, etc)."""
279 self._real_initialize()
282 def extract(self, url):
283 """Extracts URL information and returns it in list of dicts."""
286 return self._real_extract(url)
287 except ExtractorError:
289 except compat_http_client.IncompleteRead as e:
290 raise ExtractorError('A network error has occured.', cause=e, expected=True)
291 except (KeyError, StopIteration) as e:
292 raise ExtractorError('An extractor error has occured.', cause=e)
294 def set_downloader(self, downloader):
295 """Sets the downloader for this IE."""
296 self._downloader = downloader
298 def _real_initialize(self):
299 """Real initialization process. Redefine in subclasses."""
302 def _real_extract(self, url):
303 """Real extraction process. Redefine in subclasses."""
308 """A string for getting the InfoExtractor with get_info_extractor"""
309 return cls.__name__[:-2]
313 return type(self).__name__[:-2]
315 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
316 """ Returns the response handle """
318 self.report_download_webpage(video_id)
319 elif note is not False:
321 self.to_screen('%s' % (note,))
323 self.to_screen('%s: %s' % (video_id, note))
325 return self._downloader.urlopen(url_or_request)
326 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
330 errnote = 'Unable to download webpage'
331 errmsg = '%s: %s' % (errnote, compat_str(err))
333 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
335 self._downloader.report_warning(errmsg)
338 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
339 """ Returns a tuple (page content as string, URL handle) """
340 # Strip hashes from the URL (#1038)
341 if isinstance(url_or_request, (compat_str, str)):
342 url_or_request = url_or_request.partition('#')[0]
344 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
348 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
349 return (content, urlh)
352 def _guess_encoding_from_content(content_type, webpage_bytes):
353 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
355 encoding = m.group(1)
357 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
358 webpage_bytes[:1024])
360 encoding = m.group(1).decode('ascii')
361 elif webpage_bytes.startswith(b'\xff\xfe'):
368 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
369 content_type = urlh.headers.get('Content-Type', '')
370 webpage_bytes = urlh.read()
371 if prefix is not None:
372 webpage_bytes = prefix + webpage_bytes
374 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
375 if self._downloader.params.get('dump_intermediate_pages', False):
377 url = url_or_request.get_full_url()
378 except AttributeError:
380 self.to_screen('Dumping request to ' + url)
381 dump = base64.b64encode(webpage_bytes).decode('ascii')
382 self._downloader.to_screen(dump)
383 if self._downloader.params.get('write_pages', False):
385 url = url_or_request.get_full_url()
386 except AttributeError:
388 basen = '%s_%s' % (video_id, url)
390 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
391 basen = basen[:240 - len(h)] + h
392 raw_filename = basen + '.dump'
393 filename = sanitize_filename(raw_filename, restricted=True)
394 self.to_screen('Saving request to ' + filename)
395 # Working around MAX_PATH limitation on Windows (see
396 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
398 absfilepath = os.path.abspath(filename)
399 if len(absfilepath) > 259:
400 filename = '\\\\?\\' + absfilepath
401 with open(filename, 'wb') as outf:
402 outf.write(webpage_bytes)
405 content = webpage_bytes.decode(encoding, 'replace')
407 content = webpage_bytes.decode('utf-8', 'replace')
409 if ('<title>Access to this site is blocked</title>' in content and
410 'Websense' in content[:512]):
411 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
412 blocked_iframe = self._html_search_regex(
413 r'<iframe src="([^"]+)"', content,
414 'Websense information URL', default=None)
416 msg += ' Visit %s for more details' % blocked_iframe
417 raise ExtractorError(msg, expected=True)
418 if '<title>The URL you requested has been blocked</title>' in content[:512]:
420 'Access to this webpage has been blocked by Indian censorship. '
421 'Use a VPN or proxy server (with --proxy) to route around it.')
422 block_msg = self._html_search_regex(
423 r'</h1><p>(.*?)</p>',
424 content, 'block message', default=None)
426 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
427 raise ExtractorError(msg, expected=True)
431 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
432 """ Returns the data of the page as a string """
435 while success is False:
437 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
439 except compat_http_client.IncompleteRead as e:
441 if try_count >= tries:
443 self._sleep(timeout, video_id)
450 def _download_xml(self, url_or_request, video_id,
451 note='Downloading XML', errnote='Unable to download XML',
452 transform_source=None, fatal=True, encoding=None):
453 """Return the xml as an xml.etree.ElementTree.Element"""
454 xml_string = self._download_webpage(
455 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
456 if xml_string is False:
459 xml_string = transform_source(xml_string)
460 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
462 def _download_json(self, url_or_request, video_id,
463 note='Downloading JSON metadata',
464 errnote='Unable to download JSON metadata',
465 transform_source=None,
466 fatal=True, encoding=None):
467 json_string = self._download_webpage(
468 url_or_request, video_id, note, errnote, fatal=fatal,
470 if (not fatal) and json_string is False:
472 return self._parse_json(
473 json_string, video_id, transform_source=transform_source, fatal=fatal)
475 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
477 json_string = transform_source(json_string)
479 return json.loads(json_string)
480 except ValueError as ve:
481 errmsg = '%s: Failed to parse JSON ' % video_id
483 raise ExtractorError(errmsg, cause=ve)
485 self.report_warning(errmsg + str(ve))
487 def report_warning(self, msg, video_id=None):
488 idstr = '' if video_id is None else '%s: ' % video_id
489 self._downloader.report_warning(
490 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
492 def to_screen(self, msg):
493 """Print msg to screen, prefixing it with '[ie_name]'"""
494 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
496 def report_extraction(self, id_or_name):
497 """Report information extraction."""
498 self.to_screen('%s: Extracting information' % id_or_name)
500 def report_download_webpage(self, video_id):
501 """Report webpage download."""
502 self.to_screen('%s: Downloading webpage' % video_id)
504 def report_age_confirmation(self):
505 """Report attempt to confirm age."""
506 self.to_screen('Confirming age')
508 def report_login(self):
509 """Report attempt to log in."""
510 self.to_screen('Logging in')
512 # Methods for following #608
514 def url_result(url, ie=None, video_id=None, video_title=None):
515 """Returns a URL that points to a page that should be processed"""
516 # TODO: ie should be the class used for getting the info
517 video_info = {'_type': 'url',
520 if video_id is not None:
521 video_info['id'] = video_id
522 if video_title is not None:
523 video_info['title'] = video_title
527 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
528 """Returns a playlist"""
529 video_info = {'_type': 'playlist',
532 video_info['id'] = playlist_id
534 video_info['title'] = playlist_title
535 if playlist_description:
536 video_info['description'] = playlist_description
539 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
541 Perform a regex search on the given string, using a single or a list of
542 patterns returning the first matching group.
543 In case of failure return a default value or raise a WARNING or a
544 RegexNotFoundError, depending on fatal, specifying the field name.
546 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
547 mobj = re.search(pattern, string, flags)
550 mobj = re.search(p, string, flags)
554 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
555 _name = '\033[0;34m%s\033[0m' % name
561 # return the first matching group
562 return next(g for g in mobj.groups() if g is not None)
564 return mobj.group(group)
565 elif default is not NO_DEFAULT:
568 raise RegexNotFoundError('Unable to extract %s' % _name)
570 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
573 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
575 Like _search_regex, but strips HTML tags and unescapes entities.
577 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
579 return clean_html(res).strip()
583 def _get_login_info(self):
585 Get the login info as (username, password)
586 It will look in the netrc file using the _NETRC_MACHINE value
587 If there's no info available, return (None, None)
589 if self._downloader is None:
594 downloader_params = self._downloader.params
596 # Attempt to use provided username and password or .netrc data
597 if downloader_params.get('username', None) is not None:
598 username = downloader_params['username']
599 password = downloader_params['password']
600 elif downloader_params.get('usenetrc', False):
602 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
607 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
608 except (IOError, netrc.NetrcParseError) as err:
609 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
611 return (username, password)
613 def _get_tfa_info(self):
615 Get the two-factor authentication info
616 TODO - asking the user will be required for sms/phone verify
617 currently just uses the command line option
618 If there's no info available, return None
620 if self._downloader is None:
622 downloader_params = self._downloader.params
624 if downloader_params.get('twofactor', None) is not None:
625 return downloader_params['twofactor']
629 # Helper functions for extracting OpenGraph info
631 def _og_regexes(prop):
632 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
633 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
634 template = r'<meta[^>]+?%s[^>]+?%s'
636 template % (property_re, content_re),
637 template % (content_re, property_re),
641 def _meta_regex(prop):
642 return r'''(?isx)<meta
643 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
644 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
646 def _og_search_property(self, prop, html, name=None, **kargs):
648 name = 'OpenGraph %s' % prop
649 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
652 return unescapeHTML(escaped)
654 def _og_search_thumbnail(self, html, **kargs):
655 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
657 def _og_search_description(self, html, **kargs):
658 return self._og_search_property('description', html, fatal=False, **kargs)
660 def _og_search_title(self, html, **kargs):
661 return self._og_search_property('title', html, **kargs)
663 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
664 regexes = self._og_regexes('video') + self._og_regexes('video:url')
666 regexes = self._og_regexes('video:secure_url') + regexes
667 return self._html_search_regex(regexes, html, name, **kargs)
669 def _og_search_url(self, html, **kargs):
670 return self._og_search_property('url', html, **kargs)
672 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
673 if display_name is None:
675 return self._html_search_regex(
676 self._meta_regex(name),
677 html, display_name, fatal=fatal, group='content', **kwargs)
679 def _dc_search_uploader(self, html):
680 return self._html_search_meta('dc.creator', html, 'uploader')
682 def _rta_search(self, html):
683 # See http://www.rtalabel.org/index.php?content=howtofaq#single
684 if re.search(r'(?ix)<meta\s+name="rating"\s+'
685 r' content="RTA-5042-1996-1400-1577-RTA"',
690 def _media_rating_search(self, html):
691 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
692 rating = self._html_search_meta('rating', html)
704 return RATING_TABLE.get(rating.lower(), None)
706 def _family_friendly_search(self, html):
707 # See http://schema.org/VideoObject
708 family_friendly = self._html_search_meta('isFamilyFriendly', html)
710 if not family_friendly:
719 return RATING_TABLE.get(family_friendly.lower(), None)
721 def _twitter_search_player(self, html):
722 return self._html_search_meta('twitter:player', html,
723 'twitter card player')
726 def _hidden_inputs(html):
728 for input in re.findall(r'<input([^>]+)>', html):
729 if not re.search(r'type=(["\'])hidden\1', input):
731 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
734 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
737 hidden_inputs[name.group('value')] = value.group('value')
740 def _form_hidden_inputs(self, form_id, html):
741 form = self._search_regex(
742 r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
743 html, '%s form' % form_id, group='form')
744 return self._hidden_inputs(form)
746 def _sort_formats(self, formats, field_preference=None):
748 raise ExtractorError('No video formats found')
751 # TODO remove the following workaround
752 from ..utils import determine_ext
753 if not f.get('ext') and 'url' in f:
754 f['ext'] = determine_ext(f['url'])
756 if isinstance(field_preference, (list, tuple)):
757 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
759 preference = f.get('preference')
760 if preference is None:
761 proto = f.get('protocol')
763 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
765 preference = 0 if proto in ['http', 'https'] else -0.1
766 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
769 if f.get('vcodec') == 'none': # audio only
770 if self._downloader.params.get('prefer_free_formats'):
771 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
773 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
776 audio_ext_preference = ORDER.index(f['ext'])
778 audio_ext_preference = -1
780 if self._downloader.params.get('prefer_free_formats'):
781 ORDER = ['flv', 'mp4', 'webm']
783 ORDER = ['webm', 'flv', 'mp4']
785 ext_preference = ORDER.index(f['ext'])
788 audio_ext_preference = 0
792 f.get('language_preference') if f.get('language_preference') is not None else -1,
793 f.get('quality') if f.get('quality') is not None else -1,
794 f.get('tbr') if f.get('tbr') is not None else -1,
795 f.get('filesize') if f.get('filesize') is not None else -1,
796 f.get('vbr') if f.get('vbr') is not None else -1,
797 f.get('height') if f.get('height') is not None else -1,
798 f.get('width') if f.get('width') is not None else -1,
800 f.get('abr') if f.get('abr') is not None else -1,
801 audio_ext_preference,
802 f.get('fps') if f.get('fps') is not None else -1,
803 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
804 f.get('source_preference') if f.get('source_preference') is not None else -1,
805 f.get('format_id') if f.get('format_id') is not None else '',
807 formats.sort(key=_formats_key)
809 def _check_formats(self, formats, video_id):
812 lambda f: self._is_valid_url(
814 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
817 def _is_valid_url(self, url, video_id, item='video'):
818 url = self._proto_relative_url(url, scheme='http:')
819 # For now assume non HTTP(S) URLs always valid
820 if not (url.startswith('http://') or url.startswith('https://')):
823 self._request_webpage(url, video_id, 'Checking %s URL' % item)
825 except ExtractorError as e:
826 if isinstance(e.cause, compat_HTTPError):
828 '%s: %s URL is invalid, skipping' % (video_id, item))
832 def http_scheme(self):
833 """ Either "http:" or "https:", depending on the user's preferences """
836 if self._downloader.params.get('prefer_insecure', False)
839 def _proto_relative_url(self, url, scheme=None):
842 if url.startswith('//'):
844 scheme = self.http_scheme()
849 def _sleep(self, timeout, video_id, msg_template=None):
850 if msg_template is None:
851 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
852 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
856 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
857 transform_source=lambda s: fix_xml_ampersands(s).strip()):
858 manifest = self._download_xml(
859 manifest_url, video_id, 'Downloading f4m manifest',
860 'Unable to download f4m manifest',
861 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
862 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
863 transform_source=transform_source)
866 manifest_version = '1.0'
867 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
869 manifest_version = '2.0'
870 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
871 for i, media_el in enumerate(media_nodes):
872 if manifest_version == '2.0':
873 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
877 media_url if media_url.startswith('http://') or media_url.startswith('https://')
878 else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
879 # If media_url is itself a f4m manifest do the recursive extraction
880 # since bitrates in parent manifest (this one) and media_url manifest
881 # may differ leading to inability to resolve the format by requested
882 # bitrate in f4m downloader
883 if determine_ext(manifest_url) == 'f4m':
884 formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
886 tbr = int_or_none(media_el.attrib.get('bitrate'))
888 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
892 'width': int_or_none(media_el.attrib.get('width')),
893 'height': int_or_none(media_el.attrib.get('height')),
894 'preference': preference,
896 self._sort_formats(formats)
900 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
901 entry_protocol='m3u8', preference=None,
902 m3u8_id=None, note=None, errnote=None,
906 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
910 'preference': preference - 1 if preference else -1,
911 'resolution': 'multiple',
912 'format_note': 'Quality selection URL',
915 format_url = lambda u: (
917 if re.match(r'^https?://', u)
918 else compat_urlparse.urljoin(m3u8_url, u))
920 m3u8_doc = self._download_webpage(
922 note=note or 'Downloading m3u8 information',
923 errnote=errnote or 'Failed to download m3u8 information',
925 if m3u8_doc is False:
930 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
931 for line in m3u8_doc.splitlines():
932 if line.startswith('#EXT-X-STREAM-INF:'):
934 for m in kv_rex.finditer(line):
936 if v.startswith('"'):
938 last_info[m.group('key')] = v
939 elif line.startswith('#EXT-X-MEDIA:'):
941 for m in kv_rex.finditer(line):
943 if v.startswith('"'):
945 last_media[m.group('key')] = v
946 elif line.startswith('#') or not line.strip():
949 if last_info is None:
950 formats.append({'url': format_url(line)})
952 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
955 format_id.append(m3u8_id)
956 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
957 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
959 'format_id': '-'.join(format_id),
960 'url': format_url(line.strip()),
963 'protocol': entry_protocol,
964 'preference': preference,
966 codecs = last_info.get('CODECS')
968 # TODO: looks like video codec is not always necessarily goes first
969 va_codecs = codecs.split(',')
971 f['vcodec'] = va_codecs[0].partition('.')[0]
972 if len(va_codecs) > 1 and va_codecs[1]:
973 f['acodec'] = va_codecs[1].partition('.')[0]
974 resolution = last_info.get('RESOLUTION')
976 width_str, height_str = resolution.split('x')
977 f['width'] = int(width_str)
978 f['height'] = int(height_str)
979 if last_media is not None:
980 f['m3u8_media'] = last_media
984 self._sort_formats(formats)
988 def _xpath_ns(path, namespace=None):
992 for c in path.split('/'):
993 if not c or c == '.':
996 out.append('{%s}%s' % (namespace, c))
999 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1000 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1006 namespace = self._parse_smil_namespace(smil)
1008 return self._parse_smil_formats(
1009 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1011 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1012 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1015 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1017 def _download_smil(self, smil_url, video_id, fatal=True):
1018 return self._download_xml(
1019 smil_url, video_id, 'Downloading SMIL file',
1020 'Unable to download SMIL file', fatal=fatal)
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        # Parse a SMIL document into an info dict (formats, subtitles, metadata).
        # NOTE(review): several interior lines of this method appear elided in
        # this view (title/description initialization, the loop's `continue`,
        # title assignment, and the opening of the returned dict literal) —
        # do not trust the control flow below as complete.
        namespace = self._parse_smil_namespace(smil)
        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
        # Fall back to the manifest URL's basename (sans extension) as the id.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        # Scan <head><meta name=... content=...> entries for title/description.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content
            'title': title or video_id,
            'description': description,
            'subtitles': subtitles,
1050 def _parse_smil_namespace(self, smil):
1051 return self._search_regex(
1052 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
        # Build the formats list from a SMIL document: RTMP, HLS (m3u8),
        # HDS (f4m) and plain-HTTP variants, one per <video> element.
        # NOTE(review): this view is missing interior lines (e.g. `base` and
        # counter initialization, the rtmp/http format-dict openings, the f4m
        # parameter dict, and the final return) — treat the flow below as a
        # partial transcript.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            # A meta 'base'/'httpBase' attribute supplies the base URL used to
            # resolve relative <video src> values below.
            b = meta.get('base') or meta.get('httpBase')
        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            # system-bitrate is scaled by 1000 — presumably bits/s → KBit/s,
            # matching the `tbr` convention documented on InfoExtractor.
            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base
            # RTMP source: either declared via proto or an rtmp:// streamer URL.
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'filesize': filesize,
            # Resolve relative sources against the discovered base URL.
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            if proto == 'm3u8' or src_ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
            if src_ext == 'f4m':
                'plugin': 'flowplayer-3.2.0.1',
                # Append extra f4m query parameters, preserving any existing '?'.
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
            if src_url.startswith('http'):
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'filesize': filesize,
        self._sort_formats(formats)
    def _parse_smil_subtitles(self, smil, namespace=None):
        # Collect subtitle tracks from <textstream> elements, keyed by language.
        # NOTE(review): lines appear elided in this view (subtitles dict init,
        # a guard/`continue` after the src read, the srt ext override, the
        # appended dict body and the final return) — partial transcript.
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            # Prefer the explicit ext attribute; otherwise derive it from src.
            ext = textstream.get('ext') or determine_ext(src)
            type_ = textstream.get('type')
            if type_ == 'text/srt':
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
            subtitles.setdefault(lang, []).append({
1149 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1150 xspf = self._download_xml(
1151 playlist_url, playlist_id, 'Downloading xpsf playlist',
1152 'Unable to download xspf manifest', fatal=fatal)
1155 return self._parse_xspf(xspf, playlist_id)
    def _parse_xspf(self, playlist, playlist_id):
        # Turn an XSPF playlist XML document into a list of entry info dicts.
        # NOTE(review): the NS_MAP assignment line, the entries-list setup, the
        # title assignment head, the formats-list opening brace and the entry
        # dict opening are elided in this view — partial transcript.
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # <duration> is scaled by 1000 — presumably milliseconds → seconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
            'url': location.text,
            # StreamOne's s1: attributes carry per-location format metadata.
            'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
            'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
            'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
            self._sort_formats(formats)
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
1192 def _live_title(self, name):
1193 """ Generate the title for a live video """
1194 now = datetime.datetime.now()
1195 now_str = now.strftime("%Y-%m-%d %H:%M")
1196 return name + ' ' + now_str
1198 def _int(self, v, name, fatal=False, **kwargs):
1199 res = int_or_none(v, **kwargs)
1200 if 'get_attr' in kwargs:
1201 print(getattr(v, kwargs['get_attr']))
1203 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1205 raise ExtractorError(msg)
1207 self._downloader.report_warning(msg)
1210 def _float(self, v, name, fatal=False, **kwargs):
1211 res = float_or_none(v, **kwargs)
1213 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1215 raise ExtractorError(msg)
1217 self._downloader.report_warning(msg)
1220 def _set_cookie(self, domain, name, value, expire_time=None):
1221 cookie = compat_cookiejar.Cookie(
1222 0, name, value, None, None, domain, None,
1223 None, '/', True, False, expire_time, '', None, None, None)
1224 self._downloader.cookiejar.set_cookie(cookie)
1226 def _get_cookies(self, url):
1227 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1228 req = compat_urllib_request.Request(url)
1229 self._downloader.cookiejar.add_cookie_header(req)
1230 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def get_testcases(self, include_onlymatching=False):
        # Produce this extractor's test cases from _TEST/_TESTS, tagging each
        # with the extractor's name. NOTE(review): lines appear elided in this
        # view (the `if t:` guard, tests collection, the per-test loop, its
        # `continue` and the final yield) — partial transcript.
        t = getattr(self, '_TEST', None)
        # An extractor must define either _TEST or _TESTS, never both.
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        tests = getattr(self, '_TESTS', [])
        # Skip URL-pattern-only ("only_matching") cases unless requested.
        if not include_onlymatching and t.get('only_matching', False):
        # The test name is the class name minus the trailing "IE" suffix.
        t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        # NOTE(review): the body of the `if not is_restricted:` guard below is
        # elided in this view — the visible control flow is incomplete.
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            # Playlist test cases carry the age limit on their first entry.
            if 'playlist' in tc:
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        return not any_restricted
1261 def extract_subtitles(self, *args, **kwargs):
1262 if (self._downloader.params.get('writesubtitles', False) or
1263 self._downloader.params.get('listsubtitles')):
1264 return self._get_subtitles(*args, **kwargs)
1267 def _get_subtitles(self, *args, **kwargs):
1268 raise NotImplementedError("This method must be implemented by subclasses")
1270 def extract_automatic_captions(self, *args, **kwargs):
1271 if (self._downloader.params.get('writeautomaticsub', False) or
1272 self._downloader.params.get('listsubtitles')):
1273 return self._get_automatic_captions(*args, **kwargs)
1276 def _get_automatic_captions(self, *args, **kwargs):
1277 raise NotImplementedError("This method must be implemented by subclasses")
1280 class SearchInfoExtractor(InfoExtractor):
1282 Base class for paged search queries extractors.
1283 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1284 Instances should define _SEARCH_KEY and _MAX_RESULTS.
    def _make_valid_url(cls):
        # Build the URL pattern for search queries: an optional count prefix
        # (empty, a positive integer, or 'all'), then ':' and the query text.
        # NOTE(review): takes `cls` and is invoked via cls/self elsewhere —
        # presumably decorated @classmethod (decorator not visible here);
        # confirm before relying on call style.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
    def suitable(cls, url):
        # True iff url matches this extractor's search-URL pattern.
        # NOTE(review): takes `cls` — presumably a @classmethod (decorator not
        # visible in this view); confirm.
        return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parse the search "URL" and dispatch to _get_n_results with the
        # requested result count. NOTE(review): several guard lines are elided
        # in this view (the `if mobj is None:` check, the empty-prefix branch
        # header, the `n = int(prefix)` assignment and its `n <= 0` check) —
        # the visible control flow is incomplete.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)
        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # An empty prefix means: fetch a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp to the extractor's maximum and warn instead of failing.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
1315 def _get_n_results(self, query, n):
1316 """Get a specified number of results for a query"""
1317 raise NotImplementedError("This method must be implemented by subclasses")
    def SEARCH_KEY(self):
        # Public accessor for the extractor's search key string.
        # NOTE(review): presumably decorated @property (decorator not visible
        # in this view) — confirm before calling it as an attribute.
        return self._SEARCH_KEY