_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_HTTPError,
  19     compat_http_client,
  20     compat_urllib_error,
  21     compat_urllib_parse,
  22     compat_urllib_parse_urlparse,
  23     compat_urllib_request,
  24     compat_urlparse,
  25     compat_str,
  26 )
  27 from ..utils import (
  28     NO_DEFAULT,
  29     age_restricted,
  30     bug_reports_message,
  31     clean_html,
  32     compiled_regex_type,
  33     determine_ext,
  34     ExtractorError,
  35     fix_xml_ampersands,
  36     float_or_none,
  37     int_or_none,
  38     RegexNotFoundError,
  39     sanitize_filename,
  40     unescapeHTML,
  41     url_basename,
  42 )
  43
  44
  45 class InfoExtractor(object):
  46     """Information Extractor class.
  47
  48     Information extractors are the classes that, given a URL, extract
  49     information about the video (or videos) the URL refers to. This
  50     information includes the real video URL, the video title, author and
  51     others. The information is stored in a dictionary which is then
  52     passed to the YoutubeDL. The YoutubeDL processes this
  53     information possibly downloading the video to the file system, among
  54     other possible outcomes.
  55
  56     The type field determines the type of the result.
  57     By far the most common value (and the default if _type is missing) is
  58     "video", which indicates a single video.
  59
  60     For a video, the dictionaries must include the following fields:
  61
  62     id:             Video identifier.
  63     title:          Video title, unescaped.
  64
  65     Additionally, it must contain either a formats entry or a url one:
  66
  67     formats:        A list of dictionaries for each format available, ordered
  68                     from worst to best quality.
  69
  70                     Potential fields:
  71                     * url        Mandatory. The URL of the video file
  72                     * ext        Will be calculated from URL if missing
  73                     * format     A human-readable description of the format
  74                                  ("mp4 container with h264/opus").
   75                                  Calculated from the format_id, width, height,
  76                                  and format_note fields if missing.
  77                     * format_id  A short description of the format
  78                                  ("mp4_h264_opus" or "19").
  79                                 Technically optional, but strongly recommended.
  80                     * format_note Additional info about the format
  81                                  ("3D" or "DASH video")
  82                     * width      Width of the video, if known
  83                     * height     Height of the video, if known
  84                     * resolution Textual description of width and height
  85                     * tbr        Average bitrate of audio and video in KBit/s
  86                     * abr        Average audio bitrate in KBit/s
  87                     * acodec     Name of the audio codec in use
  88                     * asr        Audio sampling rate in Hertz
  89                     * vbr        Average video bitrate in KBit/s
  90                     * fps        Frame rate
  91                     * vcodec     Name of the video codec in use
  92                     * container  Name of the container format
  93                     * filesize   The number of bytes, if known in advance
  94                     * filesize_approx  An estimate for the number of bytes
  95                     * player_url SWF Player URL (used for rtmpdump).
  96                     * protocol   The protocol that will be used for the actual
  97                                  download, lower-case.
  98                                  "http", "https", "rtsp", "rtmp", "rtmpe",
  99                                  "m3u8", or "m3u8_native".
 100                     * preference Order number of this format. If this field is
 101                                  present and not None, the formats get sorted
 102                                  by this field, regardless of all other values.
 103                                  -1 for default (order by other properties),
 104                                  -2 or smaller for less than default.
 105                                  < -1000 to hide the format (if there is
 106                                     another one which is strictly better)
 107                     * language_preference  Is this in the correct requested
 108                                  language?
 109                                  10 if it's what the URL is about,
 110                                  -1 for default (don't know),
 111                                  -10 otherwise, other values reserved for now.
 112                     * quality    Order number of the video quality of this
 113                                  format, irrespective of the file format.
 114                                  -1 for default (order by other properties),
 115                                  -2 or smaller for less than default.
 116                     * source_preference  Order number for this video source
 117                                   (quality takes higher priority)
 118                                  -1 for default (order by other properties),
 119                                  -2 or smaller for less than default.
 120                     * http_headers  A dictionary of additional HTTP headers
 121                                  to add to the request.
 122                     * stretched_ratio  If given and not 1, indicates that the
 123                                  video's pixels are not square.
 124                                  width : height ratio as float.
 125                     * no_resume  The server does not support resuming the
 126                                  (HTTP or RTMP) download. Boolean.
 127
 128     url:            Final video URL.
 129     ext:            Video filename extension.
 130     format:         The video format, defaults to ext (used for --get-format)
 131     player_url:     SWF Player URL (used for rtmpdump).
 132
 133     The following fields are optional:
 134
 135     alt_title:      A secondary title of the video.
 136     display_id      An alternative identifier for the video, not necessarily
 137                     unique, but available before title. Typically, id is
 138                     something like "4234987", title "Dancing naked mole rats",
 139                     and display_id "dancing-naked-mole-rats"
 140     thumbnails:     A list of dictionaries, with the following entries:
 141                         * "id" (optional, string) - Thumbnail format ID
 142                         * "url"
 143                         * "preference" (optional, int) - quality of the image
 144                         * "width" (optional, int)
 145                         * "height" (optional, int)
  146                         * "resolution" (optional, string "{width}x{height}",
 147                                         deprecated)
 148     thumbnail:      Full URL to a video thumbnail image.
 149     description:    Full video description.
 150     uploader:       Full name of the video uploader.
 151     creator:        The main artist who created the video.
 152     timestamp:      UNIX timestamp of the moment the video became available.
 153     upload_date:    Video upload date (YYYYMMDD).
 154                     If not explicitly set, calculated from timestamp.
 155     uploader_id:    Nickname or id of the video uploader.
 156     location:       Physical location where the video was filmed.
 157     subtitles:      The available subtitles as a dictionary in the format
 158                     {language: subformats}. "subformats" is a list sorted from
 159                     lower to higher preference, each element is a dictionary
 160                     with the "ext" entry and one of:
 161                         * "data": The subtitles file contents
 162                         * "url": A URL pointing to the subtitles file
 163     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 164                     automatically generated captions
 165     duration:       Length of the video in seconds, as an integer.
 166     view_count:     How many users have watched the video on the platform.
 167     like_count:     Number of positive ratings of the video
 168     dislike_count:  Number of negative ratings of the video
  169     average_rating: Average rating given by users, the scale used depends on the webpage
 170     comment_count:  Number of comments on the video
 171     comments:       A list of comments, each with one or more of the following
 172                     properties (all but one of text or html optional):
 173                         * "author" - human-readable name of the comment author
 174                         * "author_id" - user ID of the comment author
 175                         * "id" - Comment ID
 176                         * "html" - Comment as HTML
 177                         * "text" - Plain text of the comment
 178                         * "timestamp" - UNIX timestamp of comment
 179                         * "parent" - ID of the comment this one is replying to.
 180                                      Set to "root" to indicate that this is a
 181                                      comment to the original video.
 182     age_limit:      Age restriction for the video, as an integer (years)
 183     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 184                     should allow to get the same result again. (It will be set
 185                     by YoutubeDL if it's missing)
 186     categories:     A list of categories that the video falls in, for example
 187                     ["Sports", "Berlin"]
 188     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 189     is_live:        True, False, or None (=unknown). Whether this video is a
 190                     live stream that goes on instead of a fixed-length video.
 191     start_time:     Time in seconds where the reproduction should start, as
 192                     specified in the URL.
 193     end_time:       Time in seconds where the reproduction should end, as
 194                     specified in the URL.
 195
 196     Unless mentioned otherwise, the fields should be Unicode strings.
 197
 198     Unless mentioned otherwise, None is equivalent to absence of information.
 199
 200
 201     _type "playlist" indicates multiple videos.
 202     There must be a key "entries", which is a list, an iterable, or a PagedList
 203     object, each element of which is a valid dictionary by this specification.
 204
 205     Additionally, playlists can have "title" and "id" attributes with the same
 206     semantics as videos (see above).
 207
 208
 209     _type "multi_video" indicates that there are multiple videos that
  210     form a single show, for example multiple acts of an opera or TV episode.
 211     It must have an entries key like a playlist and contain all the keys
 212     required for a video at the same time.
 213
 214
 215     _type "url" indicates that the video must be extracted from another
 216     location, possibly by a different extractor. Its only required key is:
 217     "url" - the next URL to extract.
 218     The key "ie_key" can be set to the class name (minus the trailing "IE",
 219     e.g. "Youtube") if the extractor class is known in advance.
 220     Additionally, the dictionary may have any properties of the resolved entity
 221     known in advance, for example "title" if the title of the referred video is
 222     known ahead of time.
 223
 224
 225     _type "url_transparent" entities have the same specification as "url", but
 226     indicate that the given additional information is more precise than the one
 227     associated with the resolved URL.
 228     This is useful when a site employs a video service that hosts the video and
 229     its technical metadata, but that video service does not embed a useful
 230     title, description etc.
 231
 232
 233     Subclasses of this one should re-define the _real_initialize() and
 234     _real_extract() methods and define a _VALID_URL regexp.
 235     Probably, they should also be added to the list of extractors.
 236
 237     Finally, the _WORKING attribute should be set to False for broken IEs
 238     in order to warn the users and skip the tests.
 239     """
 240
 241     _ready = False
 242     _downloader = None
 243     _WORKING = True
 244
 245     def __init__(self, downloader=None):
 246         """Constructor. Receives an optional downloader."""
 247         self._ready = False
 248         self.set_downloader(downloader)
 249
 250     @classmethod
 251     def suitable(cls, url):
 252         """Receives a URL and returns True if suitable for this IE."""
 253
 254         # This does not use has/getattr intentionally - we want to know whether
 255         # we have cached the regexp for *this* class, whereas getattr would also
 256         # match the superclass
 257         if '_VALID_URL_RE' not in cls.__dict__:
 258             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 259         return cls._VALID_URL_RE.match(url) is not None
 260
 261     @classmethod
 262     def _match_id(cls, url):
 263         if '_VALID_URL_RE' not in cls.__dict__:
 264             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 265         m = cls._VALID_URL_RE.match(url)
 266         assert m
 267         return m.group('id')
 268
 269     @classmethod
 270     def working(cls):
 271         """Getter method for _WORKING."""
 272         return cls._WORKING
 273
 274     def initialize(self):
 275         """Initializes an instance (authentication, etc)."""
 276         if not self._ready:
 277             self._real_initialize()
 278             self._ready = True
 279
 280     def extract(self, url):
 281         """Extracts URL information and returns it in list of dicts."""
 282         try:
 283             self.initialize()
 284             return self._real_extract(url)
 285         except ExtractorError:
 286             raise
 287         except compat_http_client.IncompleteRead as e:
 288             raise ExtractorError('A network error has occured.', cause=e, expected=True)
 289         except (KeyError, StopIteration) as e:
 290             raise ExtractorError('An extractor error has occured.', cause=e)
 291
 292     def set_downloader(self, downloader):
 293         """Sets the downloader for this IE."""
 294         self._downloader = downloader
 295
 296     def _real_initialize(self):
 297         """Real initialization process. Redefine in subclasses."""
 298         pass
 299
 300     def _real_extract(self, url):
 301         """Real extraction process. Redefine in subclasses."""
 302         pass
 303
 304     @classmethod
 305     def ie_key(cls):
 306         """A string for getting the InfoExtractor with get_info_extractor"""
 307         return cls.__name__[:-2]
 308
 309     @property
 310     def IE_NAME(self):
 311         return type(self).__name__[:-2]
 312
 313     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 314         """ Returns the response handle """
 315         if note is None:
 316             self.report_download_webpage(video_id)
 317         elif note is not False:
 318             if video_id is None:
 319                 self.to_screen('%s' % (note,))
 320             else:
 321                 self.to_screen('%s: %s' % (video_id, note))
 322         try:
 323             return self._downloader.urlopen(url_or_request)
 324         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 325             if errnote is False:
 326                 return False
 327             if errnote is None:
 328                 errnote = 'Unable to download webpage'
 329             errmsg = '%s: %s' % (errnote, compat_str(err))
 330             if fatal:
 331                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 332             else:
 333                 self._downloader.report_warning(errmsg)
 334                 return False
 335
 336     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 337         """ Returns a tuple (page content as string, URL handle) """
 338         # Strip hashes from the URL (#1038)
 339         if isinstance(url_or_request, (compat_str, str)):
 340             url_or_request = url_or_request.partition('#')[0]
 341
 342         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 343         if urlh is False:
 344             assert not fatal
 345             return False
 346         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 347         return (content, urlh)
 348
 349     @staticmethod
 350     def _guess_encoding_from_content(content_type, webpage_bytes):
 351         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 352         if m:
 353             encoding = m.group(1)
 354         else:
 355             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 356                           webpage_bytes[:1024])
 357             if m:
 358                 encoding = m.group(1).decode('ascii')
 359             elif webpage_bytes.startswith(b'\xff\xfe'):
 360                 encoding = 'utf-16'
 361             else:
 362                 encoding = 'utf-8'
 363
 364         return encoding
 365
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of urlh and return it decoded to a string.

        urlh:           response handle; its headers and read() are used.
        url_or_request: the URL string or request object that was opened;
                        only used to name the page in dumps.
        video_id:       used as the first part of the dump file name.
        prefix:         optional bytes prepended to the body that was read.
        encoding:       encoding to decode with; when empty it is guessed
                        with _guess_encoding_from_content().
        note, errnote and fatal are accepted but not used in this method.

        Depending on the downloader params the raw bytes are additionally
        printed base64-encoded ('dump_intermediate_pages') and/or written to
        a '.dump' file ('write_pages').

        Raises ExtractorError (expected) when the page looks like one of the
        known "access blocked" pages checked below.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            # A request object provides get_full_url(); anything without it is
            # used as the URL itself.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            # base64 keeps the raw bytes printable
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Cap the base name at 240 characters, replacing its tail with
                # an MD5 digest of the full name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # An encoding name unknown to Python raises LookupError; fall back to
        # UTF-8. Undecodable bytes are replaced rather than raising.
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect a Websense block page and report it as an expected error.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Detect the "URL you requested has been blocked" page and report it
        # as an expected error, including the page's own message if present.
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
 428
 429     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 430         """ Returns the data of the page as a string """
 431         success = False
 432         try_count = 0
 433         while success is False:
 434             try:
 435                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 436                 success = True
 437             except compat_http_client.IncompleteRead as e:
 438                 try_count += 1
 439                 if try_count >= tries:
 440                     raise e
 441                 self._sleep(timeout, video_id)
 442         if res is False:
 443             return res
 444         else:
 445             content, _ = res
 446             return content
 447
 448     def _download_xml(self, url_or_request, video_id,
 449                       note='Downloading XML', errnote='Unable to download XML',
 450                       transform_source=None, fatal=True, encoding=None):
 451         """Return the xml as an xml.etree.ElementTree.Element"""
 452         xml_string = self._download_webpage(
 453             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 454         if xml_string is False:
 455             return xml_string
 456         if transform_source:
 457             xml_string = transform_source(xml_string)
 458         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 459
 460     def _download_json(self, url_or_request, video_id,
 461                        note='Downloading JSON metadata',
 462                        errnote='Unable to download JSON metadata',
 463                        transform_source=None,
 464                        fatal=True, encoding=None):
 465         json_string = self._download_webpage(
 466             url_or_request, video_id, note, errnote, fatal=fatal,
 467             encoding=encoding)
 468         if (not fatal) and json_string is False:
 469             return None
 470         return self._parse_json(
 471             json_string, video_id, transform_source=transform_source, fatal=fatal)
 472
 473     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 474         if transform_source:
 475             json_string = transform_source(json_string)
 476         try:
 477             return json.loads(json_string)
 478         except ValueError as ve:
 479             errmsg = '%s: Failed to parse JSON ' % video_id
 480             if fatal:
 481                 raise ExtractorError(errmsg, cause=ve)
 482             else:
 483                 self.report_warning(errmsg + str(ve))
 484
 485     def report_warning(self, msg, video_id=None):
 486         idstr = '' if video_id is None else '%s: ' % video_id
 487         self._downloader.report_warning(
 488             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 489
 490     def to_screen(self, msg):
 491         """Print msg to screen, prefixing it with '[ie_name]'"""
 492         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 493
 494     def report_extraction(self, id_or_name):
 495         """Report information extraction."""
 496         self.to_screen('%s: Extracting information' % id_or_name)
 497
 498     def report_download_webpage(self, video_id):
 499         """Report webpage download."""
 500         self.to_screen('%s: Downloading webpage' % video_id)
 501
 502     def report_age_confirmation(self):
 503         """Report attempt to confirm age."""
 504         self.to_screen('Confirming age')
 505
 506     def report_login(self):
 507         """Report attempt to log in."""
 508         self.to_screen('Logging in')
 509
 510     # Methods for following #608
 511     @staticmethod
 512     def url_result(url, ie=None, video_id=None, video_title=None):
 513         """Returns a URL that points to a page that should be processed"""
 514         # TODO: ie should be the class used for getting the info
 515         video_info = {'_type': 'url',
 516                       'url': url,
 517                       'ie_key': ie}
 518         if video_id is not None:
 519             video_info['id'] = video_id
 520         if video_title is not None:
 521             video_info['title'] = video_title
 522         return video_info
 523
 524     @staticmethod
 525     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 526         """Returns a playlist"""
 527         video_info = {'_type': 'playlist',
 528                       'entries': entries}
 529         if playlist_id:
 530             video_info['id'] = playlist_id
 531         if playlist_title:
 532             video_info['title'] = playlist_title
 533         if playlist_description:
 534             video_info['description'] = playlist_description
 535         return video_info
 536
 537     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 538         """
 539         Perform a regex search on the given string, using a single or a list of
 540         patterns returning the first matching group.
 541         In case of failure return a default value or raise a WARNING or a
 542         RegexNotFoundError, depending on fatal, specifying the field name.
 543         """
 544         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 545             mobj = re.search(pattern, string, flags)
 546         else:
 547             for p in pattern:
 548                 mobj = re.search(p, string, flags)
 549                 if mobj:
 550                     break
 551
 552         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 553             _name = '\033[0;34m%s\033[0m' % name
 554         else:
 555             _name = name
 556
 557         if mobj:
 558             if group is None:
 559                 # return the first matching group
 560                 return next(g for g in mobj.groups() if g is not None)
 561             else:
 562                 return mobj.group(group)
 563         elif default is not NO_DEFAULT:
 564             return default
 565         elif fatal:
 566             raise RegexNotFoundError('Unable to extract %s' % _name)
 567         else:
 568             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 569             return None
 570
 571     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 572         """
 573         Like _search_regex, but strips HTML tags and unescapes entities.
 574         """
 575         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 576         if res:
 577             return clean_html(res).strip()
 578         else:
 579             return res
 580
 581     def _get_login_info(self):
 582         """
 583         Get the login info as (username, password)
 584         It will look in the netrc file using the _NETRC_MACHINE value
 585         If there's no info available, return (None, None)
 586         """
 587         if self._downloader is None:
 588             return (None, None)
 589
 590         username = None
 591         password = None
 592         downloader_params = self._downloader.params
 593
 594         # Attempt to use provided username and password or .netrc data
 595         if downloader_params.get('username', None) is not None:
 596             username = downloader_params['username']
 597             password = downloader_params['password']
 598         elif downloader_params.get('usenetrc', False):
 599             try:
 600                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 601                 if info is not None:
 602                     username = info[0]
 603                     password = info[2]
 604                 else:
 605                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 606             except (IOError, netrc.NetrcParseError) as err:
 607                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 608
 609         return (username, password)
 610
 611     def _get_tfa_info(self):
 612         """
 613         Get the two-factor authentication info
 614         TODO - asking the user will be required for sms/phone verify
 615         currently just uses the command line option
 616         If there's no info available, return None
 617         """
 618         if self._downloader is None:
 619             return None
 620         downloader_params = self._downloader.params
 621
 622         if downloader_params.get('twofactor', None) is not None:
 623             return downloader_params['twofactor']
 624
 625         return None
 626
 627     # Helper functions for extracting OpenGraph info
 628     @staticmethod
 629     def _og_regexes(prop):
 630         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 631         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 632         template = r'<meta[^>]+?%s[^>]+?%s'
 633         return [
 634             template % (property_re, content_re),
 635             template % (content_re, property_re),
 636         ]
 637
 638     @staticmethod
 639     def _meta_regex(prop):
 640         return r'''(?isx)<meta
 641                     (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1)
 642                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 643
 644     def _og_search_property(self, prop, html, name=None, **kargs):
 645         if name is None:
 646             name = 'OpenGraph %s' % prop
 647         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 648         if escaped is None:
 649             return None
 650         return unescapeHTML(escaped)
 651
 652     def _og_search_thumbnail(self, html, **kargs):
 653         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 654
 655     def _og_search_description(self, html, **kargs):
 656         return self._og_search_property('description', html, fatal=False, **kargs)
 657
 658     def _og_search_title(self, html, **kargs):
 659         return self._og_search_property('title', html, **kargs)
 660
 661     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 662         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 663         if secure:
 664             regexes = self._og_regexes('video:secure_url') + regexes
 665         return self._html_search_regex(regexes, html, name, **kargs)
 666
 667     def _og_search_url(self, html, **kargs):
 668         return self._og_search_property('url', html, **kargs)
 669
 670     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 671         if display_name is None:
 672             display_name = name
 673         return self._html_search_regex(
 674             self._meta_regex(name),
 675             html, display_name, fatal=fatal, group='content', **kwargs)
 676
 677     def _dc_search_uploader(self, html):
 678         return self._html_search_meta('dc.creator', html, 'uploader')
 679
 680     def _rta_search(self, html):
 681         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 682         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 683                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 684                      html):
 685             return 18
 686         return 0
 687
 688     def _media_rating_search(self, html):
 689         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 690         rating = self._html_search_meta('rating', html)
 691
 692         if not rating:
 693             return None
 694
 695         RATING_TABLE = {
 696             'safe for kids': 0,
 697             'general': 8,
 698             '14 years': 14,
 699             'mature': 17,
 700             'restricted': 19,
 701         }
 702         return RATING_TABLE.get(rating.lower(), None)
 703
 704     def _family_friendly_search(self, html):
 705         # See http://schema.org/VideoObject
 706         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 707
 708         if not family_friendly:
 709             return None
 710
 711         RATING_TABLE = {
 712             '1': 0,
 713             'true': 0,
 714             '0': 18,
 715             'false': 18,
 716         }
 717         return RATING_TABLE.get(family_friendly.lower(), None)
 718
 719     def _twitter_search_player(self, html):
 720         return self._html_search_meta('twitter:player', html,
 721                                       'twitter card player')
 722
 723     @staticmethod
 724     def _hidden_inputs(html):
 725         return dict([
 726             (input.group('name'), input.group('value')) for input in re.finditer(
 727                 r'''(?x)
 728                     <input\s+
 729                         type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
 730                         name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
 731                         (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
 732                         value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
 733                 ''', html)
 734         ])
 735
 736     def _form_hidden_inputs(self, form_id, html):
 737         form = self._search_regex(
 738             r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 739             html, '%s form' % form_id, group='form')
 740         return self._hidden_inputs(form)
 741
    def _sort_formats(self, formats, field_preference=None):
        """Sort the list of format dicts in place, ascending by preference key.

        Raises ExtractorError if formats is empty or None.

        If field_preference is a list or tuple of field names, formats are
        ordered by exactly those fields, in that order (a missing or None
        value counts as -1). Otherwise the default key tuple built at the end
        of _formats_key is used.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            # Side effect: a format lacking 'ext' gets one derived from its URL.
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                # No explicit preference: derive one from the protocol (falling
                # back to the URL scheme), slightly penalizing non-HTTP(S).
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # The index in ORDER is the extension's rank (a higher index sorts
            # later); extensions not listed rank -1. Only one of the two ranks
            # is computed per format, the other is fixed at 0.
            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Fields are compared left to right; None is replaced by -1 (or ''
            # for format_id) so that the tuples stay comparable.
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 804
 805     def _check_formats(self, formats, video_id):
 806         if formats:
 807             formats[:] = filter(
 808                 lambda f: self._is_valid_url(
 809                     f['url'], video_id,
 810                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 811                 formats)
 812
 813     def _is_valid_url(self, url, video_id, item='video'):
 814         url = self._proto_relative_url(url, scheme='http:')
 815         # For now assume non HTTP(S) URLs always valid
 816         if not (url.startswith('http://') or url.startswith('https://')):
 817             return True
 818         try:
 819             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 820             return True
 821         except ExtractorError as e:
 822             if isinstance(e.cause, compat_HTTPError):
 823                 self.to_screen(
 824                     '%s: %s URL is invalid, skipping' % (video_id, item))
 825                 return False
 826             raise
 827
 828     def http_scheme(self):
 829         """ Either "http:" or "https:", depending on the user's preferences """
 830         return (
 831             'http:'
 832             if self._downloader.params.get('prefer_insecure', False)
 833             else 'https:')
 834
 835     def _proto_relative_url(self, url, scheme=None):
 836         if url is None:
 837             return url
 838         if url.startswith('//'):
 839             if scheme is None:
 840                 scheme = self.http_scheme()
 841             return scheme + url
 842         else:
 843             return url
 844
 845     def _sleep(self, timeout, video_id, msg_template=None):
 846         if msg_template is None:
 847             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 848         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 849         self.to_screen(msg)
 850         time.sleep(timeout)
 851
 852     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 853                              transform_source=lambda s: fix_xml_ampersands(s).strip()):
 854         manifest = self._download_xml(
 855             manifest_url, video_id, 'Downloading f4m manifest',
 856             'Unable to download f4m manifest',
 857             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 858             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 859             transform_source=transform_source)
 860
 861         formats = []
 862         manifest_version = '1.0'
 863         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 864         if not media_nodes:
 865             manifest_version = '2.0'
 866             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 867         for i, media_el in enumerate(media_nodes):
 868             if manifest_version == '2.0':
 869                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 870                 if not media_url:
 871                     continue
 872                 manifest_url = (
 873                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 874                     else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
 875                 # If media_url is itself a f4m manifest do the recursive extraction
 876                 # since bitrates in parent manifest (this one) and media_url manifest
 877                 # may differ leading to inability to resolve the format by requested
 878                 # bitrate in f4m downloader
 879                 if determine_ext(manifest_url) == 'f4m':
 880                     formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
 881                     continue
 882             tbr = int_or_none(media_el.attrib.get('bitrate'))
 883             formats.append({
 884                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 885                 'url': manifest_url,
 886                 'ext': 'flv',
 887                 'tbr': tbr,
 888                 'width': int_or_none(media_el.attrib.get('width')),
 889                 'height': int_or_none(media_el.attrib.get('height')),
 890                 'preference': preference,
 891             })
 892         self._sort_formats(formats)
 893
 894         return formats
 895
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        """Download an m3u8 playlist and return the formats it lists, sorted.

        m3u8_url -- URL of the playlist; relative entries are joined to it.
        video_id -- id used when reporting the download.
        ext -- value stored as 'ext' in every format.
        entry_protocol -- value stored as 'protocol' in the listed formats.
        preference -- value stored as 'preference' in the listed formats.
        m3u8_id -- optional prefix of the generated format ids.
        note, errnote -- override the default download messages.
        fatal -- forwarded to _download_webpage; if the download yields
            False, False is returned instead of a list.

        Besides the listed entries, the result always contains a 'meta'
        format pointing at m3u8_url itself.
        """

        # The playlist URL itself, ranked one below the given preference
        # (or -1 when no preference is given).
        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # Absolute http(s) URLs are kept, anything else is resolved
        # relative to the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if m3u8_doc is False:
            return m3u8_doc
        # Attributes of the most recent #EXT-X-STREAM-INF / #EXT-X-MEDIA tag;
        # they describe the next URI line.
        last_info = None
        last_media = None
        # KEY=value pairs; a value is either double-quoted or a bare token
        # containing neither quotes nor commas.
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags, comments and blank lines carry no format data.
                continue
            else:
                # A URI line.
                if last_info is None:
                    # No #EXT-X-STREAM-INF seen so far: only the URL is known.
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # Prefer the NAME of a preceding non-subtitle #EXT-X-MEDIA tag,
                # then the bitrate, then the running count of formats.
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                if last_media is not None:
                    # A media tag is attached to one format only.
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                # Reset to an empty dict (not None), so later URI lines still
                # take this branch even without their own #EXT-X-STREAM-INF.
                last_info = {}
        self._sort_formats(formats)
        return formats
 982
 983     @staticmethod
 984     def _xpath_ns(path, namespace=None):
 985         if not namespace:
 986             return path
 987         out = []
 988         for c in path.split('/'):
 989             if not c or c == '.':
 990                 out.append(c)
 991             else:
 992                 out.append('{%s}%s' % (namespace, c))
 993         return '/'.join(out)
 994
 995     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
 996         smil = self._download_smil(smil_url, video_id, fatal=fatal)
 997
 998         if smil is False:
 999             assert not fatal
1000             return []
1001
1002         namespace = self._parse_smil_namespace(smil)
1003
1004         return self._parse_smil_formats(
1005             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1006
1007     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1008         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1009         if smil is False:
1010             return {}
1011         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1012
1013     def _download_smil(self, smil_url, video_id, fatal=True):
1014         return self._download_xml(
1015             smil_url, video_id, 'Downloading SMIL file',
1016             'Unable to download SMIL file', fatal=fatal)
1017
1018     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1019         namespace = self._parse_smil_namespace(smil)
1020
1021         formats = self._parse_smil_formats(
1022             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1023         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1024
1025         video_id = os.path.splitext(url_basename(smil_url))[0]
1026         title = None
1027         description = None
1028         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1029             name = meta.attrib.get('name')
1030             content = meta.attrib.get('content')
1031             if not name or not content:
1032                 continue
1033             if not title and name == 'title':
1034                 title = content
1035             elif not description and name in ('description', 'abstract'):
1036                 description = content
1037
1038         return {
1039             'id': video_id,
1040             'title': title or video_id,
1041             'description': description,
1042             'formats': formats,
1043             'subtitles': subtitles,
1044         }
1045
1046     def _parse_smil_namespace(self, smil):
1047         return self._search_regex(
1048             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1049
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
        """Return the sorted formats described by the <video> nodes of a
        parsed SMIL document.

        smil -- root element of the SMIL document.
        smil_url -- URL of the SMIL file; base for relative sources unless
            a <head>/<meta> node supplies 'base' or 'httpBase'.
        video_id -- id passed on to the m3u8/f4m extraction helpers.
        namespace -- XML namespace of the document, if any.
        f4m_params -- query parameters appended to f4m manifest URLs;
            defaults are filled in when none are given.

        The result goes through _sort_formats, which raises ExtractorError
        when no format was found.
        """
        base = smil_url
        # The first meta node carrying a base URL wins.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        # Running counters, used in format ids when no bitrate is declared.
        rtmp_count = 0
        http_count = 0

        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            if not src:
                continue

            # NOTE(review): the second positional argument is presumably the
            # scale (cf. scale=1000 used for BANDWIDTH in m3u8) - confirm.
            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base

            # RTMP: the streamer is the URL and src the play path.
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)

            # HLS: delegate to the m3u8 extraction.
            if proto == 'm3u8' or src_ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
                continue

            # HDS: delegate to the f4m extraction.
            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # Once set, these defaults are reused for later videos.
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
                continue

            # Plain HTTP(S) download; any other kind of source is ignored.
            if src_url.startswith('http'):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        self._sort_formats(formats)

        return formats
1126
1127     def _parse_smil_subtitles(self, smil, namespace=None):
1128         subtitles = {}
1129         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1130             src = textstream.get('src')
1131             if not src:
1132                 continue
1133             ext = textstream.get('ext') or determine_ext(src)
1134             if not ext:
1135                 type_ = textstream.get('type')
1136                 if type_ == 'text/srt':
1137                     ext = 'srt'
1138             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
1139             subtitles.setdefault(lang, []).append({
1140                 'url': src,
1141                 'ext': ext,
1142             })
1143         return subtitles
1144
1145     def _live_title(self, name):
1146         """ Generate the title for a live video """
1147         now = datetime.datetime.now()
1148         now_str = now.strftime("%Y-%m-%d %H:%M")
1149         return name + ' ' + now_str
1150
1151     def _int(self, v, name, fatal=False, **kwargs):
1152         res = int_or_none(v, **kwargs)
1153         if 'get_attr' in kwargs:
1154             print(getattr(v, kwargs['get_attr']))
1155         if res is None:
1156             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1157             if fatal:
1158                 raise ExtractorError(msg)
1159             else:
1160                 self._downloader.report_warning(msg)
1161         return res
1162
1163     def _float(self, v, name, fatal=False, **kwargs):
1164         res = float_or_none(v, **kwargs)
1165         if res is None:
1166             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1167             if fatal:
1168                 raise ExtractorError(msg)
1169             else:
1170                 self._downloader.report_warning(msg)
1171         return res
1172
1173     def _set_cookie(self, domain, name, value, expire_time=None):
1174         cookie = compat_cookiejar.Cookie(
1175             0, name, value, None, None, domain, None,
1176             None, '/', True, False, expire_time, '', None, None, None)
1177         self._downloader.cookiejar.set_cookie(cookie)
1178
1179     def _get_cookies(self, url):
1180         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1181         req = compat_urllib_request.Request(url)
1182         self._downloader.cookiejar.add_cookie_header(req)
1183         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1184
1185     def get_testcases(self, include_onlymatching=False):
1186         t = getattr(self, '_TEST', None)
1187         if t:
1188             assert not hasattr(self, '_TESTS'), \
1189                 '%s has _TEST and _TESTS' % type(self).__name__
1190             tests = [t]
1191         else:
1192             tests = getattr(self, '_TESTS', [])
1193         for t in tests:
1194             if not include_onlymatching and t.get('only_matching', False):
1195                 continue
1196             t['name'] = type(self).__name__[:-len('IE')]
1197             yield t
1198
1199     def is_suitable(self, age_limit):
1200         """ Test whether the extractor is generally suitable for the given
1201         age limit (i.e. pornographic sites are not, all others usually are) """
1202
1203         any_restricted = False
1204         for tc in self.get_testcases(include_onlymatching=False):
1205             if 'playlist' in tc:
1206                 tc = tc['playlist'][0]
1207             is_restricted = age_restricted(
1208                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1209             if not is_restricted:
1210                 return True
1211             any_restricted = any_restricted or is_restricted
1212         return not any_restricted
1213
1214     def extract_subtitles(self, *args, **kwargs):
1215         if (self._downloader.params.get('writesubtitles', False) or
1216                 self._downloader.params.get('listsubtitles')):
1217             return self._get_subtitles(*args, **kwargs)
1218         return {}
1219
1220     def _get_subtitles(self, *args, **kwargs):
1221         raise NotImplementedError("This method must be implemented by subclasses")
1222
1223     def extract_automatic_captions(self, *args, **kwargs):
1224         if (self._downloader.params.get('writeautomaticsub', False) or
1225                 self._downloader.params.get('listsubtitles')):
1226             return self._get_automatic_captions(*args, **kwargs)
1227         return {}
1228
1229     def _get_automatic_captions(self, *args, **kwargs):
1230         raise NotImplementedError("This method must be implemented by subclasses")
1231
1232
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors of paged search queries.

    Accepted URLs have the form _SEARCH_KEY(|all|[0-9]):{query}.
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and have to
    implement _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex a search URL for this extractor has to match."""
        url_template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return url_template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query addressed to this extractor."""
        return bool(re.match(cls._make_valid_url(), url))

    def _real_extract(self, query):
        """Run the search described by query.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS and a
        number for that many results, capped at _MAX_RESULTS with a warning.
        Raises ExtractorError for a query that does not match the expected
        form.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        wanted = int(prefix)
        if wanted <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
        if wanted > self._MAX_RESULTS:
            self._downloader.report_warning(
                '%s returns max %i results (you requested %i)'
                % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = "This method must be implemented by subclasses"
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY."""
        return self._SEARCH_KEY