_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 import base64
   2 import hashlib
   3 import json
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import sys
   9 import time
  10 import xml.etree.ElementTree
  11
  12 from ..utils import (
  13     compat_http_client,
  14     compat_urllib_error,
  15     compat_urllib_parse_urlparse,
  16     compat_str,
  17
  18     clean_html,
  19     compiled_regex_type,
  20     ExtractorError,
  21     int_or_none,
  22     RegexNotFoundError,
  23     sanitize_filename,
  24     unescapeHTML,
  25 )
  26 _NO_DEFAULT = object()
  27
  28
  29 class InfoExtractor(object):
  30     """Information Extractor class.
  31
  32     Information extractors are the classes that, given a URL, extract
  33     information about the video (or videos) the URL refers to. This
  34     information includes the real video URL, the video title, author and
  35     others. The information is stored in a dictionary which is then
  36     passed to the FileDownloader. The FileDownloader processes this
  37     information possibly downloading the video to the file system, among
  38     other possible outcomes.
  39
  40     The dictionaries must include the following fields:
  41
  42     id:             Video identifier.
  43     title:          Video title, unescaped.
  44
  45     Additionally, it must contain either a formats entry or a url one:
  46
  47     formats:        A list of dictionaries for each format available, ordered
  48                     from worst to best quality.
  49
  50                     Potential fields:
  51                     * url        Mandatory. The URL of the video file
  52                     * ext        Will be calculated from url if missing
  53                     * format     A human-readable description of the format
  54                                  ("mp4 container with h264/opus").
  55                                  Calculated from the format_id, width, height,
  56                                  and format_note fields if missing.
  57                     * format_id  A short description of the format
  58                                  ("mp4_h264_opus" or "19").
  59                                 Technically optional, but strongly recommended.
  60                     * format_note Additional info about the format
  61                                  ("3D" or "DASH video")
  62                     * width      Width of the video, if known
  63                     * height     Height of the video, if known
  64                     * resolution Textual description of width and height
  65                     * tbr        Average bitrate of audio and video in KBit/s
  66                     * abr        Average audio bitrate in KBit/s
  67                     * acodec     Name of the audio codec in use
  68                     * asr        Audio sampling rate in Hertz
  69                     * vbr        Average video bitrate in KBit/s
  70                     * vcodec     Name of the video codec in use
  71                     * container  Name of the container format
  72                     * filesize   The number of bytes, if known in advance
  73                     * filesize_approx  An estimate for the number of bytes
  74                     * player_url SWF Player URL (used for rtmpdump).
  75                     * protocol   The protocol that will be used for the actual
  76                                  download, lower-case.
  77                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  78                     * preference Order number of this format. If this field is
  79                                  present and not None, the formats get sorted
  80                                  by this field, regardless of all other values.
  81                                  -1 for default (order by other properties),
  82                                  -2 or smaller for less than default.
  83                     * quality    Order number of the video quality of this
  84                                  format, irrespective of the file format.
  85                                  -1 for default (order by other properties),
  86                                  -2 or smaller for less than default.
  87     url:            Final video URL.
  88     ext:            Video filename extension.
  89     format:         The video format, defaults to ext (used for --get-format)
  90     player_url:     SWF Player URL (used for rtmpdump).
  91
  92     The following fields are optional:
  93
  94     display_id      An alternative identifier for the video, not necessarily
  95                     unique, but available before title. Typically, id is
  96                     something like "4234987", title "Dancing naked mole rats",
  97                     and display_id "dancing-naked-mole-rats"
  98     thumbnails:     A list of dictionaries, with the following entries:
  99                         * "url"
 100                         * "width" (optional, int)
 101                         * "height" (optional, int)
 102                         * "resolution" (optional, string "{width}x{height}",
 103                                         deprecated)
 104     thumbnail:      Full URL to a video thumbnail image.
 105     description:    One-line video description.
 106     uploader:       Full name of the video uploader.
 107     timestamp:      UNIX timestamp of the moment the video became available.
 108     upload_date:    Video upload date (YYYYMMDD).
 109                     If not explicitly set, calculated from timestamp.
 110     uploader_id:    Nickname or id of the video uploader.
 111     location:       Physical location of the video.
 112     subtitles:      The subtitle file contents as a dictionary in the format
 113                     {language: subtitles}.
 114     duration:       Length of the video in seconds, as an integer.
 115     view_count:     How many users have watched the video on the platform.
 116     like_count:     Number of positive ratings of the video
 117     dislike_count:  Number of negative ratings of the video
 118     comment_count:  Number of comments on the video
 119     age_limit:      Age restriction for the video, as an integer (years)
 120     webpage_url:    The url to the video webpage, if given to youtube-dl it
 121                     should allow to get the same result again. (It will be set
 122                     by YoutubeDL if it's missing)
 123     categories:     A list of categories that the video falls in, for example
 124                     ["Sports", "Berlin"]
 125
 126     Unless mentioned otherwise, the fields should be Unicode strings.
 127
 128     Subclasses of this one should re-define the _real_initialize() and
 129     _real_extract() methods and define a _VALID_URL regexp.
 130     Probably, they should also be added to the list of extractors.
 131
 132     Finally, the _WORKING attribute should be set to False for broken IEs
 133     in order to warn the users and skip the tests.
 134     """
 135
 136     _ready = False
 137     _downloader = None
 138     _WORKING = True
 139
 140     def __init__(self, downloader=None):
 141         """Constructor. Receives an optional downloader."""
 142         self._ready = False
 143         self.set_downloader(downloader)
 144
 145     @classmethod
 146     def suitable(cls, url):
 147         """Receives a URL and returns True if suitable for this IE."""
 148
 149         # This does not use has/getattr intentionally - we want to know whether
 150         # we have cached the regexp for *this* class, whereas getattr would also
 151         # match the superclass
 152         if '_VALID_URL_RE' not in cls.__dict__:
 153             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 154         return cls._VALID_URL_RE.match(url) is not None
 155
 156     @classmethod
 157     def working(cls):
 158         """Getter method for _WORKING."""
 159         return cls._WORKING
 160
 161     def initialize(self):
 162         """Initializes an instance (authentication, etc)."""
 163         if not self._ready:
 164             self._real_initialize()
 165             self._ready = True
 166
 167     def extract(self, url):
 168         """Extracts URL information and returns it in list of dicts."""
 169         self.initialize()
 170         return self._real_extract(url)
 171
 172     def set_downloader(self, downloader):
 173         """Sets the downloader for this IE."""
 174         self._downloader = downloader
 175
 176     def _real_initialize(self):
 177         """Real initialization process. Redefine in subclasses."""
 178         pass
 179
 180     def _real_extract(self, url):
 181         """Real extraction process. Redefine in subclasses."""
 182         pass
 183
 184     @classmethod
 185     def ie_key(cls):
 186         """A string for getting the InfoExtractor with get_info_extractor"""
 187         return cls.__name__[:-2]
 188
 189     @property
 190     def IE_NAME(self):
 191         return type(self).__name__[:-2]
 192
 193     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 194         """ Returns the response handle """
 195         if note is None:
 196             self.report_download_webpage(video_id)
 197         elif note is not False:
 198             if video_id is None:
 199                 self.to_screen(u'%s' % (note,))
 200             else:
 201                 self.to_screen(u'%s: %s' % (video_id, note))
 202         try:
 203             return self._downloader.urlopen(url_or_request)
 204         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 205             if errnote is False:
 206                 return False
 207             if errnote is None:
 208                 errnote = u'Unable to download webpage'
 209             errmsg = u'%s: %s' % (errnote, compat_str(err))
 210             if fatal:
 211                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 212             else:
 213                 self._downloader.report_warning(errmsg)
 214                 return False
 215
 216     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 217         """ Returns a tuple (page content as string, URL handle) """
 218
 219         # Strip hashes from the URL (#1038)
 220         if isinstance(url_or_request, (compat_str, str)):
 221             url_or_request = url_or_request.partition('#')[0]
 222
 223         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 224         if urlh is False:
 225             assert not fatal
 226             return False
 227         content_type = urlh.headers.get('Content-Type', '')
 228         webpage_bytes = urlh.read()
 229         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 230         if m:
 231             encoding = m.group(1)
 232         else:
 233             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 234                           webpage_bytes[:1024])
 235             if m:
 236                 encoding = m.group(1).decode('ascii')
 237             elif webpage_bytes.startswith(b'\xff\xfe'):
 238                 encoding = 'utf-16'
 239             else:
 240                 encoding = 'utf-8'
 241         if self._downloader.params.get('dump_intermediate_pages', False):
 242             try:
 243                 url = url_or_request.get_full_url()
 244             except AttributeError:
 245                 url = url_or_request
 246             self.to_screen(u'Dumping request to ' + url)
 247             dump = base64.b64encode(webpage_bytes).decode('ascii')
 248             self._downloader.to_screen(dump)
 249         if self._downloader.params.get('write_pages', False):
 250             try:
 251                 url = url_or_request.get_full_url()
 252             except AttributeError:
 253                 url = url_or_request
 254             basen = '%s_%s' % (video_id, url)
 255             if len(basen) > 240:
 256                 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 257                 basen = basen[:240 - len(h)] + h
 258             raw_filename = basen + '.dump'
 259             filename = sanitize_filename(raw_filename, restricted=True)
 260             self.to_screen(u'Saving request to ' + filename)
 261             with open(filename, 'wb') as outf:
 262                 outf.write(webpage_bytes)
 263
 264         try:
 265             content = webpage_bytes.decode(encoding, 'replace')
 266         except LookupError:
 267             content = webpage_bytes.decode('utf-8', 'replace')
 268
 269         if (u'<title>Access to this site is blocked</title>' in content and
 270                 u'Websense' in content[:512]):
 271             msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
 272             blocked_iframe = self._html_search_regex(
 273                 r'<iframe src="([^"]+)"', content,
 274                 u'Websense information URL', default=None)
 275             if blocked_iframe:
 276                 msg += u' Visit %s for more details' % blocked_iframe
 277             raise ExtractorError(msg, expected=True)
 278
 279         return (content, urlh)
 280
 281     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 282         """ Returns the data of the page as a string """
 283         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 284         if res is False:
 285             return res
 286         else:
 287             content, _ = res
 288             return content
 289
 290     def _download_xml(self, url_or_request, video_id,
 291                       note=u'Downloading XML', errnote=u'Unable to download XML',
 292                       transform_source=None, fatal=True):
 293         """Return the xml as an xml.etree.ElementTree.Element"""
 294         xml_string = self._download_webpage(
 295             url_or_request, video_id, note, errnote, fatal=fatal)
 296         if xml_string is False:
 297             return xml_string
 298         if transform_source:
 299             xml_string = transform_source(xml_string)
 300         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 301
 302     def _download_json(self, url_or_request, video_id,
 303                        note=u'Downloading JSON metadata',
 304                        errnote=u'Unable to download JSON metadata',
 305                        transform_source=None,
 306                        fatal=True):
 307         json_string = self._download_webpage(
 308             url_or_request, video_id, note, errnote, fatal=fatal)
 309         if (not fatal) and json_string is False:
 310             return None
 311         if transform_source:
 312             json_string = transform_source(json_string)
 313         try:
 314             return json.loads(json_string)
 315         except ValueError as ve:
 316             raise ExtractorError('Failed to download JSON', cause=ve)
 317
 318     def report_warning(self, msg, video_id=None):
 319         idstr = u'' if video_id is None else u'%s: ' % video_id
 320         self._downloader.report_warning(
 321             u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
 322
 323     def to_screen(self, msg):
 324         """Print msg to screen, prefixing it with '[ie_name]'"""
 325         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 326
 327     def report_extraction(self, id_or_name):
 328         """Report information extraction."""
 329         self.to_screen(u'%s: Extracting information' % id_or_name)
 330
 331     def report_download_webpage(self, video_id):
 332         """Report webpage download."""
 333         self.to_screen(u'%s: Downloading webpage' % video_id)
 334
 335     def report_age_confirmation(self):
 336         """Report attempt to confirm age."""
 337         self.to_screen(u'Confirming age')
 338
 339     def report_login(self):
 340         """Report attempt to log in."""
 341         self.to_screen(u'Logging in')
 342
 343     #Methods for following #608
 344     @staticmethod
 345     def url_result(url, ie=None, video_id=None):
 346         """Returns a url that points to a page that should be processed"""
 347         #TODO: ie should be the class used for getting the info
 348         video_info = {'_type': 'url',
 349                       'url': url,
 350                       'ie_key': ie}
 351         if video_id is not None:
 352             video_info['id'] = video_id
 353         return video_info
 354     @staticmethod
 355     def playlist_result(entries, playlist_id=None, playlist_title=None):
 356         """Returns a playlist"""
 357         video_info = {'_type': 'playlist',
 358                       'entries': entries}
 359         if playlist_id:
 360             video_info['id'] = playlist_id
 361         if playlist_title:
 362             video_info['title'] = playlist_title
 363         return video_info
 364
 365     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 366         """
 367         Perform a regex search on the given string, using a single or a list of
 368         patterns returning the first matching group.
 369         In case of failure return a default value or raise a WARNING or a
 370         RegexNotFoundError, depending on fatal, specifying the field name.
 371         """
 372         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 373             mobj = re.search(pattern, string, flags)
 374         else:
 375             for p in pattern:
 376                 mobj = re.search(p, string, flags)
 377                 if mobj:
 378                     break
 379
 380         if os.name != 'nt' and sys.stderr.isatty():
 381             _name = u'\033[0;34m%s\033[0m' % name
 382         else:
 383             _name = name
 384
 385         if mobj:
 386             # return the first matching group
 387             return next(g for g in mobj.groups() if g is not None)
 388         elif default is not _NO_DEFAULT:
 389             return default
 390         elif fatal:
 391             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 392         else:
 393             self._downloader.report_warning(u'unable to extract %s; '
 394                 u'please report this issue on http://yt-dl.org/bug' % _name)
 395             return None
 396
 397     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 398         """
 399         Like _search_regex, but strips HTML tags and unescapes entities.
 400         """
 401         res = self._search_regex(pattern, string, name, default, fatal, flags)
 402         if res:
 403             return clean_html(res).strip()
 404         else:
 405             return res
 406
 407     def _get_login_info(self):
 408         """
 409         Get the the login info as (username, password)
 410         It will look in the netrc file using the _NETRC_MACHINE value
 411         If there's no info available, return (None, None)
 412         """
 413         if self._downloader is None:
 414             return (None, None)
 415
 416         username = None
 417         password = None
 418         downloader_params = self._downloader.params
 419
 420         # Attempt to use provided username and password or .netrc data
 421         if downloader_params.get('username', None) is not None:
 422             username = downloader_params['username']
 423             password = downloader_params['password']
 424         elif downloader_params.get('usenetrc', False):
 425             try:
 426                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 427                 if info is not None:
 428                     username = info[0]
 429                     password = info[2]
 430                 else:
 431                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 432             except (IOError, netrc.NetrcParseError) as err:
 433                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 434
 435         return (username, password)
 436
 437     def _get_tfa_info(self):
 438         """
 439         Get the two-factor authentication info
 440         TODO - asking the user will be required for sms/phone verify
 441         currently just uses the command line option
 442         If there's no info available, return None
 443         """
 444         if self._downloader is None:
 445             self.to_screen("no downloader")
 446             return None
 447         downloader_params = self._downloader.params
 448
 449         if downloader_params.get('twofactor', None) is not None:
 450             return downloader_params['twofactor']
 451
 452         self.to_screen("param is None")
 453         return None
 454
 455     # Helper functions for extracting OpenGraph info
 456     @staticmethod
 457     def _og_regexes(prop):
 458         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 459         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 460         template = r'<meta[^>]+?%s[^>]+?%s'
 461         return [
 462             template % (property_re, content_re),
 463             template % (content_re, property_re),
 464         ]
 465
 466     def _og_search_property(self, prop, html, name=None, **kargs):
 467         if name is None:
 468             name = 'OpenGraph %s' % prop
 469         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 470         if escaped is None:
 471             return None
 472         return unescapeHTML(escaped)
 473
 474     def _og_search_thumbnail(self, html, **kargs):
 475         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 476
 477     def _og_search_description(self, html, **kargs):
 478         return self._og_search_property('description', html, fatal=False, **kargs)
 479
 480     def _og_search_title(self, html, **kargs):
 481         return self._og_search_property('title', html, **kargs)
 482
 483     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 484         regexes = self._og_regexes('video')
 485         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 486         return self._html_search_regex(regexes, html, name, **kargs)
 487
 488     def _og_search_url(self, html, **kargs):
 489         return self._og_search_property('url', html, **kargs)
 490
 491     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 492         if display_name is None:
 493             display_name = name
 494         return self._html_search_regex(
 495             r'''(?ix)<meta
 496                     (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
 497                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 498             html, display_name, fatal=fatal, **kwargs)
 499
 500     def _dc_search_uploader(self, html):
 501         return self._html_search_meta('dc.creator', html, 'uploader')
 502
 503     def _rta_search(self, html):
 504         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 505         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 506                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 507                      html):
 508             return 18
 509         return 0
 510
 511     def _media_rating_search(self, html):
 512         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 513         rating = self._html_search_meta('rating', html)
 514
 515         if not rating:
 516             return None
 517
 518         RATING_TABLE = {
 519             'safe for kids': 0,
 520             'general': 8,
 521             '14 years': 14,
 522             'mature': 17,
 523             'restricted': 19,
 524         }
 525         return RATING_TABLE.get(rating.lower(), None)
 526
 527     def _twitter_search_player(self, html):
 528         return self._html_search_meta('twitter:player', html,
 529             'twitter card player')
 530
 531     def _sort_formats(self, formats):
 532         if not formats:
 533             raise ExtractorError(u'No video formats found')
 534
 535         def _formats_key(f):
 536             # TODO remove the following workaround
 537             from ..utils import determine_ext
 538             if not f.get('ext') and 'url' in f:
 539                 f['ext'] = determine_ext(f['url'])
 540
 541             preference = f.get('preference')
 542             if preference is None:
 543                 proto = f.get('protocol')
 544                 if proto is None:
 545                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 546
 547                 preference = 0 if proto in ['http', 'https'] else -0.1
 548                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 549                     preference -= 0.5
 550
 551             if f.get('vcodec') == 'none':  # audio only
 552                 if self._downloader.params.get('prefer_free_formats'):
 553                     ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
 554                 else:
 555                     ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
 556                 ext_preference = 0
 557                 try:
 558                     audio_ext_preference = ORDER.index(f['ext'])
 559                 except ValueError:
 560                     audio_ext_preference = -1
 561             else:
 562                 if self._downloader.params.get('prefer_free_formats'):
 563                     ORDER = [u'flv', u'mp4', u'webm']
 564                 else:
 565                     ORDER = [u'webm', u'flv', u'mp4']
 566                 try:
 567                     ext_preference = ORDER.index(f['ext'])
 568                 except ValueError:
 569                     ext_preference = -1
 570                 audio_ext_preference = 0
 571
 572             return (
 573                 preference,
 574                 f.get('quality') if f.get('quality') is not None else -1,
 575                 f.get('height') if f.get('height') is not None else -1,
 576                 f.get('width') if f.get('width') is not None else -1,
 577                 ext_preference,
 578                 f.get('tbr') if f.get('tbr') is not None else -1,
 579                 f.get('vbr') if f.get('vbr') is not None else -1,
 580                 f.get('abr') if f.get('abr') is not None else -1,
 581                 audio_ext_preference,
 582                 f.get('filesize') if f.get('filesize') is not None else -1,
 583                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 584                 f.get('format_id'),
 585             )
 586         formats.sort(key=_formats_key)
 587
 588     def http_scheme(self):
 589         """ Either "https:" or "https:", depending on the user's preferences """
 590         return (
 591             'http:'
 592             if self._downloader.params.get('prefer_insecure', False)
 593             else 'https:')
 594
 595     def _proto_relative_url(self, url, scheme=None):
 596         if url is None:
 597             return url
 598         if url.startswith('//'):
 599             if scheme is None:
 600                 scheme = self.http_scheme()
 601             return scheme + url
 602         else:
 603             return url
 604
 605     def _sleep(self, timeout, video_id, msg_template=None):
 606         if msg_template is None:
 607             msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
 608         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 609         self.to_screen(msg)
 610         time.sleep(timeout)
 611
 612     def _extract_f4m_formats(self, manifest_url, video_id):
 613         manifest = self._download_xml(
 614             manifest_url, video_id, 'Downloading f4m manifest',
 615             'Unable to download f4m manifest')
 616
 617         formats = []
 618         for media_el in manifest.findall('{http://ns.adobe.com/f4m/1.0}media'):
 619             formats.append({
 620                 'url': manifest_url,
 621                 'ext': 'flv',
 622                 'tbr': int_or_none(media_el.attrib.get('bitrate')),
 623                 'width': int_or_none(media_el.attrib.get('width')),
 624                 'height': int_or_none(media_el.attrib.get('height')),
 625             })
 626         self._sort_formats(formats)
 627
 628         return formats
 629
 630
 631 class SearchInfoExtractor(InfoExtractor):
 632     """
 633     Base class for paged search queries extractors.
 634     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 635     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 636     """
 637
 638     @classmethod
 639     def _make_valid_url(cls):
 640         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 641
 642     @classmethod
 643     def suitable(cls, url):
 644         return re.match(cls._make_valid_url(), url) is not None
 645
 646     def _real_extract(self, query):
 647         mobj = re.match(self._make_valid_url(), query)
 648         if mobj is None:
 649             raise ExtractorError(u'Invalid search query "%s"' % query)
 650
 651         prefix = mobj.group('prefix')
 652         query = mobj.group('query')
 653         if prefix == '':
 654             return self._get_n_results(query, 1)
 655         elif prefix == 'all':
 656             return self._get_n_results(query, self._MAX_RESULTS)
 657         else:
 658             n = int(prefix)
 659             if n <= 0:
 660                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 661             elif n > self._MAX_RESULTS:
 662                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 663                 n = self._MAX_RESULTS
 664             return self._get_n_results(query, n)
 665
 666     def _get_n_results(self, query, n):
 667         """Get a specified number of results for a query"""
 668         raise NotImplementedError("This method must be implemented by subclasses")
 669
 670     @property
 671     def SEARCH_KEY(self):
 672         return self._SEARCH_KEY