_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_urlencode,
  25     compat_urllib_request,
  26     compat_urlparse,
  27 )
  28 from ..downloader.f4m import remove_encrypted_media
  29 from ..utils import (
  30     NO_DEFAULT,
  31     age_restricted,
  32     bug_reports_message,
  33     clean_html,
  34     compiled_regex_type,
  35     determine_ext,
  36     error_to_compat_str,
  37     ExtractorError,
  38     fix_xml_ampersands,
  39     float_or_none,
  40     int_or_none,
  41     parse_iso8601,
  42     RegexNotFoundError,
  43     sanitize_filename,
  44     sanitized_Request,
  45     unescapeHTML,
  46     unified_strdate,
  47     url_basename,
  48     xpath_text,
  49     xpath_with_ns,
  50     determine_protocol,
  51     parse_duration,
  52     mimetype2ext,
  53     update_Request,
  54     update_url_query,
  55 )
  56
  57
  58 class InfoExtractor(object):
  59     """Information Extractor class.
  60
  61     Information extractors are the classes that, given a URL, extract
  62     information about the video (or videos) the URL refers to. This
  63     information includes the real video URL, the video title, author and
  64     others. The information is stored in a dictionary which is then
  65     passed to the YoutubeDL. The YoutubeDL processes this
  66     information possibly downloading the video to the file system, among
  67     other possible outcomes.
  68
  69     The type field determines the type of the result.
  70     By far the most common value (and the default if _type is missing) is
  71     "video", which indicates a single video.
  72
  73     For a video, the dictionaries must include the following fields:
  74
  75     id:             Video identifier.
  76     title:          Video title, unescaped.
  77
  78     Additionally, it must contain either a formats entry or a url one:
  79
  80     formats:        A list of dictionaries for each format available, ordered
  81                     from worst to best quality.
  82
  83                     Potential fields:
  84                     * url        Mandatory. The URL of the video file
  85                     * ext        Will be calculated from URL if missing
  86                     * format     A human-readable description of the format
  87                                  ("mp4 container with h264/opus").
  88                                  Calculated from the format_id, width, height,
  89                                  and format_note fields if missing.
  90                     * format_id  A short description of the format
  91                                  ("mp4_h264_opus" or "19").
  92                                 Technically optional, but strongly recommended.
  93                     * format_note Additional info about the format
  94                                  ("3D" or "DASH video")
  95                     * width      Width of the video, if known
  96                     * height     Height of the video, if known
  97                     * resolution Textual description of width and height
  98                     * tbr        Average bitrate of audio and video in KBit/s
  99                     * abr        Average audio bitrate in KBit/s
 100                     * acodec     Name of the audio codec in use
 101                     * asr        Audio sampling rate in Hertz
 102                     * vbr        Average video bitrate in KBit/s
 103                     * fps        Frame rate
 104                     * vcodec     Name of the video codec in use
 105                     * container  Name of the container format
 106                     * filesize   The number of bytes, if known in advance
 107                     * filesize_approx  An estimate for the number of bytes
 108                     * player_url SWF Player URL (used for rtmpdump).
 109                     * protocol   The protocol that will be used for the actual
 110                                  download, lower-case.
 111                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 112                                  "m3u8", "m3u8_native" or "http_dash_segments".
 113                     * preference Order number of this format. If this field is
 114                                  present and not None, the formats get sorted
 115                                  by this field, regardless of all other values.
 116                                  -1 for default (order by other properties),
 117                                  -2 or smaller for less than default.
 118                                  < -1000 to hide the format (if there is
 119                                     another one which is strictly better)
 120                     * language   Language code, e.g. "de" or "en-US".
 121                     * language_preference  Is this in the language mentioned in
 122                                  the URL?
 123                                  10 if it's what the URL is about,
 124                                  -1 for default (don't know),
 125                                  -10 otherwise, other values reserved for now.
 126                     * quality    Order number of the video quality of this
 127                                  format, irrespective of the file format.
 128                                  -1 for default (order by other properties),
 129                                  -2 or smaller for less than default.
 130                     * source_preference  Order number for this video source
 131                                   (quality takes higher priority)
 132                                  -1 for default (order by other properties),
 133                                  -2 or smaller for less than default.
 134                     * http_headers  A dictionary of additional HTTP headers
 135                                  to add to the request.
 136                     * stretched_ratio  If given and not 1, indicates that the
 137                                  video's pixels are not square.
 138                                  width : height ratio as float.
 139                     * no_resume  The server does not support resuming the
 140                                  (HTTP or RTMP) download. Boolean.
 141
 142     url:            Final video URL.
 143     ext:            Video filename extension.
 144     format:         The video format, defaults to ext (used for --get-format)
 145     player_url:     SWF Player URL (used for rtmpdump).
 146
 147     The following fields are optional:
 148
 149     alt_title:      A secondary title of the video.
 150     display_id      An alternative identifier for the video, not necessarily
 151                     unique, but available before title. Typically, id is
 152                     something like "4234987", title "Dancing naked mole rats",
 153                     and display_id "dancing-naked-mole-rats"
 154     thumbnails:     A list of dictionaries, with the following entries:
 155                         * "id" (optional, string) - Thumbnail format ID
 156                         * "url"
 157                         * "preference" (optional, int) - quality of the image
 158                         * "width" (optional, int)
 159                         * "height" (optional, int)
 160                         * "resolution" (optional, string "{width}x{height}",
 161                                         deprecated)
 162     thumbnail:      Full URL to a video thumbnail image.
 163     description:    Full video description.
 164     uploader:       Full name of the video uploader.
 165     license:        License name the video is licensed under.
 166     creator:        The creator of the video.
 167     release_date:   The date (YYYYMMDD) when the video was released.
 168     timestamp:      UNIX timestamp of the moment the video became available.
 169     upload_date:    Video upload date (YYYYMMDD).
 170                     If not explicitly set, calculated from timestamp.
 171     uploader_id:    Nickname or id of the video uploader.
 172     uploader_url:   Full URL to a personal webpage of the video uploader.
 173     location:       Physical location where the video was filmed.
 174     subtitles:      The available subtitles as a dictionary in the format
 175                     {language: subformats}. "subformats" is a list sorted from
 176                     lower to higher preference, each element is a dictionary
 177                     with the "ext" entry and one of:
 178                         * "data": The subtitles file contents
 179                         * "url": A URL pointing to the subtitles file
 180                     "ext" will be calculated from URL if missing
 181     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 182                     automatically generated captions
 183     duration:       Length of the video in seconds, as an integer or float.
 184     view_count:     How many users have watched the video on the platform.
 185     like_count:     Number of positive ratings of the video
 186     dislike_count:  Number of negative ratings of the video
 187     repost_count:   Number of reposts of the video
 188     average_rating: Average rating given by users, the scale used depends on the webpage
 189     comment_count:  Number of comments on the video
 190     comments:       A list of comments, each with one or more of the following
 191                     properties (all but one of text or html optional):
 192                         * "author" - human-readable name of the comment author
 193                         * "author_id" - user ID of the comment author
 194                         * "id" - Comment ID
 195                         * "html" - Comment as HTML
 196                         * "text" - Plain text of the comment
 197                         * "timestamp" - UNIX timestamp of comment
 198                         * "parent" - ID of the comment this one is replying to.
 199                                      Set to "root" to indicate that this is a
 200                                      comment to the original video.
 201     age_limit:      Age restriction for the video, as an integer (years)
 202     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 203                     should allow to get the same result again. (It will be set
 204                     by YoutubeDL if it's missing)
 205     categories:     A list of categories that the video falls in, for example
 206                     ["Sports", "Berlin"]
 207     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 208     is_live:        True, False, or None (=unknown). Whether this video is a
 209                     live stream that goes on instead of a fixed-length video.
 210     start_time:     Time in seconds where the reproduction should start, as
 211                     specified in the URL.
 212     end_time:       Time in seconds where the reproduction should end, as
 213                     specified in the URL.
 214
 215     The following fields should only be used when the video belongs to some logical
 216     chapter or section:
 217
 218     chapter:        Name or title of the chapter the video belongs to.
 219     chapter_number: Number of the chapter the video belongs to, as an integer.
 220     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 221
 222     The following fields should only be used when the video is an episode of some
 223     series or programme:
 224
 225     series:         Title of the series or programme the video episode belongs to.
 226     season:         Title of the season the video episode belongs to.
 227     season_number:  Number of the season the video episode belongs to, as an integer.
 228     season_id:      Id of the season the video episode belongs to, as a unicode string.
 229     episode:        Title of the video episode. Unlike mandatory video title field,
 230                     this field should denote the exact title of the video episode
 231                     without any kind of decoration.
 232     episode_number: Number of the video episode within a season, as an integer.
 233     episode_id:     Id of the video episode, as a unicode string.
 234
 235     The following fields should only be used when the media is a track or a part of
 236     a music album:
 237
 238     track:          Title of the track.
 239     track_number:   Number of the track within an album or a disc, as an integer.
 240     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 241                     as a unicode string.
 242     artist:         Artist(s) of the track.
 243     genre:          Genre(s) of the track.
 244     album:          Title of the album the track belongs to.
 245     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 246     album_artist:   List of all artists appeared on the album (e.g.
 247                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 248                     and compilations).
 249     disc_number:    Number of the disc or other physical medium the track belongs to,
 250                     as an integer.
 251     release_year:   Year (YYYY) when the album was released.
 252
 253     Unless mentioned otherwise, the fields should be Unicode strings.
 254
 255     Unless mentioned otherwise, None is equivalent to absence of information.
 256
 257
 258     _type "playlist" indicates multiple videos.
 259     There must be a key "entries", which is a list, an iterable, or a PagedList
 260     object, each element of which is a valid dictionary by this specification.
 261
 262     Additionally, playlists can have "title", "description" and "id" attributes
 263     with the same semantics as videos (see above).
 264
 265
 266     _type "multi_video" indicates that there are multiple videos that
 267     form a single show, for example multiple acts of an opera or TV episode.
 268     It must have an entries key like a playlist and contain all the keys
 269     required for a video at the same time.
 270
 271
 272     _type "url" indicates that the video must be extracted from another
 273     location, possibly by a different extractor. Its only required key is:
 274     "url" - the next URL to extract.
 275     The key "ie_key" can be set to the class name (minus the trailing "IE",
 276     e.g. "Youtube") if the extractor class is known in advance.
 277     Additionally, the dictionary may have any properties of the resolved entity
 278     known in advance, for example "title" if the title of the referred video is
 279     known ahead of time.
 280
 281
 282     _type "url_transparent" entities have the same specification as "url", but
 283     indicate that the given additional information is more precise than the one
 284     associated with the resolved URL.
 285     This is useful when a site employs a video service that hosts the video and
 286     its technical metadata, but that video service does not embed a useful
 287     title, description etc.
 288
 289
 290     Subclasses of this one should re-define the _real_initialize() and
 291     _real_extract() methods and define a _VALID_URL regexp.
 292     Probably, they should also be added to the list of extractors.
 293
 294     Finally, the _WORKING attribute should be set to False for broken IEs
 295     in order to warn the users and skip the tests.
 296     """
 297
 298     _ready = False
 299     _downloader = None
 300     _WORKING = True
 301
 302     def __init__(self, downloader=None):
 303         """Constructor. Receives an optional downloader."""
 304         self._ready = False
 305         self.set_downloader(downloader)
 306
 307     @classmethod
 308     def suitable(cls, url):
 309         """Receives a URL and returns True if suitable for this IE."""
 310
 311         # This does not use has/getattr intentionally - we want to know whether
 312         # we have cached the regexp for *this* class, whereas getattr would also
 313         # match the superclass
 314         if '_VALID_URL_RE' not in cls.__dict__:
 315             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 316         return cls._VALID_URL_RE.match(url) is not None
 317
 318     @classmethod
 319     def _match_id(cls, url):
 320         if '_VALID_URL_RE' not in cls.__dict__:
 321             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 322         m = cls._VALID_URL_RE.match(url)
 323         assert m
 324         return m.group('id')
 325
 326     @classmethod
 327     def working(cls):
 328         """Getter method for _WORKING."""
 329         return cls._WORKING
 330
 331     def initialize(self):
 332         """Initializes an instance (authentication, etc)."""
 333         if not self._ready:
 334             self._real_initialize()
 335             self._ready = True
 336
 337     def extract(self, url):
 338         """Extracts URL information and returns it in list of dicts."""
 339         try:
 340             self.initialize()
 341             return self._real_extract(url)
 342         except ExtractorError:
 343             raise
 344         except compat_http_client.IncompleteRead as e:
 345             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 346         except (KeyError, StopIteration) as e:
 347             raise ExtractorError('An extractor error has occurred.', cause=e)
 348
 349     def set_downloader(self, downloader):
 350         """Sets the downloader for this IE."""
 351         self._downloader = downloader
 352
 353     def _real_initialize(self):
 354         """Real initialization process. Redefine in subclasses."""
 355         pass
 356
 357     def _real_extract(self, url):
 358         """Real extraction process. Redefine in subclasses."""
 359         pass
 360
 361     @classmethod
 362     def ie_key(cls):
 363         """A string for getting the InfoExtractor with get_info_extractor"""
 364         return compat_str(cls.__name__[:-2])
 365
 366     @property
 367     def IE_NAME(self):
 368         return compat_str(type(self).__name__[:-2])
 369
 370     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 371         """ Returns the response handle """
 372         if note is None:
 373             self.report_download_webpage(video_id)
 374         elif note is not False:
 375             if video_id is None:
 376                 self.to_screen('%s' % (note,))
 377             else:
 378                 self.to_screen('%s: %s' % (video_id, note))
 379         if isinstance(url_or_request, compat_urllib_request.Request):
 380             url_or_request = update_Request(
 381                 url_or_request, data=data, headers=headers, query=query)
 382         else:
 383             if query:
 384                 url_or_request = update_url_query(url_or_request, query)
 385             if data is not None or headers:
 386                 url_or_request = sanitized_Request(url_or_request, data, headers)
 387         try:
 388             return self._downloader.urlopen(url_or_request)
 389         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 390             if errnote is False:
 391                 return False
 392             if errnote is None:
 393                 errnote = 'Unable to download webpage'
 394
 395             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 396             if fatal:
 397                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 398             else:
 399                 self._downloader.report_warning(errmsg)
 400                 return False
 401
 402     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 403         """ Returns a tuple (page content as string, URL handle) """
 404         # Strip hashes from the URL (#1038)
 405         if isinstance(url_or_request, (compat_str, str)):
 406             url_or_request = url_or_request.partition('#')[0]
 407
 408         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 409         if urlh is False:
 410             assert not fatal
 411             return False
 412         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 413         return (content, urlh)
 414
 415     @staticmethod
 416     def _guess_encoding_from_content(content_type, webpage_bytes):
 417         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 418         if m:
 419             encoding = m.group(1)
 420         else:
 421             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 422                           webpage_bytes[:1024])
 423             if m:
 424                 encoding = m.group(1).decode('ascii')
 425             elif webpage_bytes.startswith(b'\xff\xfe'):
 426                 encoding = 'utf-16'
 427             else:
 428                 encoding = 'utf-8'
 429
 430         return encoding
 431
 432     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 433         content_type = urlh.headers.get('Content-Type', '')
 434         webpage_bytes = urlh.read()
 435         if prefix is not None:
 436             webpage_bytes = prefix + webpage_bytes
 437         if not encoding:
 438             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 439         if self._downloader.params.get('dump_intermediate_pages', False):
 440             try:
 441                 url = url_or_request.get_full_url()
 442             except AttributeError:
 443                 url = url_or_request
 444             self.to_screen('Dumping request to ' + url)
 445             dump = base64.b64encode(webpage_bytes).decode('ascii')
 446             self._downloader.to_screen(dump)
 447         if self._downloader.params.get('write_pages', False):
 448             try:
 449                 url = url_or_request.get_full_url()
 450             except AttributeError:
 451                 url = url_or_request
 452             basen = '%s_%s' % (video_id, url)
 453             if len(basen) > 240:
 454                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 455                 basen = basen[:240 - len(h)] + h
 456             raw_filename = basen + '.dump'
 457             filename = sanitize_filename(raw_filename, restricted=True)
 458             self.to_screen('Saving request to ' + filename)
 459             # Working around MAX_PATH limitation on Windows (see
 460             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 461             if compat_os_name == 'nt':
 462                 absfilepath = os.path.abspath(filename)
 463                 if len(absfilepath) > 259:
 464                     filename = '\\\\?\\' + absfilepath
 465             with open(filename, 'wb') as outf:
 466                 outf.write(webpage_bytes)
 467
 468         try:
 469             content = webpage_bytes.decode(encoding, 'replace')
 470         except LookupError:
 471             content = webpage_bytes.decode('utf-8', 'replace')
 472
 473         if ('<title>Access to this site is blocked</title>' in content and
 474                 'Websense' in content[:512]):
 475             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 476             blocked_iframe = self._html_search_regex(
 477                 r'<iframe src="([^"]+)"', content,
 478                 'Websense information URL', default=None)
 479             if blocked_iframe:
 480                 msg += ' Visit %s for more details' % blocked_iframe
 481             raise ExtractorError(msg, expected=True)
 482         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 483             msg = (
 484                 'Access to this webpage has been blocked by Indian censorship. '
 485                 'Use a VPN or proxy server (with --proxy) to route around it.')
 486             block_msg = self._html_search_regex(
 487                 r'</h1><p>(.*?)</p>',
 488                 content, 'block message', default=None)
 489             if block_msg:
 490                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 491             raise ExtractorError(msg, expected=True)
 492
 493         return content
 494
 495     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 496         """ Returns the data of the page as a string """
 497         success = False
 498         try_count = 0
 499         while success is False:
 500             try:
 501                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 502                 success = True
 503             except compat_http_client.IncompleteRead as e:
 504                 try_count += 1
 505                 if try_count >= tries:
 506                     raise e
 507                 self._sleep(timeout, video_id)
 508         if res is False:
 509             return res
 510         else:
 511             content, _ = res
 512             return content
 513
 514     def _download_xml(self, url_or_request, video_id,
 515                       note='Downloading XML', errnote='Unable to download XML',
 516                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 517         """Return the xml as an xml.etree.ElementTree.Element"""
 518         xml_string = self._download_webpage(
 519             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 520         if xml_string is False:
 521             return xml_string
 522         if transform_source:
 523             xml_string = transform_source(xml_string)
 524         return compat_etree_fromstring(xml_string.encode('utf-8'))
 525
 526     def _download_json(self, url_or_request, video_id,
 527                        note='Downloading JSON metadata',
 528                        errnote='Unable to download JSON metadata',
 529                        transform_source=None,
 530                        fatal=True, encoding=None, data=None, headers={}, query={}):
 531         json_string = self._download_webpage(
 532             url_or_request, video_id, note, errnote, fatal=fatal,
 533             encoding=encoding, data=data, headers=headers, query=query)
 534         if (not fatal) and json_string is False:
 535             return None
 536         return self._parse_json(
 537             json_string, video_id, transform_source=transform_source, fatal=fatal)
 538
 539     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 540         if transform_source:
 541             json_string = transform_source(json_string)
 542         try:
 543             return json.loads(json_string)
 544         except ValueError as ve:
 545             errmsg = '%s: Failed to parse JSON ' % video_id
 546             if fatal:
 547                 raise ExtractorError(errmsg, cause=ve)
 548             else:
 549                 self.report_warning(errmsg + str(ve))
 550
 551     def report_warning(self, msg, video_id=None):
 552         idstr = '' if video_id is None else '%s: ' % video_id
 553         self._downloader.report_warning(
 554             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 555
 556     def to_screen(self, msg):
 557         """Print msg to screen, prefixing it with '[ie_name]'"""
 558         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 559
 560     def report_extraction(self, id_or_name):
 561         """Report information extraction."""
 562         self.to_screen('%s: Extracting information' % id_or_name)
 563
 564     def report_download_webpage(self, video_id):
 565         """Report webpage download."""
 566         self.to_screen('%s: Downloading webpage' % video_id)
 567
 568     def report_age_confirmation(self):
 569         """Report attempt to confirm age."""
 570         self.to_screen('Confirming age')
 571
 572     def report_login(self):
 573         """Report attempt to log in."""
 574         self.to_screen('Logging in')
 575
 576     @staticmethod
 577     def raise_login_required(msg='This video is only available for registered users'):
 578         raise ExtractorError(
 579             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 580             expected=True)
 581
 582     @staticmethod
 583     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 584         raise ExtractorError(
 585             '%s. You might want to use --proxy to workaround.' % msg,
 586             expected=True)
 587
 588     # Methods for following #608
 589     @staticmethod
 590     def url_result(url, ie=None, video_id=None, video_title=None):
 591         """Returns a URL that points to a page that should be processed"""
 592         # TODO: ie should be the class used for getting the info
 593         video_info = {'_type': 'url',
 594                       'url': url,
 595                       'ie_key': ie}
 596         if video_id is not None:
 597             video_info['id'] = video_id
 598         if video_title is not None:
 599             video_info['title'] = video_title
 600         return video_info
 601
 602     @staticmethod
 603     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 604         """Returns a playlist"""
 605         video_info = {'_type': 'playlist',
 606                       'entries': entries}
 607         if playlist_id:
 608             video_info['id'] = playlist_id
 609         if playlist_title:
 610             video_info['title'] = playlist_title
 611         if playlist_description:
 612             video_info['description'] = playlist_description
 613         return video_info
 614
 615     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 616         """
 617         Perform a regex search on the given string, using a single or a list of
 618         patterns returning the first matching group.
 619         In case of failure return a default value or raise a WARNING or a
 620         RegexNotFoundError, depending on fatal, specifying the field name.
 621         """
 622         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 623             mobj = re.search(pattern, string, flags)
 624         else:
 625             for p in pattern:
 626                 mobj = re.search(p, string, flags)
 627                 if mobj:
 628                     break
 629
 630         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 631             _name = '\033[0;34m%s\033[0m' % name
 632         else:
 633             _name = name
 634
 635         if mobj:
 636             if group is None:
 637                 # return the first matching group
 638                 return next(g for g in mobj.groups() if g is not None)
 639             else:
 640                 return mobj.group(group)
 641         elif default is not NO_DEFAULT:
 642             return default
 643         elif fatal:
 644             raise RegexNotFoundError('Unable to extract %s' % _name)
 645         else:
 646             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 647             return None
 648
 649     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 650         """
 651         Like _search_regex, but strips HTML tags and unescapes entities.
 652         """
 653         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 654         if res:
 655             return clean_html(res).strip()
 656         else:
 657             return res
 658
 659     def _get_login_info(self):
 660         """
 661         Get the login info as (username, password)
 662         It will look in the netrc file using the _NETRC_MACHINE value
 663         If there's no info available, return (None, None)
 664         """
 665         if self._downloader is None:
 666             return (None, None)
 667
 668         username = None
 669         password = None
 670         downloader_params = self._downloader.params
 671
 672         # Attempt to use provided username and password or .netrc data
 673         if downloader_params.get('username') is not None:
 674             username = downloader_params['username']
 675             password = downloader_params['password']
 676         elif downloader_params.get('usenetrc', False):
 677             try:
 678                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 679                 if info is not None:
 680                     username = info[0]
 681                     password = info[2]
 682                 else:
 683                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 684             except (IOError, netrc.NetrcParseError) as err:
 685                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 686
 687         return (username, password)
 688
 689     def _get_tfa_info(self, note='two-factor verification code'):
 690         """
 691         Get the two-factor authentication info
 692         TODO - asking the user will be required for sms/phone verify
 693         currently just uses the command line option
 694         If there's no info available, return None
 695         """
 696         if self._downloader is None:
 697             return None
 698         downloader_params = self._downloader.params
 699
 700         if downloader_params.get('twofactor') is not None:
 701             return downloader_params['twofactor']
 702
 703         return compat_getpass('Type %s and press [Return]: ' % note)
 704
 705     # Helper functions for extracting OpenGraph info
 706     @staticmethod
 707     def _og_regexes(prop):
 708         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 709         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 710                        % {'prop': re.escape(prop)})
 711         template = r'<meta[^>]+?%s[^>]+?%s'
 712         return [
 713             template % (property_re, content_re),
 714             template % (content_re, property_re),
 715         ]
 716
 717     @staticmethod
 718     def _meta_regex(prop):
 719         return r'''(?isx)<meta
 720                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 721                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 722
 723     def _og_search_property(self, prop, html, name=None, **kargs):
 724         if name is None:
 725             name = 'OpenGraph %s' % prop
 726         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 727         if escaped is None:
 728             return None
 729         return unescapeHTML(escaped)
 730
 731     def _og_search_thumbnail(self, html, **kargs):
 732         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 733
 734     def _og_search_description(self, html, **kargs):
 735         return self._og_search_property('description', html, fatal=False, **kargs)
 736
 737     def _og_search_title(self, html, **kargs):
 738         return self._og_search_property('title', html, **kargs)
 739
 740     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 741         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 742         if secure:
 743             regexes = self._og_regexes('video:secure_url') + regexes
 744         return self._html_search_regex(regexes, html, name, **kargs)
 745
 746     def _og_search_url(self, html, **kargs):
 747         return self._og_search_property('url', html, **kargs)
 748
 749     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 750         if display_name is None:
 751             display_name = name
 752         return self._html_search_regex(
 753             self._meta_regex(name),
 754             html, display_name, fatal=fatal, group='content', **kwargs)
 755
 756     def _dc_search_uploader(self, html):
 757         return self._html_search_meta('dc.creator', html, 'uploader')
 758
 759     def _rta_search(self, html):
 760         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 761         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 762                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 763                      html):
 764             return 18
 765         return 0
 766
 767     def _media_rating_search(self, html):
 768         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 769         rating = self._html_search_meta('rating', html)
 770
 771         if not rating:
 772             return None
 773
 774         RATING_TABLE = {
 775             'safe for kids': 0,
 776             'general': 8,
 777             '14 years': 14,
 778             'mature': 17,
 779             'restricted': 19,
 780         }
 781         return RATING_TABLE.get(rating.lower())
 782
 783     def _family_friendly_search(self, html):
 784         # See http://schema.org/VideoObject
 785         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 786
 787         if not family_friendly:
 788             return None
 789
 790         RATING_TABLE = {
 791             '1': 0,
 792             'true': 0,
 793             '0': 18,
 794             'false': 18,
 795         }
 796         return RATING_TABLE.get(family_friendly.lower())
 797
 798     def _twitter_search_player(self, html):
 799         return self._html_search_meta('twitter:player', html,
 800                                       'twitter card player')
 801
 802     def _search_json_ld(self, html, video_id, **kwargs):
 803         json_ld = self._search_regex(
 804             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 805             html, 'JSON-LD', group='json_ld', **kwargs)
 806         if not json_ld:
 807             return {}
 808         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 809
 810     def _json_ld(self, json_ld, video_id, fatal=True):
 811         if isinstance(json_ld, compat_str):
 812             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 813         if not json_ld:
 814             return {}
 815         info = {}
 816         if json_ld.get('@context') == 'http://schema.org':
 817             item_type = json_ld.get('@type')
 818             if item_type == 'TVEpisode':
 819                 info.update({
 820                     'episode': unescapeHTML(json_ld.get('name')),
 821                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 822                     'description': unescapeHTML(json_ld.get('description')),
 823                 })
 824                 part_of_season = json_ld.get('partOfSeason')
 825                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 826                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 827                 part_of_series = json_ld.get('partOfSeries')
 828                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 829                     info['series'] = unescapeHTML(part_of_series.get('name'))
 830             elif item_type == 'Article':
 831                 info.update({
 832                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 833                     'title': unescapeHTML(json_ld.get('headline')),
 834                     'description': unescapeHTML(json_ld.get('articleBody')),
 835                 })
 836         return dict((k, v) for k, v in info.items() if v is not None)
 837
 838     @staticmethod
 839     def _hidden_inputs(html):
 840         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 841         hidden_inputs = {}
 842         for input in re.findall(r'(?i)<input([^>]+)>', html):
 843             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 844                 continue
 845             name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input)
 846             if not name:
 847                 continue
 848             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 849             if not value:
 850                 continue
 851             hidden_inputs[name.group('value')] = value.group('value')
 852         return hidden_inputs
 853
 854     def _form_hidden_inputs(self, form_id, html):
 855         form = self._search_regex(
 856             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 857             html, '%s form' % form_id, group='form')
 858         return self._hidden_inputs(form)
 859
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place by ascending sort key.

        Formats with a larger key end up later in the list. When
        field_preference is a list or tuple of field names, the key is
        simply the tuple of those fields (missing ones count as -1);
        otherwise the built-in ordering of _formats_key below is used.
        As a side effect a missing 'tbr' is filled in from abr + vbr and a
        missing 'ext' is derived from the URL.

        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller supplied ordering: compare on exactly these fields
            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            # An explicit 'preference' is used as given; only the implicit
            # default of 0 gets the f4f/f4m penalty
            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Plain HTTP(S) is slightly preferred over any other protocol
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            # The position of the extension in ORDER is its score (later
            # entries score higher); extensions not listed score -1
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare element by element, so earlier entries dominate
            # later ones. Missing numeric fields count as -1.
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 930
 931     def _check_formats(self, formats, video_id):
 932         if formats:
 933             formats[:] = filter(
 934                 lambda f: self._is_valid_url(
 935                     f['url'], video_id,
 936                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 937                 formats)
 938
 939     @staticmethod
 940     def _remove_duplicate_formats(formats):
 941         format_urls = set()
 942         unique_formats = []
 943         for f in formats:
 944             if f['url'] not in format_urls:
 945                 format_urls.add(f['url'])
 946                 unique_formats.append(f)
 947         formats[:] = unique_formats
 948
 949     def _is_valid_url(self, url, video_id, item='video'):
 950         url = self._proto_relative_url(url, scheme='http:')
 951         # For now assume non HTTP(S) URLs always valid
 952         if not (url.startswith('http://') or url.startswith('https://')):
 953             return True
 954         try:
 955             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 956             return True
 957         except ExtractorError as e:
 958             if isinstance(e.cause, compat_urllib_error.URLError):
 959                 self.to_screen(
 960                     '%s: %s URL is invalid, skipping' % (video_id, item))
 961                 return False
 962             raise
 963
 964     def http_scheme(self):
 965         """ Either "http:" or "https:", depending on the user's preferences """
 966         return (
 967             'http:'
 968             if self._downloader.params.get('prefer_insecure', False)
 969             else 'https:')
 970
 971     def _proto_relative_url(self, url, scheme=None):
 972         if url is None:
 973             return url
 974         if url.startswith('//'):
 975             if scheme is None:
 976                 scheme = self.http_scheme()
 977             return scheme + url
 978         else:
 979             return url
 980
 981     def _sleep(self, timeout, video_id, msg_template=None):
 982         if msg_template is None:
 983             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 984         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 985         self.to_screen(msg)
 986         time.sleep(timeout)
 987
 988     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 989                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 990                              fatal=True, m3u8_id=None):
 991         manifest = self._download_xml(
 992             manifest_url, video_id, 'Downloading f4m manifest',
 993             'Unable to download f4m manifest',
 994             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 995             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 996             transform_source=transform_source,
 997             fatal=fatal)
 998
 999         if manifest is False:
1000             return []
1001
1002         return self._parse_f4m_formats(
1003             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1004             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1005
1006     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1007                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1008                            fatal=True, m3u8_id=None):
1009         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1010         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1011         if akamai_pv is not None and ';' in akamai_pv.text:
1012             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1013             if playerVerificationChallenge.strip() != '':
1014                 return []
1015
1016         formats = []
1017         manifest_version = '1.0'
1018         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1019         if not media_nodes:
1020             manifest_version = '2.0'
1021             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1022         # Remove unsupported DRM protected media from final formats
1023         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1024         media_nodes = remove_encrypted_media(media_nodes)
1025         if not media_nodes:
1026             return formats
1027         base_url = xpath_text(
1028             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1029             'base URL', default=None)
1030         if base_url:
1031             base_url = base_url.strip()
1032
1033         bootstrap_info = xpath_text(
1034             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1035             'bootstrap info', default=None)
1036
1037         for i, media_el in enumerate(media_nodes):
1038             tbr = int_or_none(media_el.attrib.get('bitrate'))
1039             width = int_or_none(media_el.attrib.get('width'))
1040             height = int_or_none(media_el.attrib.get('height'))
1041             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1042             # If <bootstrapInfo> is present, the specified f4m is a
1043             # stream-level manifest, and only set-level manifests may refer to
1044             # external resources.  See section 11.4 and section 4 of F4M spec
1045             if bootstrap_info is None:
1046                 media_url = None
1047                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1048                 if manifest_version == '2.0':
1049                     media_url = media_el.attrib.get('href')
1050                 if media_url is None:
1051                     media_url = media_el.attrib.get('url')
1052                 if not media_url:
1053                     continue
1054                 manifest_url = (
1055                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1056                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1057                 # If media_url is itself a f4m manifest do the recursive extraction
1058                 # since bitrates in parent manifest (this one) and media_url manifest
1059                 # may differ leading to inability to resolve the format by requested
1060                 # bitrate in f4m downloader
1061                 ext = determine_ext(manifest_url)
1062                 if ext == 'f4m':
1063                     f4m_formats = self._extract_f4m_formats(
1064                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1065                         transform_source=transform_source, fatal=fatal)
1066                     # Sometimes stream-level manifest contains single media entry that
1067                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1068                     # At the same time parent's media entry in set-level manifest may
1069                     # contain it. We will copy it from parent in such cases.
1070                     if len(f4m_formats) == 1:
1071                         f = f4m_formats[0]
1072                         f.update({
1073                             'tbr': f.get('tbr') or tbr,
1074                             'width': f.get('width') or width,
1075                             'height': f.get('height') or height,
1076                             'format_id': f.get('format_id') if not tbr else format_id,
1077                         })
1078                     formats.extend(f4m_formats)
1079                     continue
1080                 elif ext == 'm3u8':
1081                     formats.extend(self._extract_m3u8_formats(
1082                         manifest_url, video_id, 'mp4', preference=preference,
1083                         m3u8_id=m3u8_id, fatal=fatal))
1084                     continue
1085             formats.append({
1086                 'format_id': format_id,
1087                 'url': manifest_url,
1088                 'ext': 'flv' if bootstrap_info else None,
1089                 'tbr': tbr,
1090                 'width': width,
1091                 'height': height,
1092                 'preference': preference,
1093             })
1094         return formats
1095
1096     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1097         return {
1098             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1099             'url': m3u8_url,
1100             'ext': ext,
1101             'protocol': 'm3u8',
1102             'preference': preference - 1 if preference else -1,
1103             'resolution': 'multiple',
1104             'format_note': 'Quality selection URL',
1105         }
1106
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):
        """Download the m3u8 playlist at m3u8_url and return its formats.

        Returns an empty list when the download yields False. A media
        playlist is returned as a single format pointing to the playlist
        itself. For any other playlist the result starts with the "meta"
        format built by _m3u8_meta_format, followed by one format per URI
        line, described by the preceding #EXT-X-STREAM-INF / #EXT-X-MEDIA
        tags. m3u8_id is used as format_id prefix; with live set, the
        stream name / bitrate is not appended to the format_id.
        """

        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]

        # Resolve playlist entries relative to the playlist URL (this reads
        # m3u8_url at call time, i.e. the final URL after the reassignment below)
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        # Continue with the URL reported by the response handle
        m3u8_url = urlh.geturl()

        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
        # and MUST NOT appear in master playlist thus we can clearly detect media
        # playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
        # Attributes of the most recent #EXT-X-STREAM-INF / #EXT-X-MEDIA tag
        last_info = None
        last_media = None
        # Matches KEY=value and KEY="quoted value" attribute pairs
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags, comments and blank lines carry nothing we use
                continue
            else:
                # A URI line; no #EXT-X-STREAM-INF tag has been seen so far,
                # so there is nothing to describe it with
                if last_info is None:
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None
                # Despite specification does not mention NAME attribute for
                # EXT-X-STREAM-INF it still sometimes may be present
                stream_name = last_info.get('NAME') or last_media_name
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                codecs = last_info.get('CODECS')
                if codecs:
                    vcodec, acodec = [None] * 2
                    va_codecs = codecs.split(',')
                    if len(va_codecs) == 1:
                        # Audio only entries usually come with single codec and
                        # no resolution. For more robustness we also check it to
                        # be mp4 audio.
                        if not resolution and va_codecs[0].startswith('mp4a'):
                            vcodec, acodec = 'none', va_codecs[0]
                        else:
                            vcodec = va_codecs[0]
                    else:
                        vcodec, acodec = va_codecs[:2]
                    f.update({
                        'acodec': acodec,
                        'vcodec': vcodec,
                    })
                # Attach the pending #EXT-X-MEDIA attributes to this format only
                if last_media is not None:
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                # The tag attributes apply to a single URI line only
                last_info = {}
        return formats
1225
1226     @staticmethod
1227     def _xpath_ns(path, namespace=None):
1228         if not namespace:
1229             return path
1230         out = []
1231         for c in path.split('/'):
1232             if not c or c == '.':
1233                 out.append(c)
1234             else:
1235                 out.append('{%s}%s' % (namespace, c))
1236         return '/'.join(out)
1237
1238     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1239         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1240
1241         if smil is False:
1242             assert not fatal
1243             return []
1244
1245         namespace = self._parse_smil_namespace(smil)
1246
1247         return self._parse_smil_formats(
1248             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1249
1250     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1251         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1252         if smil is False:
1253             return {}
1254         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1255
1256     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1257         return self._download_xml(
1258             smil_url, video_id, 'Downloading SMIL file',
1259             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1260
1261     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1262         namespace = self._parse_smil_namespace(smil)
1263
1264         formats = self._parse_smil_formats(
1265             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1266         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1267
1268         video_id = os.path.splitext(url_basename(smil_url))[0]
1269         title = None
1270         description = None
1271         upload_date = None
1272         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1273             name = meta.attrib.get('name')
1274             content = meta.attrib.get('content')
1275             if not name or not content:
1276                 continue
1277             if not title and name == 'title':
1278                 title = content
1279             elif not description and name in ('description', 'abstract'):
1280                 description = content
1281             elif not upload_date and name == 'date':
1282                 upload_date = unified_strdate(content)
1283
1284         thumbnails = [{
1285             'id': image.get('type'),
1286             'url': image.get('src'),
1287             'width': int_or_none(image.get('width')),
1288             'height': int_or_none(image.get('height')),
1289         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1290
1291         return {
1292             'id': video_id,
1293             'title': title or video_id,
1294             'description': description,
1295             'upload_date': upload_date,
1296             'thumbnails': thumbnails,
1297             'formats': formats,
1298             'subtitles': subtitles,
1299         }
1300
1301     def _parse_smil_namespace(self, smil):
1302         return self._search_regex(
1303             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1304
1305     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1306         base = smil_url
1307         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1308             b = meta.get('base') or meta.get('httpBase')
1309             if b:
1310                 base = b
1311                 break
1312
1313         formats = []
1314         rtmp_count = 0
1315         http_count = 0
1316         m3u8_count = 0
1317
1318         srcs = []
1319         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1320         for medium in media:
1321             src = medium.get('src')
1322             if not src or src in srcs:
1323                 continue
1324             srcs.append(src)
1325
1326             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1327             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1328             width = int_or_none(medium.get('width'))
1329             height = int_or_none(medium.get('height'))
1330             proto = medium.get('proto')
1331             ext = medium.get('ext')
1332             src_ext = determine_ext(src)
1333             streamer = medium.get('streamer') or base
1334
1335             if proto == 'rtmp' or streamer.startswith('rtmp'):
1336                 rtmp_count += 1
1337                 formats.append({
1338                     'url': streamer,
1339                     'play_path': src,
1340                     'ext': 'flv',
1341                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1342                     'tbr': bitrate,
1343                     'filesize': filesize,
1344                     'width': width,
1345                     'height': height,
1346                 })
1347                 if transform_rtmp_url:
1348                     streamer, src = transform_rtmp_url(streamer, src)
1349                     formats[-1].update({
1350                         'url': streamer,
1351                         'play_path': src,
1352                     })
1353                 continue
1354
1355             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1356             src_url = src_url.strip()
1357
1358             if proto == 'm3u8' or src_ext == 'm3u8':
1359                 m3u8_formats = self._extract_m3u8_formats(
1360                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1361                 if len(m3u8_formats) == 1:
1362                     m3u8_count += 1
1363                     m3u8_formats[0].update({
1364                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1365                         'tbr': bitrate,
1366                         'width': width,
1367                         'height': height,
1368                     })
1369                 formats.extend(m3u8_formats)
1370                 continue
1371
1372             if src_ext == 'f4m':
1373                 f4m_url = src_url
1374                 if not f4m_params:
1375                     f4m_params = {
1376                         'hdcore': '3.2.0',
1377                         'plugin': 'flowplayer-3.2.0.1',
1378                     }
1379                 f4m_url += '&' if '?' in f4m_url else '?'
1380                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1381                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1382                 continue
1383
1384             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1385                 http_count += 1
1386                 formats.append({
1387                     'url': src_url,
1388                     'ext': ext or src_ext or 'flv',
1389                     'format_id': 'http-%d' % (bitrate or http_count),
1390                     'tbr': bitrate,
1391                     'filesize': filesize,
1392                     'width': width,
1393                     'height': height,
1394                 })
1395                 continue
1396
1397         return formats
1398
1399     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1400         urls = []
1401         subtitles = {}
1402         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1403             src = textstream.get('src')
1404             if not src or src in urls:
1405                 continue
1406             urls.append(src)
1407             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1408             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1409             subtitles.setdefault(lang, []).append({
1410                 'url': src,
1411                 'ext': ext,
1412             })
1413         return subtitles
1414
1415     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1416         xspf = self._download_xml(
1417             playlist_url, playlist_id, 'Downloading xpsf playlist',
1418             'Unable to download xspf manifest', fatal=fatal)
1419         if xspf is False:
1420             return []
1421         return self._parse_xspf(xspf, playlist_id)
1422
1423     def _parse_xspf(self, playlist, playlist_id):
1424         NS_MAP = {
1425             'xspf': 'http://xspf.org/ns/0/',
1426             's1': 'http://static.streamone.nl/player/ns/0',
1427         }
1428
1429         entries = []
1430         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1431             title = xpath_text(
1432                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1433             description = xpath_text(
1434                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1435             thumbnail = xpath_text(
1436                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1437             duration = float_or_none(
1438                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1439
1440             formats = [{
1441                 'url': location.text,
1442                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1443                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1444                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1445             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1446             self._sort_formats(formats)
1447
1448             entries.append({
1449                 'id': playlist_id,
1450                 'title': title,
1451                 'description': description,
1452                 'thumbnail': thumbnail,
1453                 'duration': duration,
1454                 'formats': formats,
1455             })
1456         return entries
1457
1458     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1459         res = self._download_webpage_handle(
1460             mpd_url, video_id,
1461             note=note or 'Downloading MPD manifest',
1462             errnote=errnote or 'Failed to download MPD manifest',
1463             fatal=fatal)
1464         if res is False:
1465             return []
1466         mpd, urlh = res
1467         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1468
1469         return self._parse_mpd_formats(
1470             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1471
1472     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1473         if mpd_doc.get('type') == 'dynamic':
1474             return []
1475
1476         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1477
1478         def _add_ns(path):
1479             return self._xpath_ns(path, namespace)
1480
1481         def is_drm_protected(element):
1482             return element.find(_add_ns('ContentProtection')) is not None
1483
1484         def extract_multisegment_info(element, ms_parent_info):
1485             ms_info = ms_parent_info.copy()
1486             segment_list = element.find(_add_ns('SegmentList'))
1487             if segment_list is not None:
1488                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1489                 if segment_urls_e:
1490                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1491                 initialization = segment_list.find(_add_ns('Initialization'))
1492                 if initialization is not None:
1493                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1494             else:
1495                 segment_template = element.find(_add_ns('SegmentTemplate'))
1496                 if segment_template is not None:
1497                     start_number = segment_template.get('startNumber')
1498                     if start_number:
1499                         ms_info['start_number'] = int(start_number)
1500                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1501                     if segment_timeline is not None:
1502                         s_e = segment_timeline.findall(_add_ns('S'))
1503                         if s_e:
1504                             ms_info['total_number'] = 0
1505                             for s in s_e:
1506                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1507                     else:
1508                         timescale = segment_template.get('timescale')
1509                         if timescale:
1510                             ms_info['timescale'] = int(timescale)
1511                         segment_duration = segment_template.get('duration')
1512                         if segment_duration:
1513                             ms_info['segment_duration'] = int(segment_duration)
1514                     media_template = segment_template.get('media')
1515                     if media_template:
1516                         ms_info['media_template'] = media_template
1517                     initialization = segment_template.get('initialization')
1518                     if initialization:
1519                         ms_info['initialization_url'] = initialization
1520                     else:
1521                         initialization = segment_template.find(_add_ns('Initialization'))
1522                         if initialization is not None:
1523                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1524             return ms_info
1525
1526         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1527         formats = []
1528         for period in mpd_doc.findall(_add_ns('Period')):
1529             period_duration = parse_duration(period.get('duration')) or mpd_duration
1530             period_ms_info = extract_multisegment_info(period, {
1531                 'start_number': 1,
1532                 'timescale': 1,
1533             })
1534             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1535                 if is_drm_protected(adaptation_set):
1536                     continue
1537                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1538                 for representation in adaptation_set.findall(_add_ns('Representation')):
1539                     if is_drm_protected(representation):
1540                         continue
1541                     representation_attrib = adaptation_set.attrib.copy()
1542                     representation_attrib.update(representation.attrib)
1543                     # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1544                     mime_type = representation_attrib['mimeType']
1545                     content_type = mime_type.split('/')[0]
1546                     if content_type == 'text':
1547                         # TODO implement WebVTT downloading
1548                         pass
1549                     elif content_type == 'video' or content_type == 'audio':
1550                         base_url = ''
1551                         for element in (representation, adaptation_set, period, mpd_doc):
1552                             base_url_e = element.find(_add_ns('BaseURL'))
1553                             if base_url_e is not None:
1554                                 base_url = base_url_e.text + base_url
1555                                 if re.match(r'^https?://', base_url):
1556                                     break
1557                         if mpd_base_url and not re.match(r'^https?://', base_url):
1558                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1559                                 mpd_base_url += '/'
1560                             base_url = mpd_base_url + base_url
1561                         representation_id = representation_attrib.get('id')
1562                         lang = representation_attrib.get('lang')
1563                         url_el = representation.find(_add_ns('BaseURL'))
1564                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1565                         f = {
1566                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1567                             'url': base_url,
1568                             'ext': mimetype2ext(mime_type),
1569                             'width': int_or_none(representation_attrib.get('width')),
1570                             'height': int_or_none(representation_attrib.get('height')),
1571                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1572                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1573                             'fps': int_or_none(representation_attrib.get('frameRate')),
1574                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1575                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1576                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1577                             'format_note': 'DASH %s' % content_type,
1578                             'filesize': filesize,
1579                         }
1580                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1581                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1582                             if 'total_number' not in representation_ms_info and 'segment_duration':
1583                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1584                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1585                             media_template = representation_ms_info['media_template']
1586                             media_template = media_template.replace('$RepresentationID$', representation_id)
1587                             media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
1588                             media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
1589                             media_template.replace('$$', '$')
1590                             representation_ms_info['segment_urls'] = [
1591                                 media_template % {
1592                                     'Number': segment_number,
1593                                     'Bandwidth': representation_attrib.get('bandwidth')}
1594                                 for segment_number in range(
1595                                     representation_ms_info['start_number'],
1596                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1597                         if 'segment_urls' in representation_ms_info:
1598                             f.update({
1599                                 'segment_urls': representation_ms_info['segment_urls'],
1600                                 'protocol': 'http_dash_segments',
1601                             })
1602                             if 'initialization_url' in representation_ms_info:
1603                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1604                                 f.update({
1605                                     'initialization_url': initialization_url,
1606                                 })
1607                                 if not f.get('url'):
1608                                     f['url'] = initialization_url
1609                         try:
1610                             existing_format = next(
1611                                 fo for fo in formats
1612                                 if fo['format_id'] == representation_id)
1613                         except StopIteration:
1614                             full_info = formats_dict.get(representation_id, {}).copy()
1615                             full_info.update(f)
1616                             formats.append(full_info)
1617                         else:
1618                             existing_format.update(f)
1619                     else:
1620                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1621         return formats
1622
1623     def _live_title(self, name):
1624         """ Generate the title for a live video """
1625         now = datetime.datetime.now()
1626         now_str = now.strftime('%Y-%m-%d %H:%M')
1627         return name + ' ' + now_str
1628
1629     def _int(self, v, name, fatal=False, **kwargs):
1630         res = int_or_none(v, **kwargs)
1631         if 'get_attr' in kwargs:
1632             print(getattr(v, kwargs['get_attr']))
1633         if res is None:
1634             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1635             if fatal:
1636                 raise ExtractorError(msg)
1637             else:
1638                 self._downloader.report_warning(msg)
1639         return res
1640
1641     def _float(self, v, name, fatal=False, **kwargs):
1642         res = float_or_none(v, **kwargs)
1643         if res is None:
1644             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1645             if fatal:
1646                 raise ExtractorError(msg)
1647             else:
1648                 self._downloader.report_warning(msg)
1649         return res
1650
1651     def _set_cookie(self, domain, name, value, expire_time=None):
1652         cookie = compat_cookiejar.Cookie(
1653             0, name, value, None, None, domain, None,
1654             None, '/', True, False, expire_time, '', None, None, None)
1655         self._downloader.cookiejar.set_cookie(cookie)
1656
1657     def _get_cookies(self, url):
1658         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1659         req = sanitized_Request(url)
1660         self._downloader.cookiejar.add_cookie_header(req)
1661         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1662
1663     def get_testcases(self, include_onlymatching=False):
1664         t = getattr(self, '_TEST', None)
1665         if t:
1666             assert not hasattr(self, '_TESTS'), \
1667                 '%s has _TEST and _TESTS' % type(self).__name__
1668             tests = [t]
1669         else:
1670             tests = getattr(self, '_TESTS', [])
1671         for t in tests:
1672             if not include_onlymatching and t.get('only_matching', False):
1673                 continue
1674             t['name'] = type(self).__name__[:-len('IE')]
1675             yield t
1676
1677     def is_suitable(self, age_limit):
1678         """ Test whether the extractor is generally suitable for the given
1679         age limit (i.e. pornographic sites are not, all others usually are) """
1680
1681         any_restricted = False
1682         for tc in self.get_testcases(include_onlymatching=False):
1683             if 'playlist' in tc:
1684                 tc = tc['playlist'][0]
1685             is_restricted = age_restricted(
1686                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1687             if not is_restricted:
1688                 return True
1689             any_restricted = any_restricted or is_restricted
1690         return not any_restricted
1691
1692     def extract_subtitles(self, *args, **kwargs):
1693         if (self._downloader.params.get('writesubtitles', False) or
1694                 self._downloader.params.get('listsubtitles')):
1695             return self._get_subtitles(*args, **kwargs)
1696         return {}
1697
1698     def _get_subtitles(self, *args, **kwargs):
1699         raise NotImplementedError('This method must be implemented by subclasses')
1700
1701     @staticmethod
1702     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1703         """ Merge subtitle items for one language. Items with duplicated URLs
1704         will be dropped. """
1705         list1_urls = set([item['url'] for item in subtitle_list1])
1706         ret = list(subtitle_list1)
1707         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1708         return ret
1709
1710     @classmethod
1711     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1712         """ Merge two subtitle dictionaries, language by language. """
1713         ret = dict(subtitle_dict1)
1714         for lang in subtitle_dict2:
1715             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1716         return ret
1717
1718     def extract_automatic_captions(self, *args, **kwargs):
1719         if (self._downloader.params.get('writeautomaticsub', False) or
1720                 self._downloader.params.get('listsubtitles')):
1721             return self._get_automatic_captions(*args, **kwargs)
1722         return {}
1723
1724     def _get_automatic_captions(self, *args, **kwargs):
1725         raise NotImplementedError('This method must be implemented by subclasses')
1726
1727     def mark_watched(self, *args, **kwargs):
1728         if (self._downloader.params.get('mark_watched', False) and
1729                 (self._get_login_info()[0] is not None or
1730                     self._downloader.params.get('cookiefile') is not None)):
1731             self._mark_watched(*args, **kwargs)
1732
1733     def _mark_watched(self, *args, **kwargs):
1734         raise NotImplementedError('This method must be implemented by subclasses')
1735
1736
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching '<_SEARCH_KEY><prefix>:<query>' search URLs."""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url matches this extractor's search URL pattern."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search URL and fetch the requested number of results.

        An empty prefix requests one result, 'all' requests _MAX_RESULTS,
        and a number requests that many, capped (with a warning) at
        _MAX_RESULTS.  Raises ExtractorError for a URL that does not match.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY