_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_urlencode,
  25     compat_urllib_request,
  26     compat_urlparse,
  27 )
  28 from ..downloader.f4m import remove_encrypted_media
  29 from ..utils import (
  30     NO_DEFAULT,
  31     age_restricted,
  32     bug_reports_message,
  33     clean_html,
  34     compiled_regex_type,
  35     determine_ext,
  36     error_to_compat_str,
  37     ExtractorError,
  38     fix_xml_ampersands,
  39     float_or_none,
  40     int_or_none,
  41     parse_iso8601,
  42     RegexNotFoundError,
  43     sanitize_filename,
  44     sanitized_Request,
  45     unescapeHTML,
  46     unified_strdate,
  47     url_basename,
  48     xpath_element,
  49     xpath_text,
  50     xpath_with_ns,
  51     determine_protocol,
  52     parse_duration,
  53     mimetype2ext,
  54     update_Request,
  55     update_url_query,
  56     parse_m3u8_attributes,
  57 )
  58
  59
  60 class InfoExtractor(object):
  61     """Information Extractor class.
  62
  63     Information extractors are the classes that, given a URL, extract
  64     information about the video (or videos) the URL refers to. This
  65     information includes the real video URL, the video title, author and
  66     others. The information is stored in a dictionary which is then
  67     passed to the YoutubeDL. The YoutubeDL processes this
  68     information possibly downloading the video to the file system, among
  69     other possible outcomes.
  70
  71     The type field determines the type of the result.
  72     By far the most common value (and the default if _type is missing) is
  73     "video", which indicates a single video.
  74
  75     For a video, the dictionaries must include the following fields:
  76
  77     id:             Video identifier.
  78     title:          Video title, unescaped.
  79
  80     Additionally, it must contain either a formats entry or a url one:
  81
  82     formats:        A list of dictionaries for each format available, ordered
  83                     from worst to best quality.
  84
  85                     Potential fields:
  86                     * url        Mandatory. The URL of the video file
  87                     * ext        Will be calculated from URL if missing
  88                     * format     A human-readable description of the format
  89                                  ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
  91                                  and format_note fields if missing.
  92                     * format_id  A short description of the format
  93                                  ("mp4_h264_opus" or "19").
  94                                 Technically optional, but strongly recommended.
  95                     * format_note Additional info about the format
  96                                  ("3D" or "DASH video")
  97                     * width      Width of the video, if known
  98                     * height     Height of the video, if known
  99                     * resolution Textual description of width and height
 100                     * tbr        Average bitrate of audio and video in KBit/s
 101                     * abr        Average audio bitrate in KBit/s
 102                     * acodec     Name of the audio codec in use
 103                     * asr        Audio sampling rate in Hertz
 104                     * vbr        Average video bitrate in KBit/s
 105                     * fps        Frame rate
 106                     * vcodec     Name of the video codec in use
 107                     * container  Name of the container format
 108                     * filesize   The number of bytes, if known in advance
 109                     * filesize_approx  An estimate for the number of bytes
 110                     * player_url SWF Player URL (used for rtmpdump).
 111                     * protocol   The protocol that will be used for the actual
 112                                  download, lower-case.
 113                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 114                                  "m3u8", "m3u8_native" or "http_dash_segments".
 115                     * preference Order number of this format. If this field is
 116                                  present and not None, the formats get sorted
 117                                  by this field, regardless of all other values.
 118                                  -1 for default (order by other properties),
 119                                  -2 or smaller for less than default.
 120                                  < -1000 to hide the format (if there is
 121                                     another one which is strictly better)
 122                     * language   Language code, e.g. "de" or "en-US".
 123                     * language_preference  Is this in the language mentioned in
 124                                  the URL?
 125                                  10 if it's what the URL is about,
 126                                  -1 for default (don't know),
 127                                  -10 otherwise, other values reserved for now.
 128                     * quality    Order number of the video quality of this
 129                                  format, irrespective of the file format.
 130                                  -1 for default (order by other properties),
 131                                  -2 or smaller for less than default.
 132                     * source_preference  Order number for this video source
 133                                   (quality takes higher priority)
 134                                  -1 for default (order by other properties),
 135                                  -2 or smaller for less than default.
 136                     * http_headers  A dictionary of additional HTTP headers
 137                                  to add to the request.
 138                     * stretched_ratio  If given and not 1, indicates that the
 139                                  video's pixels are not square.
 140                                  width : height ratio as float.
 141                     * no_resume  The server does not support resuming the
 142                                  (HTTP or RTMP) download. Boolean.
 143
 144     url:            Final video URL.
 145     ext:            Video filename extension.
 146     format:         The video format, defaults to ext (used for --get-format)
 147     player_url:     SWF Player URL (used for rtmpdump).
 148
 149     The following fields are optional:
 150
 151     alt_title:      A secondary title of the video.
 152     display_id      An alternative identifier for the video, not necessarily
 153                     unique, but available before title. Typically, id is
 154                     something like "4234987", title "Dancing naked mole rats",
 155                     and display_id "dancing-naked-mole-rats"
 156     thumbnails:     A list of dictionaries, with the following entries:
 157                         * "id" (optional, string) - Thumbnail format ID
 158                         * "url"
 159                         * "preference" (optional, int) - quality of the image
 160                         * "width" (optional, int)
 161                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
 163                                         deprecated)
 164     thumbnail:      Full URL to a video thumbnail image.
 165     description:    Full video description.
 166     uploader:       Full name of the video uploader.
 167     license:        License name the video is licensed under.
 168     creator:        The creator of the video.
 169     release_date:   The date (YYYYMMDD) when the video was released.
 170     timestamp:      UNIX timestamp of the moment the video became available.
 171     upload_date:    Video upload date (YYYYMMDD).
 172                     If not explicitly set, calculated from timestamp.
 173     uploader_id:    Nickname or id of the video uploader.
 174     uploader_url:   Full URL to a personal webpage of the video uploader.
 175     location:       Physical location where the video was filmed.
 176     subtitles:      The available subtitles as a dictionary in the format
 177                     {language: subformats}. "subformats" is a list sorted from
 178                     lower to higher preference, each element is a dictionary
 179                     with the "ext" entry and one of:
 180                         * "data": The subtitles file contents
 181                         * "url": A URL pointing to the subtitles file
 182                     "ext" will be calculated from URL if missing
 183     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 184                     automatically generated captions
 185     duration:       Length of the video in seconds, as an integer or float.
 186     view_count:     How many users have watched the video on the platform.
 187     like_count:     Number of positive ratings of the video
 188     dislike_count:  Number of negative ratings of the video
 189     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
 191     comment_count:  Number of comments on the video
 192     comments:       A list of comments, each with one or more of the following
 193                     properties (all but one of text or html optional):
 194                         * "author" - human-readable name of the comment author
 195                         * "author_id" - user ID of the comment author
 196                         * "id" - Comment ID
 197                         * "html" - Comment as HTML
 198                         * "text" - Plain text of the comment
 199                         * "timestamp" - UNIX timestamp of comment
 200                         * "parent" - ID of the comment this one is replying to.
 201                                      Set to "root" to indicate that this is a
 202                                      comment to the original video.
 203     age_limit:      Age restriction for the video, as an integer (years)
 204     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 205                     should allow to get the same result again. (It will be set
 206                     by YoutubeDL if it's missing)
 207     categories:     A list of categories that the video falls in, for example
 208                     ["Sports", "Berlin"]
 209     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 210     is_live:        True, False, or None (=unknown). Whether this video is a
 211                     live stream that goes on instead of a fixed-length video.
 212     start_time:     Time in seconds where the reproduction should start, as
 213                     specified in the URL.
 214     end_time:       Time in seconds where the reproduction should end, as
 215                     specified in the URL.
 216
 217     The following fields should only be used when the video belongs to some logical
 218     chapter or section:
 219
 220     chapter:        Name or title of the chapter the video belongs to.
 221     chapter_number: Number of the chapter the video belongs to, as an integer.
 222     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 223
 224     The following fields should only be used when the video is an episode of some
 225     series or programme:
 226
 227     series:         Title of the series or programme the video episode belongs to.
 228     season:         Title of the season the video episode belongs to.
 229     season_number:  Number of the season the video episode belongs to, as an integer.
 230     season_id:      Id of the season the video episode belongs to, as a unicode string.
 231     episode:        Title of the video episode. Unlike mandatory video title field,
 232                     this field should denote the exact title of the video episode
 233                     without any kind of decoration.
 234     episode_number: Number of the video episode within a season, as an integer.
 235     episode_id:     Id of the video episode, as a unicode string.
 236
 237     The following fields should only be used when the media is a track or a part of
 238     a music album:
 239
 240     track:          Title of the track.
 241     track_number:   Number of the track within an album or a disc, as an integer.
 242     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 243                     as a unicode string.
 244     artist:         Artist(s) of the track.
 245     genre:          Genre(s) of the track.
 246     album:          Title of the album the track belongs to.
 247     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 248     album_artist:   List of all artists appeared on the album (e.g.
 249                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 250                     and compilations).
 251     disc_number:    Number of the disc or other physical medium the track belongs to,
 252                     as an integer.
 253     release_year:   Year (YYYY) when the album was released.
 254
 255     Unless mentioned otherwise, the fields should be Unicode strings.
 256
 257     Unless mentioned otherwise, None is equivalent to absence of information.
 258
 259
 260     _type "playlist" indicates multiple videos.
 261     There must be a key "entries", which is a list, an iterable, or a PagedList
 262     object, each element of which is a valid dictionary by this specification.
 263
 264     Additionally, playlists can have "title", "description" and "id" attributes
 265     with the same semantics as videos (see above).
 266
 267
 268     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
 270     It must have an entries key like a playlist and contain all the keys
 271     required for a video at the same time.
 272
 273
 274     _type "url" indicates that the video must be extracted from another
 275     location, possibly by a different extractor. Its only required key is:
 276     "url" - the next URL to extract.
 277     The key "ie_key" can be set to the class name (minus the trailing "IE",
 278     e.g. "Youtube") if the extractor class is known in advance.
 279     Additionally, the dictionary may have any properties of the resolved entity
 280     known in advance, for example "title" if the title of the referred video is
 281     known ahead of time.
 282
 283
 284     _type "url_transparent" entities have the same specification as "url", but
 285     indicate that the given additional information is more precise than the one
 286     associated with the resolved URL.
 287     This is useful when a site employs a video service that hosts the video and
 288     its technical metadata, but that video service does not embed a useful
 289     title, description etc.
 290
 291
 292     Subclasses of this one should re-define the _real_initialize() and
 293     _real_extract() methods and define a _VALID_URL regexp.
 294     Probably, they should also be added to the list of extractors.
 295
 296     Finally, the _WORKING attribute should be set to False for broken IEs
 297     in order to warn the users and skip the tests.
 298     """
 299
    # True once _real_initialize() has completed for an instance; checked and
    # set by initialize().
    _ready = False
    # Downloader object used for network access and screen output; assigned
    # through set_downloader().
    _downloader = None
    # Returned by working(); set to False in subclasses for broken extractors.
    _WORKING = True
 303
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        The instance starts out not ready, so the next call to initialize()
        will run _real_initialize(). The downloader is stored through
        set_downloader().
        """
        self._ready = False
        self.set_downloader(downloader)
 308
 309     @classmethod
 310     def suitable(cls, url):
 311         """Receives a URL and returns True if suitable for this IE."""
 312
 313         # This does not use has/getattr intentionally - we want to know whether
 314         # we have cached the regexp for *this* class, whereas getattr would also
 315         # match the superclass
 316         if '_VALID_URL_RE' not in cls.__dict__:
 317             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 318         return cls._VALID_URL_RE.match(url) is not None
 319
 320     @classmethod
 321     def _match_id(cls, url):
 322         if '_VALID_URL_RE' not in cls.__dict__:
 323             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 324         m = cls._VALID_URL_RE.match(url)
 325         assert m
 326         return m.group('id')
 327
    @classmethod
    def working(cls):
        """Getter method for _WORKING.

        Returns the class attribute _WORKING, which is True unless a subclass
        sets it to False to mark the extractor as broken.
        """
        return cls._WORKING
 332
 333     def initialize(self):
 334         """Initializes an instance (authentication, etc)."""
 335         if not self._ready:
 336             self._real_initialize()
 337             self._ready = True
 338
    def extract(self, url):
        """Extracts URL information and returns the result of _real_extract().

        The instance is initialized first if necessary. Failures are
        normalized to ExtractorError:
          * ExtractorError is re-raised unchanged;
          * an incomplete HTTP read becomes an expected network error;
          * KeyError and StopIteration become a generic extractor error.
        Any other exception propagates as is.
        """
        try:
            self.initialize()
            return self._real_extract(url)
        except ExtractorError:
            # Already in the desired form; listed first so it is never wrapped.
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
 350
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        The object is stored as self._downloader; it may be None.
        """
        self._downloader = downloader
 354
 355     def _real_initialize(self):
 356         """Real initialization process. Redefine in subclasses."""
 357         pass
 358
 359     def _real_extract(self, url):
 360         """Real extraction process. Redefine in subclasses."""
 361         pass
 362
 363     @classmethod
 364     def ie_key(cls):
 365         """A string for getting the InfoExtractor with get_info_extractor"""
 366         return compat_str(cls.__name__[:-2])
 367
 368     @property
 369     def IE_NAME(self):
 370         return compat_str(type(self).__name__[:-2])
 371
 372     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 373         """ Returns the response handle """
 374         if note is None:
 375             self.report_download_webpage(video_id)
 376         elif note is not False:
 377             if video_id is None:
 378                 self.to_screen('%s' % (note,))
 379             else:
 380                 self.to_screen('%s: %s' % (video_id, note))
 381         if isinstance(url_or_request, compat_urllib_request.Request):
 382             url_or_request = update_Request(
 383                 url_or_request, data=data, headers=headers, query=query)
 384         else:
 385             if query:
 386                 url_or_request = update_url_query(url_or_request, query)
 387             if data is not None or headers:
 388                 url_or_request = sanitized_Request(url_or_request, data, headers)
 389         try:
 390             return self._downloader.urlopen(url_or_request)
 391         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 392             if errnote is False:
 393                 return False
 394             if errnote is None:
 395                 errnote = 'Unable to download webpage'
 396
 397             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 398             if fatal:
 399                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 400             else:
 401                 self._downloader.report_warning(errmsg)
 402                 return False
 403
 404     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 405         """ Returns a tuple (page content as string, URL handle) """
 406         # Strip hashes from the URL (#1038)
 407         if isinstance(url_or_request, (compat_str, str)):
 408             url_or_request = url_or_request.partition('#')[0]
 409
 410         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 411         if urlh is False:
 412             assert not fatal
 413             return False
 414         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 415         return (content, urlh)
 416
 417     @staticmethod
 418     def _guess_encoding_from_content(content_type, webpage_bytes):
 419         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 420         if m:
 421             encoding = m.group(1)
 422         else:
 423             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 424                           webpage_bytes[:1024])
 425             if m:
 426                 encoding = m.group(1).decode('ascii')
 427             elif webpage_bytes.startswith(b'\xff\xfe'):
 428                 encoding = 'utf-16'
 429             else:
 430                 encoding = 'utf-8'
 431
 432         return encoding
 433
 434     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 435         content_type = urlh.headers.get('Content-Type', '')
 436         webpage_bytes = urlh.read()
 437         if prefix is not None:
 438             webpage_bytes = prefix + webpage_bytes
 439         if not encoding:
 440             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 441         if self._downloader.params.get('dump_intermediate_pages', False):
 442             try:
 443                 url = url_or_request.get_full_url()
 444             except AttributeError:
 445                 url = url_or_request
 446             self.to_screen('Dumping request to ' + url)
 447             dump = base64.b64encode(webpage_bytes).decode('ascii')
 448             self._downloader.to_screen(dump)
 449         if self._downloader.params.get('write_pages', False):
 450             try:
 451                 url = url_or_request.get_full_url()
 452             except AttributeError:
 453                 url = url_or_request
 454             basen = '%s_%s' % (video_id, url)
 455             if len(basen) > 240:
 456                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 457                 basen = basen[:240 - len(h)] + h
 458             raw_filename = basen + '.dump'
 459             filename = sanitize_filename(raw_filename, restricted=True)
 460             self.to_screen('Saving request to ' + filename)
 461             # Working around MAX_PATH limitation on Windows (see
 462             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 463             if compat_os_name == 'nt':
 464                 absfilepath = os.path.abspath(filename)
 465                 if len(absfilepath) > 259:
 466                     filename = '\\\\?\\' + absfilepath
 467             with open(filename, 'wb') as outf:
 468                 outf.write(webpage_bytes)
 469
 470         try:
 471             content = webpage_bytes.decode(encoding, 'replace')
 472         except LookupError:
 473             content = webpage_bytes.decode('utf-8', 'replace')
 474
 475         if ('<title>Access to this site is blocked</title>' in content and
 476                 'Websense' in content[:512]):
 477             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 478             blocked_iframe = self._html_search_regex(
 479                 r'<iframe src="([^"]+)"', content,
 480                 'Websense information URL', default=None)
 481             if blocked_iframe:
 482                 msg += ' Visit %s for more details' % blocked_iframe
 483             raise ExtractorError(msg, expected=True)
 484         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 485             msg = (
 486                 'Access to this webpage has been blocked by Indian censorship. '
 487                 'Use a VPN or proxy server (with --proxy) to route around it.')
 488             block_msg = self._html_search_regex(
 489                 r'</h1><p>(.*?)</p>',
 490                 content, 'block message', default=None)
 491             if block_msg:
 492                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 493             raise ExtractorError(msg, expected=True)
 494
 495         return content
 496
 497     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 498         """ Returns the data of the page as a string """
 499         success = False
 500         try_count = 0
 501         while success is False:
 502             try:
 503                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 504                 success = True
 505             except compat_http_client.IncompleteRead as e:
 506                 try_count += 1
 507                 if try_count >= tries:
 508                     raise e
 509                 self._sleep(timeout, video_id)
 510         if res is False:
 511             return res
 512         else:
 513             content, _ = res
 514             return content
 515
 516     def _download_xml(self, url_or_request, video_id,
 517                       note='Downloading XML', errnote='Unable to download XML',
 518                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 519         """Return the xml as an xml.etree.ElementTree.Element"""
 520         xml_string = self._download_webpage(
 521             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 522         if xml_string is False:
 523             return xml_string
 524         if transform_source:
 525             xml_string = transform_source(xml_string)
 526         return compat_etree_fromstring(xml_string.encode('utf-8'))
 527
 528     def _download_json(self, url_or_request, video_id,
 529                        note='Downloading JSON metadata',
 530                        errnote='Unable to download JSON metadata',
 531                        transform_source=None,
 532                        fatal=True, encoding=None, data=None, headers={}, query={}):
 533         json_string = self._download_webpage(
 534             url_or_request, video_id, note, errnote, fatal=fatal,
 535             encoding=encoding, data=data, headers=headers, query=query)
 536         if (not fatal) and json_string is False:
 537             return None
 538         return self._parse_json(
 539             json_string, video_id, transform_source=transform_source, fatal=fatal)
 540
 541     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 542         if transform_source:
 543             json_string = transform_source(json_string)
 544         try:
 545             return json.loads(json_string)
 546         except ValueError as ve:
 547             errmsg = '%s: Failed to parse JSON ' % video_id
 548             if fatal:
 549                 raise ExtractorError(errmsg, cause=ve)
 550             else:
 551                 self.report_warning(errmsg + str(ve))
 552
 553     def report_warning(self, msg, video_id=None):
 554         idstr = '' if video_id is None else '%s: ' % video_id
 555         self._downloader.report_warning(
 556             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 557
 558     def to_screen(self, msg):
 559         """Print msg to screen, prefixing it with '[ie_name]'"""
 560         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 561
 562     def report_extraction(self, id_or_name):
 563         """Report information extraction."""
 564         self.to_screen('%s: Extracting information' % id_or_name)
 565
 566     def report_download_webpage(self, video_id):
 567         """Report webpage download."""
 568         self.to_screen('%s: Downloading webpage' % video_id)
 569
    def report_age_confirmation(self):
        """Report attempt to confirm age.

        Prints a fixed 'Confirming age' line through to_screen().
        """
        self.to_screen('Confirming age')
 573
    def report_login(self):
        """Report attempt to log in.

        Prints a fixed 'Logging in' line through to_screen().
        """
        self.to_screen('Logging in')
 577
 578     @staticmethod
 579     def raise_login_required(msg='This video is only available for registered users'):
 580         raise ExtractorError(
 581             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 582             expected=True)
 583
 584     @staticmethod
 585     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 586         raise ExtractorError(
 587             '%s. You might want to use --proxy to workaround.' % msg,
 588             expected=True)
 589
 590     # Methods for following #608
 591     @staticmethod
 592     def url_result(url, ie=None, video_id=None, video_title=None):
 593         """Returns a URL that points to a page that should be processed"""
 594         # TODO: ie should be the class used for getting the info
 595         video_info = {'_type': 'url',
 596                       'url': url,
 597                       'ie_key': ie}
 598         if video_id is not None:
 599             video_info['id'] = video_id
 600         if video_title is not None:
 601             video_info['title'] = video_title
 602         return video_info
 603
 604     @staticmethod
 605     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 606         """Returns a playlist"""
 607         video_info = {'_type': 'playlist',
 608                       'entries': entries}
 609         if playlist_id:
 610             video_info['id'] = playlist_id
 611         if playlist_title:
 612             video_info['title'] = playlist_title
 613         if playlist_description:
 614             video_info['description'] = playlist_description
 615         return video_info
 616
 617     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 618         """
 619         Perform a regex search on the given string, using a single or a list of
 620         patterns returning the first matching group.
 621         In case of failure return a default value or raise a WARNING or a
 622         RegexNotFoundError, depending on fatal, specifying the field name.
 623         """
 624         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 625             mobj = re.search(pattern, string, flags)
 626         else:
 627             for p in pattern:
 628                 mobj = re.search(p, string, flags)
 629                 if mobj:
 630                     break
 631
 632         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 633             _name = '\033[0;34m%s\033[0m' % name
 634         else:
 635             _name = name
 636
 637         if mobj:
 638             if group is None:
 639                 # return the first matching group
 640                 return next(g for g in mobj.groups() if g is not None)
 641             else:
 642                 return mobj.group(group)
 643         elif default is not NO_DEFAULT:
 644             return default
 645         elif fatal:
 646             raise RegexNotFoundError('Unable to extract %s' % _name)
 647         else:
 648             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 649             return None
 650
 651     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 652         """
 653         Like _search_regex, but strips HTML tags and unescapes entities.
 654         """
 655         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 656         if res:
 657             return clean_html(res).strip()
 658         else:
 659             return res
 660
 661     def _get_login_info(self):
 662         """
 663         Get the login info as (username, password)
 664         It will look in the netrc file using the _NETRC_MACHINE value
 665         If there's no info available, return (None, None)
 666         """
 667         if self._downloader is None:
 668             return (None, None)
 669
 670         username = None
 671         password = None
 672         downloader_params = self._downloader.params
 673
 674         # Attempt to use provided username and password or .netrc data
 675         if downloader_params.get('username') is not None:
 676             username = downloader_params['username']
 677             password = downloader_params['password']
 678         elif downloader_params.get('usenetrc', False):
 679             try:
 680                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 681                 if info is not None:
 682                     username = info[0]
 683                     password = info[2]
 684                 else:
 685                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 686             except (IOError, netrc.NetrcParseError) as err:
 687                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 688
 689         return (username, password)
 690
 691     def _get_tfa_info(self, note='two-factor verification code'):
 692         """
 693         Get the two-factor authentication info
 694         TODO - asking the user will be required for sms/phone verify
 695         currently just uses the command line option
 696         If there's no info available, return None
 697         """
 698         if self._downloader is None:
 699             return None
 700         downloader_params = self._downloader.params
 701
 702         if downloader_params.get('twofactor') is not None:
 703             return downloader_params['twofactor']
 704
 705         return compat_getpass('Type %s and press [Return]: ' % note)
 706
 707     # Helper functions for extracting OpenGraph info
 708     @staticmethod
 709     def _og_regexes(prop):
 710         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 711         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 712                        % {'prop': re.escape(prop)})
 713         template = r'<meta[^>]+?%s[^>]+?%s'
 714         return [
 715             template % (property_re, content_re),
 716             template % (content_re, property_re),
 717         ]
 718
 719     @staticmethod
 720     def _meta_regex(prop):
 721         return r'''(?isx)<meta
 722                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 723                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 724
 725     def _og_search_property(self, prop, html, name=None, **kargs):
 726         if name is None:
 727             name = 'OpenGraph %s' % prop
 728         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 729         if escaped is None:
 730             return None
 731         return unescapeHTML(escaped)
 732
 733     def _og_search_thumbnail(self, html, **kargs):
 734         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 735
 736     def _og_search_description(self, html, **kargs):
 737         return self._og_search_property('description', html, fatal=False, **kargs)
 738
 739     def _og_search_title(self, html, **kargs):
 740         return self._og_search_property('title', html, **kargs)
 741
 742     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 743         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 744         if secure:
 745             regexes = self._og_regexes('video:secure_url') + regexes
 746         return self._html_search_regex(regexes, html, name, **kargs)
 747
 748     def _og_search_url(self, html, **kargs):
 749         return self._og_search_property('url', html, **kargs)
 750
 751     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 752         if not isinstance(name, (list, tuple)):
 753             name = [name]
 754         if display_name is None:
 755             display_name = name[0]
 756         return self._html_search_regex(
 757             [self._meta_regex(n) for n in name],
 758             html, display_name, fatal=fatal, group='content', **kwargs)
 759
 760     def _dc_search_uploader(self, html):
 761         return self._html_search_meta('dc.creator', html, 'uploader')
 762
 763     def _rta_search(self, html):
 764         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 765         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 766                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 767                      html):
 768             return 18
 769         return 0
 770
 771     def _media_rating_search(self, html):
 772         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 773         rating = self._html_search_meta('rating', html)
 774
 775         if not rating:
 776             return None
 777
 778         RATING_TABLE = {
 779             'safe for kids': 0,
 780             'general': 8,
 781             '14 years': 14,
 782             'mature': 17,
 783             'restricted': 19,
 784         }
 785         return RATING_TABLE.get(rating.lower())
 786
 787     def _family_friendly_search(self, html):
 788         # See http://schema.org/VideoObject
 789         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 790
 791         if not family_friendly:
 792             return None
 793
 794         RATING_TABLE = {
 795             '1': 0,
 796             'true': 0,
 797             '0': 18,
 798             'false': 18,
 799         }
 800         return RATING_TABLE.get(family_friendly.lower())
 801
 802     def _twitter_search_player(self, html):
 803         return self._html_search_meta('twitter:player', html,
 804                                       'twitter card player')
 805
 806     def _search_json_ld(self, html, video_id, **kwargs):
 807         json_ld = self._search_regex(
 808             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 809             html, 'JSON-LD', group='json_ld', **kwargs)
 810         if not json_ld:
 811             return {}
 812         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 813
 814     def _json_ld(self, json_ld, video_id, fatal=True):
 815         if isinstance(json_ld, compat_str):
 816             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 817         if not json_ld:
 818             return {}
 819         info = {}
 820         if json_ld.get('@context') == 'http://schema.org':
 821             item_type = json_ld.get('@type')
 822             if item_type == 'TVEpisode':
 823                 info.update({
 824                     'episode': unescapeHTML(json_ld.get('name')),
 825                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 826                     'description': unescapeHTML(json_ld.get('description')),
 827                 })
 828                 part_of_season = json_ld.get('partOfSeason')
 829                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 830                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 831                 part_of_series = json_ld.get('partOfSeries')
 832                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 833                     info['series'] = unescapeHTML(part_of_series.get('name'))
 834             elif item_type == 'Article':
 835                 info.update({
 836                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 837                     'title': unescapeHTML(json_ld.get('headline')),
 838                     'description': unescapeHTML(json_ld.get('articleBody')),
 839                 })
 840         return dict((k, v) for k, v in info.items() if v is not None)
 841
 842     @staticmethod
 843     def _hidden_inputs(html):
 844         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 845         hidden_inputs = {}
 846         for input in re.findall(r'(?i)<input([^>]+)>', html):
 847             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 848                 continue
 849             name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input)
 850             if not name:
 851                 continue
 852             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 853             if not value:
 854                 continue
 855             hidden_inputs[name.group('value')] = value.group('value')
 856         return hidden_inputs
 857
 858     def _form_hidden_inputs(self, form_id, html):
 859         form = self._search_regex(
 860             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 861             html, '%s form' % form_id, group='form')
 862         return self._hidden_inputs(form)
 863
 864     def _sort_formats(self, formats, field_preference=None):
 865         if not formats:
 866             raise ExtractorError('No video formats found')
 867
 868         for f in formats:
 869             # Automatically determine tbr when missing based on abr and vbr (improves
 870             # formats sorting in some cases)
 871             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 872                 f['tbr'] = f['abr'] + f['vbr']
 873
 874         def _formats_key(f):
 875             # TODO remove the following workaround
 876             from ..utils import determine_ext
 877             if not f.get('ext') and 'url' in f:
 878                 f['ext'] = determine_ext(f['url'])
 879
 880             if isinstance(field_preference, (list, tuple)):
 881                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 882
 883             preference = f.get('preference')
 884             if preference is None:
 885                 preference = 0
 886                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 887                     preference -= 0.5
 888
 889             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 890
 891             if f.get('vcodec') == 'none':  # audio only
 892                 preference -= 50
 893                 if self._downloader.params.get('prefer_free_formats'):
 894                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 895                 else:
 896                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 897                 ext_preference = 0
 898                 try:
 899                     audio_ext_preference = ORDER.index(f['ext'])
 900                 except ValueError:
 901                     audio_ext_preference = -1
 902             else:
 903                 if f.get('acodec') == 'none':  # video only
 904                     preference -= 40
 905                 if self._downloader.params.get('prefer_free_formats'):
 906                     ORDER = ['flv', 'mp4', 'webm']
 907                 else:
 908                     ORDER = ['webm', 'flv', 'mp4']
 909                 try:
 910                     ext_preference = ORDER.index(f['ext'])
 911                 except ValueError:
 912                     ext_preference = -1
 913                 audio_ext_preference = 0
 914
 915             return (
 916                 preference,
 917                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 918                 f.get('quality') if f.get('quality') is not None else -1,
 919                 f.get('tbr') if f.get('tbr') is not None else -1,
 920                 f.get('filesize') if f.get('filesize') is not None else -1,
 921                 f.get('vbr') if f.get('vbr') is not None else -1,
 922                 f.get('height') if f.get('height') is not None else -1,
 923                 f.get('width') if f.get('width') is not None else -1,
 924                 proto_preference,
 925                 ext_preference,
 926                 f.get('abr') if f.get('abr') is not None else -1,
 927                 audio_ext_preference,
 928                 f.get('fps') if f.get('fps') is not None else -1,
 929                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 930                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 931                 f.get('format_id') if f.get('format_id') is not None else '',
 932             )
 933         formats.sort(key=_formats_key)
 934
 935     def _check_formats(self, formats, video_id):
 936         if formats:
 937             formats[:] = filter(
 938                 lambda f: self._is_valid_url(
 939                     f['url'], video_id,
 940                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 941                 formats)
 942
 943     @staticmethod
 944     def _remove_duplicate_formats(formats):
 945         format_urls = set()
 946         unique_formats = []
 947         for f in formats:
 948             if f['url'] not in format_urls:
 949                 format_urls.add(f['url'])
 950                 unique_formats.append(f)
 951         formats[:] = unique_formats
 952
 953     def _is_valid_url(self, url, video_id, item='video'):
 954         url = self._proto_relative_url(url, scheme='http:')
 955         # For now assume non HTTP(S) URLs always valid
 956         if not (url.startswith('http://') or url.startswith('https://')):
 957             return True
 958         try:
 959             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 960             return True
 961         except ExtractorError as e:
 962             if isinstance(e.cause, compat_urllib_error.URLError):
 963                 self.to_screen(
 964                     '%s: %s URL is invalid, skipping' % (video_id, item))
 965                 return False
 966             raise
 967
 968     def http_scheme(self):
 969         """ Either "http:" or "https:", depending on the user's preferences """
 970         return (
 971             'http:'
 972             if self._downloader.params.get('prefer_insecure', False)
 973             else 'https:')
 974
 975     def _proto_relative_url(self, url, scheme=None):
 976         if url is None:
 977             return url
 978         if url.startswith('//'):
 979             if scheme is None:
 980                 scheme = self.http_scheme()
 981             return scheme + url
 982         else:
 983             return url
 984
 985     def _sleep(self, timeout, video_id, msg_template=None):
 986         if msg_template is None:
 987             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 988         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 989         self.to_screen(msg)
 990         time.sleep(timeout)
 991
 992     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 993                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 994                              fatal=True, m3u8_id=None):
 995         manifest = self._download_xml(
 996             manifest_url, video_id, 'Downloading f4m manifest',
 997             'Unable to download f4m manifest',
 998             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 999             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1000             transform_source=transform_source,
1001             fatal=fatal)
1002
1003         if manifest is False:
1004             return []
1005
1006         return self._parse_f4m_formats(
1007             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1008             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1009
1010     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1011                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1012                            fatal=True, m3u8_id=None):
1013         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1014         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1015         if akamai_pv is not None and ';' in akamai_pv.text:
1016             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1017             if playerVerificationChallenge.strip() != '':
1018                 return []
1019
1020         formats = []
1021         manifest_version = '1.0'
1022         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1023         if not media_nodes:
1024             manifest_version = '2.0'
1025             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1026         # Remove unsupported DRM protected media from final formats
1027         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1028         media_nodes = remove_encrypted_media(media_nodes)
1029         if not media_nodes:
1030             return formats
1031         base_url = xpath_text(
1032             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1033             'base URL', default=None)
1034         if base_url:
1035             base_url = base_url.strip()
1036
1037         bootstrap_info = xpath_element(
1038             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1039             'bootstrap info', default=None)
1040
1041         for i, media_el in enumerate(media_nodes):
1042             tbr = int_or_none(media_el.attrib.get('bitrate'))
1043             width = int_or_none(media_el.attrib.get('width'))
1044             height = int_or_none(media_el.attrib.get('height'))
1045             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1046             # If <bootstrapInfo> is present, the specified f4m is a
1047             # stream-level manifest, and only set-level manifests may refer to
1048             # external resources.  See section 11.4 and section 4 of F4M spec
1049             if bootstrap_info is None:
1050                 media_url = None
1051                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1052                 if manifest_version == '2.0':
1053                     media_url = media_el.attrib.get('href')
1054                 if media_url is None:
1055                     media_url = media_el.attrib.get('url')
1056                 if not media_url:
1057                     continue
1058                 manifest_url = (
1059                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1060                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1061                 # If media_url is itself a f4m manifest do the recursive extraction
1062                 # since bitrates in parent manifest (this one) and media_url manifest
1063                 # may differ leading to inability to resolve the format by requested
1064                 # bitrate in f4m downloader
1065                 ext = determine_ext(manifest_url)
1066                 if ext == 'f4m':
1067                     f4m_formats = self._extract_f4m_formats(
1068                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1069                         transform_source=transform_source, fatal=fatal)
1070                     # Sometimes stream-level manifest contains single media entry that
1071                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1072                     # At the same time parent's media entry in set-level manifest may
1073                     # contain it. We will copy it from parent in such cases.
1074                     if len(f4m_formats) == 1:
1075                         f = f4m_formats[0]
1076                         f.update({
1077                             'tbr': f.get('tbr') or tbr,
1078                             'width': f.get('width') or width,
1079                             'height': f.get('height') or height,
1080                             'format_id': f.get('format_id') if not tbr else format_id,
1081                         })
1082                     formats.extend(f4m_formats)
1083                     continue
1084                 elif ext == 'm3u8':
1085                     formats.extend(self._extract_m3u8_formats(
1086                         manifest_url, video_id, 'mp4', preference=preference,
1087                         m3u8_id=m3u8_id, fatal=fatal))
1088                     continue
1089             formats.append({
1090                 'format_id': format_id,
1091                 'url': manifest_url,
1092                 'ext': 'flv' if bootstrap_info is not None else None,
1093                 'tbr': tbr,
1094                 'width': width,
1095                 'height': height,
1096                 'preference': preference,
1097             })
1098         return formats
1099
1100     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1101         return {
1102             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1103             'url': m3u8_url,
1104             'ext': ext,
1105             'protocol': 'm3u8',
1106             'preference': preference - 1 if preference else -1,
1107             'resolution': 'multiple',
1108             'format_note': 'Quality selection URL',
1109         }
1110
1111     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1112                               entry_protocol='m3u8', preference=None,
1113                               m3u8_id=None, note=None, errnote=None,
1114                               fatal=True, live=False):
1115
1116         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1117
1118         format_url = lambda u: (
1119             u
1120             if re.match(r'^https?://', u)
1121             else compat_urlparse.urljoin(m3u8_url, u))
1122
1123         res = self._download_webpage_handle(
1124             m3u8_url, video_id,
1125             note=note or 'Downloading m3u8 information',
1126             errnote=errnote or 'Failed to download m3u8 information',
1127             fatal=fatal)
1128         if res is False:
1129             return []
1130         m3u8_doc, urlh = res
1131         m3u8_url = urlh.geturl()
1132
1133         # We should try extracting formats only from master playlists [1], i.e.
1134         # playlists that describe available qualities. On the other hand media
1135         # playlists [2] should be returned as is since they contain just the media
1136         # without qualities renditions.
1137         # Fortunately, master playlist can be easily distinguished from media
1138         # playlist based on particular tags availability. As of [1, 2] master
1139         # playlist tags MUST NOT appear in a media playist and vice versa.
1140         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1141         # and MUST NOT appear in master playlist thus we can clearly detect media
1142         # playlist with this criterion.
1143         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1144         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1145         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1146         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1147             return [{
1148                 'url': m3u8_url,
1149                 'format_id': m3u8_id,
1150                 'ext': ext,
1151                 'protocol': entry_protocol,
1152                 'preference': preference,
1153             }]
1154         last_info = None
1155         last_media = None
1156         for line in m3u8_doc.splitlines():
1157             if line.startswith('#EXT-X-STREAM-INF:'):
1158                 last_info = parse_m3u8_attributes(line)
1159             elif line.startswith('#EXT-X-MEDIA:'):
1160                 last_media = parse_m3u8_attributes(line)
1161             elif line.startswith('#') or not line.strip():
1162                 continue
1163             else:
1164                 if last_info is None:
1165                     formats.append({'url': format_url(line)})
1166                     continue
1167                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1168                 format_id = []
1169                 if m3u8_id:
1170                     format_id.append(m3u8_id)
1171                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None
1172                 # Despite specification does not mention NAME attribute for
1173                 # EXT-X-STREAM-INF it still sometimes may be present
1174                 stream_name = last_info.get('NAME') or last_media_name
1175                 # Bandwidth of live streams may differ over time thus making
1176                 # format_id unpredictable. So it's better to keep provided
1177                 # format_id intact.
1178                 if not live:
1179                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1180                 f = {
1181                     'format_id': '-'.join(format_id),
1182                     'url': format_url(line.strip()),
1183                     'tbr': tbr,
1184                     'ext': ext,
1185                     'protocol': entry_protocol,
1186                     'preference': preference,
1187                 }
1188                 resolution = last_info.get('RESOLUTION')
1189                 if resolution:
1190                     width_str, height_str = resolution.split('x')
1191                     f['width'] = int(width_str)
1192                     f['height'] = int(height_str)
1193                 codecs = last_info.get('CODECS')
1194                 if codecs:
1195                     vcodec, acodec = [None] * 2
1196                     va_codecs = codecs.split(',')
1197                     if len(va_codecs) == 1:
1198                         # Audio only entries usually come with single codec and
1199                         # no resolution. For more robustness we also check it to
1200                         # be mp4 audio.
1201                         if not resolution and va_codecs[0].startswith('mp4a'):
1202                             vcodec, acodec = 'none', va_codecs[0]
1203                         else:
1204                             vcodec = va_codecs[0]
1205                     else:
1206                         vcodec, acodec = va_codecs[:2]
1207                     f.update({
1208                         'acodec': acodec,
1209                         'vcodec': vcodec,
1210                     })
1211                 if last_media is not None:
1212                     f['m3u8_media'] = last_media
1213                     last_media = None
1214                 formats.append(f)
1215                 last_info = {}
1216         return formats
1217
1218     @staticmethod
1219     def _xpath_ns(path, namespace=None):
1220         if not namespace:
1221             return path
1222         out = []
1223         for c in path.split('/'):
1224             if not c or c == '.':
1225                 out.append(c)
1226             else:
1227                 out.append('{%s}%s' % (namespace, c))
1228         return '/'.join(out)
1229
1230     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1231         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1232
1233         if smil is False:
1234             assert not fatal
1235             return []
1236
1237         namespace = self._parse_smil_namespace(smil)
1238
1239         return self._parse_smil_formats(
1240             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1241
1242     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1243         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1244         if smil is False:
1245             return {}
1246         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1247
1248     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1249         return self._download_xml(
1250             smil_url, video_id, 'Downloading SMIL file',
1251             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1252
1253     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1254         namespace = self._parse_smil_namespace(smil)
1255
1256         formats = self._parse_smil_formats(
1257             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1258         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1259
1260         video_id = os.path.splitext(url_basename(smil_url))[0]
1261         title = None
1262         description = None
1263         upload_date = None
1264         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1265             name = meta.attrib.get('name')
1266             content = meta.attrib.get('content')
1267             if not name or not content:
1268                 continue
1269             if not title and name == 'title':
1270                 title = content
1271             elif not description and name in ('description', 'abstract'):
1272                 description = content
1273             elif not upload_date and name == 'date':
1274                 upload_date = unified_strdate(content)
1275
1276         thumbnails = [{
1277             'id': image.get('type'),
1278             'url': image.get('src'),
1279             'width': int_or_none(image.get('width')),
1280             'height': int_or_none(image.get('height')),
1281         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1282
1283         return {
1284             'id': video_id,
1285             'title': title or video_id,
1286             'description': description,
1287             'upload_date': upload_date,
1288             'thumbnails': thumbnails,
1289             'formats': formats,
1290             'subtitles': subtitles,
1291         }
1292
1293     def _parse_smil_namespace(self, smil):
1294         return self._search_regex(
1295             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1296
1297     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1298         base = smil_url
1299         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1300             b = meta.get('base') or meta.get('httpBase')
1301             if b:
1302                 base = b
1303                 break
1304
1305         formats = []
1306         rtmp_count = 0
1307         http_count = 0
1308         m3u8_count = 0
1309
1310         srcs = []
1311         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1312         for medium in media:
1313             src = medium.get('src')
1314             if not src or src in srcs:
1315                 continue
1316             srcs.append(src)
1317
1318             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1319             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1320             width = int_or_none(medium.get('width'))
1321             height = int_or_none(medium.get('height'))
1322             proto = medium.get('proto')
1323             ext = medium.get('ext')
1324             src_ext = determine_ext(src)
1325             streamer = medium.get('streamer') or base
1326
1327             if proto == 'rtmp' or streamer.startswith('rtmp'):
1328                 rtmp_count += 1
1329                 formats.append({
1330                     'url': streamer,
1331                     'play_path': src,
1332                     'ext': 'flv',
1333                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1334                     'tbr': bitrate,
1335                     'filesize': filesize,
1336                     'width': width,
1337                     'height': height,
1338                 })
1339                 if transform_rtmp_url:
1340                     streamer, src = transform_rtmp_url(streamer, src)
1341                     formats[-1].update({
1342                         'url': streamer,
1343                         'play_path': src,
1344                     })
1345                 continue
1346
1347             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1348             src_url = src_url.strip()
1349
1350             if proto == 'm3u8' or src_ext == 'm3u8':
1351                 m3u8_formats = self._extract_m3u8_formats(
1352                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1353                 if len(m3u8_formats) == 1:
1354                     m3u8_count += 1
1355                     m3u8_formats[0].update({
1356                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1357                         'tbr': bitrate,
1358                         'width': width,
1359                         'height': height,
1360                     })
1361                 formats.extend(m3u8_formats)
1362                 continue
1363
1364             if src_ext == 'f4m':
1365                 f4m_url = src_url
1366                 if not f4m_params:
1367                     f4m_params = {
1368                         'hdcore': '3.2.0',
1369                         'plugin': 'flowplayer-3.2.0.1',
1370                     }
1371                 f4m_url += '&' if '?' in f4m_url else '?'
1372                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1373                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1374                 continue
1375
1376             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1377                 http_count += 1
1378                 formats.append({
1379                     'url': src_url,
1380                     'ext': ext or src_ext or 'flv',
1381                     'format_id': 'http-%d' % (bitrate or http_count),
1382                     'tbr': bitrate,
1383                     'filesize': filesize,
1384                     'width': width,
1385                     'height': height,
1386                 })
1387                 continue
1388
1389         return formats
1390
1391     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1392         urls = []
1393         subtitles = {}
1394         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1395             src = textstream.get('src')
1396             if not src or src in urls:
1397                 continue
1398             urls.append(src)
1399             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1400             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1401             subtitles.setdefault(lang, []).append({
1402                 'url': src,
1403                 'ext': ext,
1404             })
1405         return subtitles
1406
1407     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1408         xspf = self._download_xml(
1409             playlist_url, playlist_id, 'Downloading xpsf playlist',
1410             'Unable to download xspf manifest', fatal=fatal)
1411         if xspf is False:
1412             return []
1413         return self._parse_xspf(xspf, playlist_id)
1414
1415     def _parse_xspf(self, playlist, playlist_id):
1416         NS_MAP = {
1417             'xspf': 'http://xspf.org/ns/0/',
1418             's1': 'http://static.streamone.nl/player/ns/0',
1419         }
1420
1421         entries = []
1422         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1423             title = xpath_text(
1424                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1425             description = xpath_text(
1426                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1427             thumbnail = xpath_text(
1428                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1429             duration = float_or_none(
1430                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1431
1432             formats = [{
1433                 'url': location.text,
1434                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1435                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1436                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1437             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1438             self._sort_formats(formats)
1439
1440             entries.append({
1441                 'id': playlist_id,
1442                 'title': title,
1443                 'description': description,
1444                 'thumbnail': thumbnail,
1445                 'duration': duration,
1446                 'formats': formats,
1447             })
1448         return entries
1449
1450     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1451         res = self._download_webpage_handle(
1452             mpd_url, video_id,
1453             note=note or 'Downloading MPD manifest',
1454             errnote=errnote or 'Failed to download MPD manifest',
1455             fatal=fatal)
1456         if res is False:
1457             return []
1458         mpd, urlh = res
1459         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1460
1461         return self._parse_mpd_formats(
1462             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1463
1464     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1465         if mpd_doc.get('type') == 'dynamic':
1466             return []
1467
1468         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1469
1470         def _add_ns(path):
1471             return self._xpath_ns(path, namespace)
1472
1473         def is_drm_protected(element):
1474             return element.find(_add_ns('ContentProtection')) is not None
1475
1476         def extract_multisegment_info(element, ms_parent_info):
1477             ms_info = ms_parent_info.copy()
1478             segment_list = element.find(_add_ns('SegmentList'))
1479             if segment_list is not None:
1480                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1481                 if segment_urls_e:
1482                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1483                 initialization = segment_list.find(_add_ns('Initialization'))
1484                 if initialization is not None:
1485                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1486             else:
1487                 segment_template = element.find(_add_ns('SegmentTemplate'))
1488                 if segment_template is not None:
1489                     start_number = segment_template.get('startNumber')
1490                     if start_number:
1491                         ms_info['start_number'] = int(start_number)
1492                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1493                     if segment_timeline is not None:
1494                         s_e = segment_timeline.findall(_add_ns('S'))
1495                         if s_e:
1496                             ms_info['total_number'] = 0
1497                             for s in s_e:
1498                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1499                     else:
1500                         timescale = segment_template.get('timescale')
1501                         if timescale:
1502                             ms_info['timescale'] = int(timescale)
1503                         segment_duration = segment_template.get('duration')
1504                         if segment_duration:
1505                             ms_info['segment_duration'] = int(segment_duration)
1506                     media_template = segment_template.get('media')
1507                     if media_template:
1508                         ms_info['media_template'] = media_template
1509                     initialization = segment_template.get('initialization')
1510                     if initialization:
1511                         ms_info['initialization_url'] = initialization
1512                     else:
1513                         initialization = segment_template.find(_add_ns('Initialization'))
1514                         if initialization is not None:
1515                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1516             return ms_info
1517
1518         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1519         formats = []
1520         for period in mpd_doc.findall(_add_ns('Period')):
1521             period_duration = parse_duration(period.get('duration')) or mpd_duration
1522             period_ms_info = extract_multisegment_info(period, {
1523                 'start_number': 1,
1524                 'timescale': 1,
1525             })
1526             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1527                 if is_drm_protected(adaptation_set):
1528                     continue
1529                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1530                 for representation in adaptation_set.findall(_add_ns('Representation')):
1531                     if is_drm_protected(representation):
1532                         continue
1533                     representation_attrib = adaptation_set.attrib.copy()
1534                     representation_attrib.update(representation.attrib)
1535                     # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1536                     mime_type = representation_attrib['mimeType']
1537                     content_type = mime_type.split('/')[0]
1538                     if content_type == 'text':
1539                         # TODO implement WebVTT downloading
1540                         pass
1541                     elif content_type == 'video' or content_type == 'audio':
1542                         base_url = ''
1543                         for element in (representation, adaptation_set, period, mpd_doc):
1544                             base_url_e = element.find(_add_ns('BaseURL'))
1545                             if base_url_e is not None:
1546                                 base_url = base_url_e.text + base_url
1547                                 if re.match(r'^https?://', base_url):
1548                                     break
1549                         if mpd_base_url and not re.match(r'^https?://', base_url):
1550                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1551                                 mpd_base_url += '/'
1552                             base_url = mpd_base_url + base_url
1553                         representation_id = representation_attrib.get('id')
1554                         lang = representation_attrib.get('lang')
1555                         url_el = representation.find(_add_ns('BaseURL'))
1556                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1557                         f = {
1558                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1559                             'url': base_url,
1560                             'ext': mimetype2ext(mime_type),
1561                             'width': int_or_none(representation_attrib.get('width')),
1562                             'height': int_or_none(representation_attrib.get('height')),
1563                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1564                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1565                             'fps': int_or_none(representation_attrib.get('frameRate')),
1566                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1567                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1568                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1569                             'format_note': 'DASH %s' % content_type,
1570                             'filesize': filesize,
1571                         }
1572                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1573                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1574                             if 'total_number' not in representation_ms_info and 'segment_duration':
1575                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1576                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1577                             media_template = representation_ms_info['media_template']
1578                             media_template = media_template.replace('$RepresentationID$', representation_id)
1579                             media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
1580                             media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
1581                             media_template.replace('$$', '$')
1582                             representation_ms_info['segment_urls'] = [
1583                                 media_template % {
1584                                     'Number': segment_number,
1585                                     'Bandwidth': representation_attrib.get('bandwidth')}
1586                                 for segment_number in range(
1587                                     representation_ms_info['start_number'],
1588                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1589                         if 'segment_urls' in representation_ms_info:
1590                             f.update({
1591                                 'segment_urls': representation_ms_info['segment_urls'],
1592                                 'protocol': 'http_dash_segments',
1593                             })
1594                             if 'initialization_url' in representation_ms_info:
1595                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1596                                 f.update({
1597                                     'initialization_url': initialization_url,
1598                                 })
1599                                 if not f.get('url'):
1600                                     f['url'] = initialization_url
1601                         try:
1602                             existing_format = next(
1603                                 fo for fo in formats
1604                                 if fo['format_id'] == representation_id)
1605                         except StopIteration:
1606                             full_info = formats_dict.get(representation_id, {}).copy()
1607                             full_info.update(f)
1608                             formats.append(full_info)
1609                         else:
1610                             existing_format.update(f)
1611                     else:
1612                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1613         return formats
1614
1615     def _live_title(self, name):
1616         """ Generate the title for a live video """
1617         now = datetime.datetime.now()
1618         now_str = now.strftime('%Y-%m-%d %H:%M')
1619         return name + ' ' + now_str
1620
1621     def _int(self, v, name, fatal=False, **kwargs):
1622         res = int_or_none(v, **kwargs)
1623         if 'get_attr' in kwargs:
1624             print(getattr(v, kwargs['get_attr']))
1625         if res is None:
1626             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1627             if fatal:
1628                 raise ExtractorError(msg)
1629             else:
1630                 self._downloader.report_warning(msg)
1631         return res
1632
1633     def _float(self, v, name, fatal=False, **kwargs):
1634         res = float_or_none(v, **kwargs)
1635         if res is None:
1636             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1637             if fatal:
1638                 raise ExtractorError(msg)
1639             else:
1640                 self._downloader.report_warning(msg)
1641         return res
1642
1643     def _set_cookie(self, domain, name, value, expire_time=None):
1644         cookie = compat_cookiejar.Cookie(
1645             0, name, value, None, None, domain, None,
1646             None, '/', True, False, expire_time, '', None, None, None)
1647         self._downloader.cookiejar.set_cookie(cookie)
1648
1649     def _get_cookies(self, url):
1650         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1651         req = sanitized_Request(url)
1652         self._downloader.cookiejar.add_cookie_header(req)
1653         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1654
1655     def get_testcases(self, include_onlymatching=False):
1656         t = getattr(self, '_TEST', None)
1657         if t:
1658             assert not hasattr(self, '_TESTS'), \
1659                 '%s has _TEST and _TESTS' % type(self).__name__
1660             tests = [t]
1661         else:
1662             tests = getattr(self, '_TESTS', [])
1663         for t in tests:
1664             if not include_onlymatching and t.get('only_matching', False):
1665                 continue
1666             t['name'] = type(self).__name__[:-len('IE')]
1667             yield t
1668
1669     def is_suitable(self, age_limit):
1670         """ Test whether the extractor is generally suitable for the given
1671         age limit (i.e. pornographic sites are not, all others usually are) """
1672
1673         any_restricted = False
1674         for tc in self.get_testcases(include_onlymatching=False):
1675             if 'playlist' in tc:
1676                 tc = tc['playlist'][0]
1677             is_restricted = age_restricted(
1678                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1679             if not is_restricted:
1680                 return True
1681             any_restricted = any_restricted or is_restricted
1682         return not any_restricted
1683
1684     def extract_subtitles(self, *args, **kwargs):
1685         if (self._downloader.params.get('writesubtitles', False) or
1686                 self._downloader.params.get('listsubtitles')):
1687             return self._get_subtitles(*args, **kwargs)
1688         return {}
1689
1690     def _get_subtitles(self, *args, **kwargs):
1691         raise NotImplementedError('This method must be implemented by subclasses')
1692
1693     @staticmethod
1694     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1695         """ Merge subtitle items for one language. Items with duplicated URLs
1696         will be dropped. """
1697         list1_urls = set([item['url'] for item in subtitle_list1])
1698         ret = list(subtitle_list1)
1699         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1700         return ret
1701
1702     @classmethod
1703     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1704         """ Merge two subtitle dictionaries, language by language. """
1705         ret = dict(subtitle_dict1)
1706         for lang in subtitle_dict2:
1707             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1708         return ret
1709
1710     def extract_automatic_captions(self, *args, **kwargs):
1711         if (self._downloader.params.get('writeautomaticsub', False) or
1712                 self._downloader.params.get('listsubtitles')):
1713             return self._get_automatic_captions(*args, **kwargs)
1714         return {}
1715
1716     def _get_automatic_captions(self, *args, **kwargs):
1717         raise NotImplementedError('This method must be implemented by subclasses')
1718
1719     def mark_watched(self, *args, **kwargs):
1720         if (self._downloader.params.get('mark_watched', False) and
1721                 (self._get_login_info()[0] is not None or
1722                     self._downloader.params.get('cookiefile') is not None)):
1723             self._mark_watched(*args, **kwargs)
1724
1725     def _mark_watched(self, *args, **kwargs):
1726         raise NotImplementedError('This method must be implemented by subclasses')
1727
1728
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex for this extractor's search pseudo-URLs, with
        the named groups 'prefix' and 'query'."""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url matches the search pseudo-URL pattern."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS and a
        number for that many, capped at _MAX_RESULTS with a warning.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix'), match.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY