_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_urlencode,
  25     compat_urllib_request,
  26     compat_urlparse,
  27 )
  28 from ..downloader.f4m import remove_encrypted_media
  29 from ..utils import (
  30     NO_DEFAULT,
  31     age_restricted,
  32     bug_reports_message,
  33     clean_html,
  34     compiled_regex_type,
  35     determine_ext,
  36     error_to_compat_str,
  37     ExtractorError,
  38     fix_xml_ampersands,
  39     float_or_none,
  40     int_or_none,
  41     parse_iso8601,
  42     RegexNotFoundError,
  43     sanitize_filename,
  44     sanitized_Request,
  45     unescapeHTML,
  46     unified_strdate,
  47     unified_timestamp,
  48     url_basename,
  49     xpath_element,
  50     xpath_text,
  51     xpath_with_ns,
  52     determine_protocol,
  53     parse_duration,
  54     mimetype2ext,
  55     update_Request,
  56     update_url_query,
  57     parse_m3u8_attributes,
  58 )
  59
  60
  61 class InfoExtractor(object):
  62     """Information Extractor class.
  63
  64     Information extractors are the classes that, given a URL, extract
  65     information about the video (or videos) the URL refers to. This
  66     information includes the real video URL, the video title, author and
  67     others. The information is stored in a dictionary which is then
  68     passed to the YoutubeDL. The YoutubeDL processes this
  69     information possibly downloading the video to the file system, among
  70     other possible outcomes.
  71
  72     The type field determines the type of the result.
  73     By far the most common value (and the default if _type is missing) is
  74     "video", which indicates a single video.
  75
  76     For a video, the dictionaries must include the following fields:
  77
  78     id:             Video identifier.
  79     title:          Video title, unescaped.
  80
  81     Additionally, it must contain either a formats entry or a url one:
  82
  83     formats:        A list of dictionaries for each format available, ordered
  84                     from worst to best quality.
  85
  86                     Potential fields:
  87                     * url        Mandatory. The URL of the video file
  88                     * ext        Will be calculated from URL if missing
  89                     * format     A human-readable description of the format
  90                                  ("mp4 container with h264/opus").
   91                                  Calculated from the format_id, width, height,
  92                                  and format_note fields if missing.
  93                     * format_id  A short description of the format
  94                                  ("mp4_h264_opus" or "19").
  95                                 Technically optional, but strongly recommended.
  96                     * format_note Additional info about the format
  97                                  ("3D" or "DASH video")
  98                     * width      Width of the video, if known
  99                     * height     Height of the video, if known
 100                     * resolution Textual description of width and height
 101                     * tbr        Average bitrate of audio and video in KBit/s
 102                     * abr        Average audio bitrate in KBit/s
 103                     * acodec     Name of the audio codec in use
 104                     * asr        Audio sampling rate in Hertz
 105                     * vbr        Average video bitrate in KBit/s
 106                     * fps        Frame rate
 107                     * vcodec     Name of the video codec in use
 108                     * container  Name of the container format
 109                     * filesize   The number of bytes, if known in advance
 110                     * filesize_approx  An estimate for the number of bytes
 111                     * player_url SWF Player URL (used for rtmpdump).
 112                     * protocol   The protocol that will be used for the actual
 113                                  download, lower-case.
 114                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 115                                  "m3u8", "m3u8_native" or "http_dash_segments".
 116                     * preference Order number of this format. If this field is
 117                                  present and not None, the formats get sorted
 118                                  by this field, regardless of all other values.
 119                                  -1 for default (order by other properties),
 120                                  -2 or smaller for less than default.
 121                                  < -1000 to hide the format (if there is
 122                                     another one which is strictly better)
 123                     * language   Language code, e.g. "de" or "en-US".
 124                     * language_preference  Is this in the language mentioned in
 125                                  the URL?
 126                                  10 if it's what the URL is about,
 127                                  -1 for default (don't know),
 128                                  -10 otherwise, other values reserved for now.
 129                     * quality    Order number of the video quality of this
 130                                  format, irrespective of the file format.
 131                                  -1 for default (order by other properties),
 132                                  -2 or smaller for less than default.
 133                     * source_preference  Order number for this video source
 134                                   (quality takes higher priority)
 135                                  -1 for default (order by other properties),
 136                                  -2 or smaller for less than default.
 137                     * http_headers  A dictionary of additional HTTP headers
 138                                  to add to the request.
 139                     * stretched_ratio  If given and not 1, indicates that the
 140                                  video's pixels are not square.
 141                                  width : height ratio as float.
 142                     * no_resume  The server does not support resuming the
 143                                  (HTTP or RTMP) download. Boolean.
 144
 145     url:            Final video URL.
 146     ext:            Video filename extension.
 147     format:         The video format, defaults to ext (used for --get-format)
 148     player_url:     SWF Player URL (used for rtmpdump).
 149
 150     The following fields are optional:
 151
 152     alt_title:      A secondary title of the video.
 153     display_id      An alternative identifier for the video, not necessarily
 154                     unique, but available before title. Typically, id is
 155                     something like "4234987", title "Dancing naked mole rats",
 156                     and display_id "dancing-naked-mole-rats"
 157     thumbnails:     A list of dictionaries, with the following entries:
 158                         * "id" (optional, string) - Thumbnail format ID
 159                         * "url"
 160                         * "preference" (optional, int) - quality of the image
 161                         * "width" (optional, int)
 162                         * "height" (optional, int)
  163                         * "resolution" (optional, string "{width}x{height}",
 164                                         deprecated)
 165                         * "filesize" (optional, int)
 166     thumbnail:      Full URL to a video thumbnail image.
 167     description:    Full video description.
 168     uploader:       Full name of the video uploader.
 169     license:        License name the video is licensed under.
 170     creator:        The creator of the video.
 171     release_date:   The date (YYYYMMDD) when the video was released.
 172     timestamp:      UNIX timestamp of the moment the video became available.
 173     upload_date:    Video upload date (YYYYMMDD).
 174                     If not explicitly set, calculated from timestamp.
 175     uploader_id:    Nickname or id of the video uploader.
 176     uploader_url:   Full URL to a personal webpage of the video uploader.
 177     location:       Physical location where the video was filmed.
 178     subtitles:      The available subtitles as a dictionary in the format
 179                     {language: subformats}. "subformats" is a list sorted from
 180                     lower to higher preference, each element is a dictionary
 181                     with the "ext" entry and one of:
 182                         * "data": The subtitles file contents
 183                         * "url": A URL pointing to the subtitles file
 184                     "ext" will be calculated from URL if missing
 185     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 186                     automatically generated captions
 187     duration:       Length of the video in seconds, as an integer or float.
 188     view_count:     How many users have watched the video on the platform.
 189     like_count:     Number of positive ratings of the video
 190     dislike_count:  Number of negative ratings of the video
 191     repost_count:   Number of reposts of the video
  192     average_rating: Average rating given by users, the scale used depends on the webpage
 193     comment_count:  Number of comments on the video
 194     comments:       A list of comments, each with one or more of the following
 195                     properties (all but one of text or html optional):
 196                         * "author" - human-readable name of the comment author
 197                         * "author_id" - user ID of the comment author
 198                         * "id" - Comment ID
 199                         * "html" - Comment as HTML
 200                         * "text" - Plain text of the comment
 201                         * "timestamp" - UNIX timestamp of comment
 202                         * "parent" - ID of the comment this one is replying to.
 203                                      Set to "root" to indicate that this is a
 204                                      comment to the original video.
 205     age_limit:      Age restriction for the video, as an integer (years)
 206     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 207                     should allow to get the same result again. (It will be set
 208                     by YoutubeDL if it's missing)
 209     categories:     A list of categories that the video falls in, for example
 210                     ["Sports", "Berlin"]
 211     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 212     is_live:        True, False, or None (=unknown). Whether this video is a
 213                     live stream that goes on instead of a fixed-length video.
 214     start_time:     Time in seconds where the reproduction should start, as
 215                     specified in the URL.
 216     end_time:       Time in seconds where the reproduction should end, as
 217                     specified in the URL.
 218
 219     The following fields should only be used when the video belongs to some logical
 220     chapter or section:
 221
 222     chapter:        Name or title of the chapter the video belongs to.
 223     chapter_number: Number of the chapter the video belongs to, as an integer.
 224     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 225
 226     The following fields should only be used when the video is an episode of some
 227     series or programme:
 228
 229     series:         Title of the series or programme the video episode belongs to.
 230     season:         Title of the season the video episode belongs to.
 231     season_number:  Number of the season the video episode belongs to, as an integer.
 232     season_id:      Id of the season the video episode belongs to, as a unicode string.
 233     episode:        Title of the video episode. Unlike mandatory video title field,
 234                     this field should denote the exact title of the video episode
 235                     without any kind of decoration.
 236     episode_number: Number of the video episode within a season, as an integer.
 237     episode_id:     Id of the video episode, as a unicode string.
 238
 239     The following fields should only be used when the media is a track or a part of
 240     a music album:
 241
 242     track:          Title of the track.
 243     track_number:   Number of the track within an album or a disc, as an integer.
 244     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 245                     as a unicode string.
 246     artist:         Artist(s) of the track.
 247     genre:          Genre(s) of the track.
 248     album:          Title of the album the track belongs to.
 249     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 250     album_artist:   List of all artists appeared on the album (e.g.
 251                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 252                     and compilations).
 253     disc_number:    Number of the disc or other physical medium the track belongs to,
 254                     as an integer.
 255     release_year:   Year (YYYY) when the album was released.
 256
 257     Unless mentioned otherwise, the fields should be Unicode strings.
 258
 259     Unless mentioned otherwise, None is equivalent to absence of information.
 260
 261
 262     _type "playlist" indicates multiple videos.
 263     There must be a key "entries", which is a list, an iterable, or a PagedList
 264     object, each element of which is a valid dictionary by this specification.
 265
 266     Additionally, playlists can have "title", "description" and "id" attributes
 267     with the same semantics as videos (see above).
 268
 269
 270     _type "multi_video" indicates that there are multiple videos that
  271     form a single show, for example multiple acts of an opera or TV episode.
 272     It must have an entries key like a playlist and contain all the keys
 273     required for a video at the same time.
 274
 275
 276     _type "url" indicates that the video must be extracted from another
 277     location, possibly by a different extractor. Its only required key is:
 278     "url" - the next URL to extract.
 279     The key "ie_key" can be set to the class name (minus the trailing "IE",
 280     e.g. "Youtube") if the extractor class is known in advance.
 281     Additionally, the dictionary may have any properties of the resolved entity
 282     known in advance, for example "title" if the title of the referred video is
 283     known ahead of time.
 284
 285
 286     _type "url_transparent" entities have the same specification as "url", but
 287     indicate that the given additional information is more precise than the one
 288     associated with the resolved URL.
 289     This is useful when a site employs a video service that hosts the video and
 290     its technical metadata, but that video service does not embed a useful
 291     title, description etc.
 292
 293
 294     Subclasses of this one should re-define the _real_initialize() and
 295     _real_extract() methods and define a _VALID_URL regexp.
 296     Probably, they should also be added to the list of extractors.
 297
 298     Finally, the _WORKING attribute should be set to False for broken IEs
 299     in order to warn the users and skip the tests.
 300     """
 301
    # Set to True by initialize() once _real_initialize() has run
    _ready = False
    # Downloader object, assigned through set_downloader()
    _downloader = None
    # Value returned by working(); set to False in IEs that are broken
    _WORKING = True
 305
 306     def __init__(self, downloader=None):
 307         """Constructor. Receives an optional downloader."""
 308         self._ready = False
 309         self.set_downloader(downloader)
 310
 311     @classmethod
 312     def suitable(cls, url):
 313         """Receives a URL and returns True if suitable for this IE."""
 314
 315         # This does not use has/getattr intentionally - we want to know whether
 316         # we have cached the regexp for *this* class, whereas getattr would also
 317         # match the superclass
 318         if '_VALID_URL_RE' not in cls.__dict__:
 319             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 320         return cls._VALID_URL_RE.match(url) is not None
 321
 322     @classmethod
 323     def _match_id(cls, url):
 324         if '_VALID_URL_RE' not in cls.__dict__:
 325             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 326         m = cls._VALID_URL_RE.match(url)
 327         assert m
 328         return m.group('id')
 329
 330     @classmethod
 331     def working(cls):
 332         """Getter method for _WORKING."""
 333         return cls._WORKING
 334
 335     def initialize(self):
 336         """Initializes an instance (authentication, etc)."""
 337         if not self._ready:
 338             self._real_initialize()
 339             self._ready = True
 340
 341     def extract(self, url):
 342         """Extracts URL information and returns it in list of dicts."""
 343         try:
 344             self.initialize()
 345             return self._real_extract(url)
 346         except ExtractorError:
 347             raise
 348         except compat_http_client.IncompleteRead as e:
 349             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 350         except (KeyError, StopIteration) as e:
 351             raise ExtractorError('An extractor error has occurred.', cause=e)
 352
 353     def set_downloader(self, downloader):
 354         """Sets the downloader for this IE."""
 355         self._downloader = downloader
 356
 357     def _real_initialize(self):
 358         """Real initialization process. Redefine in subclasses."""
 359         pass
 360
 361     def _real_extract(self, url):
 362         """Real extraction process. Redefine in subclasses."""
 363         pass
 364
 365     @classmethod
 366     def ie_key(cls):
 367         """A string for getting the InfoExtractor with get_info_extractor"""
 368         return compat_str(cls.__name__[:-2])
 369
 370     @property
 371     def IE_NAME(self):
 372         return compat_str(type(self).__name__[:-2])
 373
 374     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 375         """ Returns the response handle """
 376         if note is None:
 377             self.report_download_webpage(video_id)
 378         elif note is not False:
 379             if video_id is None:
 380                 self.to_screen('%s' % (note,))
 381             else:
 382                 self.to_screen('%s: %s' % (video_id, note))
 383         if isinstance(url_or_request, compat_urllib_request.Request):
 384             url_or_request = update_Request(
 385                 url_or_request, data=data, headers=headers, query=query)
 386         else:
 387             if query:
 388                 url_or_request = update_url_query(url_or_request, query)
 389             if data is not None or headers:
 390                 url_or_request = sanitized_Request(url_or_request, data, headers)
 391         try:
 392             return self._downloader.urlopen(url_or_request)
 393         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 394             if errnote is False:
 395                 return False
 396             if errnote is None:
 397                 errnote = 'Unable to download webpage'
 398
 399             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 400             if fatal:
 401                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 402             else:
 403                 self._downloader.report_warning(errmsg)
 404                 return False
 405
 406     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 407         """ Returns a tuple (page content as string, URL handle) """
 408         # Strip hashes from the URL (#1038)
 409         if isinstance(url_or_request, (compat_str, str)):
 410             url_or_request = url_or_request.partition('#')[0]
 411
 412         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 413         if urlh is False:
 414             assert not fatal
 415             return False
 416         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 417         return (content, urlh)
 418
 419     @staticmethod
 420     def _guess_encoding_from_content(content_type, webpage_bytes):
 421         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 422         if m:
 423             encoding = m.group(1)
 424         else:
 425             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 426                           webpage_bytes[:1024])
 427             if m:
 428                 encoding = m.group(1).decode('ascii')
 429             elif webpage_bytes.startswith(b'\xff\xfe'):
 430                 encoding = 'utf-16'
 431             else:
 432                 encoding = 'utf-8'
 433
 434         return encoding
 435
 436     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 437         content_type = urlh.headers.get('Content-Type', '')
 438         webpage_bytes = urlh.read()
 439         if prefix is not None:
 440             webpage_bytes = prefix + webpage_bytes
 441         if not encoding:
 442             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 443         if self._downloader.params.get('dump_intermediate_pages', False):
 444             try:
 445                 url = url_or_request.get_full_url()
 446             except AttributeError:
 447                 url = url_or_request
 448             self.to_screen('Dumping request to ' + url)
 449             dump = base64.b64encode(webpage_bytes).decode('ascii')
 450             self._downloader.to_screen(dump)
 451         if self._downloader.params.get('write_pages', False):
 452             try:
 453                 url = url_or_request.get_full_url()
 454             except AttributeError:
 455                 url = url_or_request
 456             basen = '%s_%s' % (video_id, url)
 457             if len(basen) > 240:
 458                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 459                 basen = basen[:240 - len(h)] + h
 460             raw_filename = basen + '.dump'
 461             filename = sanitize_filename(raw_filename, restricted=True)
 462             self.to_screen('Saving request to ' + filename)
 463             # Working around MAX_PATH limitation on Windows (see
 464             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 465             if compat_os_name == 'nt':
 466                 absfilepath = os.path.abspath(filename)
 467                 if len(absfilepath) > 259:
 468                     filename = '\\\\?\\' + absfilepath
 469             with open(filename, 'wb') as outf:
 470                 outf.write(webpage_bytes)
 471
 472         try:
 473             content = webpage_bytes.decode(encoding, 'replace')
 474         except LookupError:
 475             content = webpage_bytes.decode('utf-8', 'replace')
 476
 477         if ('<title>Access to this site is blocked</title>' in content and
 478                 'Websense' in content[:512]):
 479             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 480             blocked_iframe = self._html_search_regex(
 481                 r'<iframe src="([^"]+)"', content,
 482                 'Websense information URL', default=None)
 483             if blocked_iframe:
 484                 msg += ' Visit %s for more details' % blocked_iframe
 485             raise ExtractorError(msg, expected=True)
 486         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 487             msg = (
 488                 'Access to this webpage has been blocked by Indian censorship. '
 489                 'Use a VPN or proxy server (with --proxy) to route around it.')
 490             block_msg = self._html_search_regex(
 491                 r'</h1><p>(.*?)</p>',
 492                 content, 'block message', default=None)
 493             if block_msg:
 494                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 495             raise ExtractorError(msg, expected=True)
 496
 497         return content
 498
 499     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 500         """ Returns the data of the page as a string """
 501         success = False
 502         try_count = 0
 503         while success is False:
 504             try:
 505                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 506                 success = True
 507             except compat_http_client.IncompleteRead as e:
 508                 try_count += 1
 509                 if try_count >= tries:
 510                     raise e
 511                 self._sleep(timeout, video_id)
 512         if res is False:
 513             return res
 514         else:
 515             content, _ = res
 516             return content
 517
 518     def _download_xml(self, url_or_request, video_id,
 519                       note='Downloading XML', errnote='Unable to download XML',
 520                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 521         """Return the xml as an xml.etree.ElementTree.Element"""
 522         xml_string = self._download_webpage(
 523             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 524         if xml_string is False:
 525             return xml_string
 526         if transform_source:
 527             xml_string = transform_source(xml_string)
 528         return compat_etree_fromstring(xml_string.encode('utf-8'))
 529
 530     def _download_json(self, url_or_request, video_id,
 531                        note='Downloading JSON metadata',
 532                        errnote='Unable to download JSON metadata',
 533                        transform_source=None,
 534                        fatal=True, encoding=None, data=None, headers={}, query={}):
 535         json_string = self._download_webpage(
 536             url_or_request, video_id, note, errnote, fatal=fatal,
 537             encoding=encoding, data=data, headers=headers, query=query)
 538         if (not fatal) and json_string is False:
 539             return None
 540         return self._parse_json(
 541             json_string, video_id, transform_source=transform_source, fatal=fatal)
 542
 543     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 544         if transform_source:
 545             json_string = transform_source(json_string)
 546         try:
 547             return json.loads(json_string)
 548         except ValueError as ve:
 549             errmsg = '%s: Failed to parse JSON ' % video_id
 550             if fatal:
 551                 raise ExtractorError(errmsg, cause=ve)
 552             else:
 553                 self.report_warning(errmsg + str(ve))
 554
 555     def report_warning(self, msg, video_id=None):
 556         idstr = '' if video_id is None else '%s: ' % video_id
 557         self._downloader.report_warning(
 558             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 559
 560     def to_screen(self, msg):
 561         """Print msg to screen, prefixing it with '[ie_name]'"""
 562         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 563
 564     def report_extraction(self, id_or_name):
 565         """Report information extraction."""
 566         self.to_screen('%s: Extracting information' % id_or_name)
 567
 568     def report_download_webpage(self, video_id):
 569         """Report webpage download."""
 570         self.to_screen('%s: Downloading webpage' % video_id)
 571
 572     def report_age_confirmation(self):
 573         """Report attempt to confirm age."""
 574         self.to_screen('Confirming age')
 575
 576     def report_login(self):
 577         """Report attempt to log in."""
 578         self.to_screen('Logging in')
 579
 580     @staticmethod
 581     def raise_login_required(msg='This video is only available for registered users'):
 582         raise ExtractorError(
 583             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 584             expected=True)
 585
 586     @staticmethod
 587     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 588         raise ExtractorError(
 589             '%s. You might want to use --proxy to workaround.' % msg,
 590             expected=True)
 591
 592     # Methods for following #608
 593     @staticmethod
 594     def url_result(url, ie=None, video_id=None, video_title=None):
 595         """Returns a URL that points to a page that should be processed"""
 596         # TODO: ie should be the class used for getting the info
 597         video_info = {'_type': 'url',
 598                       'url': url,
 599                       'ie_key': ie}
 600         if video_id is not None:
 601             video_info['id'] = video_id
 602         if video_title is not None:
 603             video_info['title'] = video_title
 604         return video_info
 605
 606     @staticmethod
 607     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 608         """Returns a playlist"""
 609         video_info = {'_type': 'playlist',
 610                       'entries': entries}
 611         if playlist_id:
 612             video_info['id'] = playlist_id
 613         if playlist_title:
 614             video_info['title'] = playlist_title
 615         if playlist_description:
 616             video_info['description'] = playlist_description
 617         return video_info
 618
 619     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 620         """
 621         Perform a regex search on the given string, using a single or a list of
 622         patterns returning the first matching group.
 623         In case of failure return a default value or raise a WARNING or a
 624         RegexNotFoundError, depending on fatal, specifying the field name.
 625         """
 626         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 627             mobj = re.search(pattern, string, flags)
 628         else:
 629             for p in pattern:
 630                 mobj = re.search(p, string, flags)
 631                 if mobj:
 632                     break
 633
 634         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 635             _name = '\033[0;34m%s\033[0m' % name
 636         else:
 637             _name = name
 638
 639         if mobj:
 640             if group is None:
 641                 # return the first matching group
 642                 return next(g for g in mobj.groups() if g is not None)
 643             else:
 644                 return mobj.group(group)
 645         elif default is not NO_DEFAULT:
 646             return default
 647         elif fatal:
 648             raise RegexNotFoundError('Unable to extract %s' % _name)
 649         else:
 650             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 651             return None
 652
 653     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 654         """
 655         Like _search_regex, but strips HTML tags and unescapes entities.
 656         """
 657         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 658         if res:
 659             return clean_html(res).strip()
 660         else:
 661             return res
 662
 663     def _get_login_info(self):
 664         """
 665         Get the login info as (username, password)
 666         It will look in the netrc file using the _NETRC_MACHINE value
 667         If there's no info available, return (None, None)
 668         """
 669         if self._downloader is None:
 670             return (None, None)
 671
 672         username = None
 673         password = None
 674         downloader_params = self._downloader.params
 675
 676         # Attempt to use provided username and password or .netrc data
 677         if downloader_params.get('username') is not None:
 678             username = downloader_params['username']
 679             password = downloader_params['password']
 680         elif downloader_params.get('usenetrc', False):
 681             try:
 682                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 683                 if info is not None:
 684                     username = info[0]
 685                     password = info[2]
 686                 else:
 687                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 688             except (IOError, netrc.NetrcParseError) as err:
 689                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 690
 691         return (username, password)
 692
 693     def _get_tfa_info(self, note='two-factor verification code'):
 694         """
 695         Get the two-factor authentication info
 696         TODO - asking the user will be required for sms/phone verify
 697         currently just uses the command line option
 698         If there's no info available, return None
 699         """
 700         if self._downloader is None:
 701             return None
 702         downloader_params = self._downloader.params
 703
 704         if downloader_params.get('twofactor') is not None:
 705             return downloader_params['twofactor']
 706
 707         return compat_getpass('Type %s and press [Return]: ' % note)
 708
 709     # Helper functions for extracting OpenGraph info
 710     @staticmethod
 711     def _og_regexes(prop):
 712         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 713         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 714                        % {'prop': re.escape(prop)})
 715         template = r'<meta[^>]+?%s[^>]+?%s'
 716         return [
 717             template % (property_re, content_re),
 718             template % (content_re, property_re),
 719         ]
 720
 721     @staticmethod
 722     def _meta_regex(prop):
 723         return r'''(?isx)<meta
 724                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 725                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 726
 727     def _og_search_property(self, prop, html, name=None, **kargs):
 728         if name is None:
 729             name = 'OpenGraph %s' % prop
 730         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 731         if escaped is None:
 732             return None
 733         return unescapeHTML(escaped)
 734
 735     def _og_search_thumbnail(self, html, **kargs):
 736         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 737
 738     def _og_search_description(self, html, **kargs):
 739         return self._og_search_property('description', html, fatal=False, **kargs)
 740
 741     def _og_search_title(self, html, **kargs):
 742         return self._og_search_property('title', html, **kargs)
 743
 744     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 745         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 746         if secure:
 747             regexes = self._og_regexes('video:secure_url') + regexes
 748         return self._html_search_regex(regexes, html, name, **kargs)
 749
 750     def _og_search_url(self, html, **kargs):
 751         return self._og_search_property('url', html, **kargs)
 752
 753     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 754         if not isinstance(name, (list, tuple)):
 755             name = [name]
 756         if display_name is None:
 757             display_name = name[0]
 758         return self._html_search_regex(
 759             [self._meta_regex(n) for n in name],
 760             html, display_name, fatal=fatal, group='content', **kwargs)
 761
 762     def _dc_search_uploader(self, html):
 763         return self._html_search_meta('dc.creator', html, 'uploader')
 764
 765     def _rta_search(self, html):
 766         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 767         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 768                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 769                      html):
 770             return 18
 771         return 0
 772
 773     def _media_rating_search(self, html):
 774         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 775         rating = self._html_search_meta('rating', html)
 776
 777         if not rating:
 778             return None
 779
 780         RATING_TABLE = {
 781             'safe for kids': 0,
 782             'general': 8,
 783             '14 years': 14,
 784             'mature': 17,
 785             'restricted': 19,
 786         }
 787         return RATING_TABLE.get(rating.lower())
 788
 789     def _family_friendly_search(self, html):
 790         # See http://schema.org/VideoObject
 791         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 792
 793         if not family_friendly:
 794             return None
 795
 796         RATING_TABLE = {
 797             '1': 0,
 798             'true': 0,
 799             '0': 18,
 800             'false': 18,
 801         }
 802         return RATING_TABLE.get(family_friendly.lower())
 803
 804     def _twitter_search_player(self, html):
 805         return self._html_search_meta('twitter:player', html,
 806                                       'twitter card player')
 807
 808     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 809         json_ld = self._search_regex(
 810             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 811             html, 'JSON-LD', group='json_ld', **kwargs)
 812         if not json_ld:
 813             return {}
 814         return self._json_ld(
 815             json_ld, video_id, fatal=kwargs.get('fatal', True),
 816             expected_type=expected_type)
 817
 818     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 819         if isinstance(json_ld, compat_str):
 820             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 821         if not json_ld:
 822             return {}
 823         info = {}
 824         if json_ld.get('@context') == 'http://schema.org':
 825             item_type = json_ld.get('@type')
 826             if expected_type is not None and expected_type != item_type:
 827                 return info
 828             if item_type == 'TVEpisode':
 829                 info.update({
 830                     'episode': unescapeHTML(json_ld.get('name')),
 831                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 832                     'description': unescapeHTML(json_ld.get('description')),
 833                 })
 834                 part_of_season = json_ld.get('partOfSeason')
 835                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 836                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 837                 part_of_series = json_ld.get('partOfSeries')
 838                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 839                     info['series'] = unescapeHTML(part_of_series.get('name'))
 840             elif item_type == 'Article':
 841                 info.update({
 842                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 843                     'title': unescapeHTML(json_ld.get('headline')),
 844                     'description': unescapeHTML(json_ld.get('articleBody')),
 845                 })
 846             elif item_type == 'VideoObject':
 847                 info.update({
 848                     'url': json_ld.get('contentUrl'),
 849                     'title': unescapeHTML(json_ld.get('name')),
 850                     'description': unescapeHTML(json_ld.get('description')),
 851                     'thumbnail': json_ld.get('thumbnailUrl'),
 852                     'duration': parse_duration(json_ld.get('duration')),
 853                     'timestamp': unified_timestamp(json_ld.get('uploadDate')),
 854                     'filesize': float_or_none(json_ld.get('contentSize')),
 855                     'tbr': int_or_none(json_ld.get('bitrate')),
 856                     'width': int_or_none(json_ld.get('width')),
 857                     'height': int_or_none(json_ld.get('height')),
 858                 })
 859         return dict((k, v) for k, v in info.items() if v is not None)
 860
 861     @staticmethod
 862     def _hidden_inputs(html):
 863         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 864         hidden_inputs = {}
 865         for input in re.findall(r'(?i)<input([^>]+)>', html):
 866             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 867                 continue
 868             name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input)
 869             if not name:
 870                 continue
 871             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 872             if not value:
 873                 continue
 874             hidden_inputs[name.group('value')] = value.group('value')
 875         return hidden_inputs
 876
 877     def _form_hidden_inputs(self, form_id, html):
 878         form = self._search_regex(
 879             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 880             html, '%s form' % form_id, group='form')
 881         return self._hidden_inputs(form)
 882
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place, in ascending order of the computed sort key.

        formats          -- list of format dicts; missing 'tbr' (when both
                            'abr' and 'vbr' are set) and missing 'ext' (when
                            'url' is set) are filled in as a side effect
        field_preference -- optional list/tuple of field names; when given,
                            the sort key is made of just these fields, in
                            that order

        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Explicit field list: missing values sort lowest
            # (-1, or '' for the string valued format_id)
            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Plain HTTP(S) is slightly preferred over any other protocol
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            # The position in ORDER is used as the extension preference, so
            # later entries rank higher; unknown extensions get -1
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuple elements are compared left to right, so earlier entries
            # dominate later ones; missing numeric fields count as -1
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 957
 958     def _check_formats(self, formats, video_id):
 959         if formats:
 960             formats[:] = filter(
 961                 lambda f: self._is_valid_url(
 962                     f['url'], video_id,
 963                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 964                 formats)
 965
 966     @staticmethod
 967     def _remove_duplicate_formats(formats):
 968         format_urls = set()
 969         unique_formats = []
 970         for f in formats:
 971             if f['url'] not in format_urls:
 972                 format_urls.add(f['url'])
 973                 unique_formats.append(f)
 974         formats[:] = unique_formats
 975
 976     def _is_valid_url(self, url, video_id, item='video'):
 977         url = self._proto_relative_url(url, scheme='http:')
 978         # For now assume non HTTP(S) URLs always valid
 979         if not (url.startswith('http://') or url.startswith('https://')):
 980             return True
 981         try:
 982             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 983             return True
 984         except ExtractorError as e:
 985             if isinstance(e.cause, compat_urllib_error.URLError):
 986                 self.to_screen(
 987                     '%s: %s URL is invalid, skipping' % (video_id, item))
 988                 return False
 989             raise
 990
 991     def http_scheme(self):
 992         """ Either "http:" or "https:", depending on the user's preferences """
 993         return (
 994             'http:'
 995             if self._downloader.params.get('prefer_insecure', False)
 996             else 'https:')
 997
 998     def _proto_relative_url(self, url, scheme=None):
 999         if url is None:
1000             return url
1001         if url.startswith('//'):
1002             if scheme is None:
1003                 scheme = self.http_scheme()
1004             return scheme + url
1005         else:
1006             return url
1007
1008     def _sleep(self, timeout, video_id, msg_template=None):
1009         if msg_template is None:
1010             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1011         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1012         self.to_screen(msg)
1013         time.sleep(timeout)
1014
1015     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1016                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1017                              fatal=True, m3u8_id=None):
1018         manifest = self._download_xml(
1019             manifest_url, video_id, 'Downloading f4m manifest',
1020             'Unable to download f4m manifest',
1021             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1022             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1023             transform_source=transform_source,
1024             fatal=fatal)
1025
1026         if manifest is False:
1027             return []
1028
1029         return self._parse_f4m_formats(
1030             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1031             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1032
1033     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1034                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1035                            fatal=True, m3u8_id=None):
1036         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1037         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1038         if akamai_pv is not None and ';' in akamai_pv.text:
1039             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1040             if playerVerificationChallenge.strip() != '':
1041                 return []
1042
1043         formats = []
1044         manifest_version = '1.0'
1045         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1046         if not media_nodes:
1047             manifest_version = '2.0'
1048             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1049         # Remove unsupported DRM protected media from final formats
1050         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1051         media_nodes = remove_encrypted_media(media_nodes)
1052         if not media_nodes:
1053             return formats
1054         base_url = xpath_text(
1055             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1056             'base URL', default=None)
1057         if base_url:
1058             base_url = base_url.strip()
1059
1060         bootstrap_info = xpath_element(
1061             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1062             'bootstrap info', default=None)
1063
1064         for i, media_el in enumerate(media_nodes):
1065             tbr = int_or_none(media_el.attrib.get('bitrate'))
1066             width = int_or_none(media_el.attrib.get('width'))
1067             height = int_or_none(media_el.attrib.get('height'))
1068             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1069             # If <bootstrapInfo> is present, the specified f4m is a
1070             # stream-level manifest, and only set-level manifests may refer to
1071             # external resources.  See section 11.4 and section 4 of F4M spec
1072             if bootstrap_info is None:
1073                 media_url = None
1074                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1075                 if manifest_version == '2.0':
1076                     media_url = media_el.attrib.get('href')
1077                 if media_url is None:
1078                     media_url = media_el.attrib.get('url')
1079                 if not media_url:
1080                     continue
1081                 manifest_url = (
1082                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1083                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1084                 # If media_url is itself a f4m manifest do the recursive extraction
1085                 # since bitrates in parent manifest (this one) and media_url manifest
1086                 # may differ leading to inability to resolve the format by requested
1087                 # bitrate in f4m downloader
1088                 ext = determine_ext(manifest_url)
1089                 if ext == 'f4m':
1090                     f4m_formats = self._extract_f4m_formats(
1091                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1092                         transform_source=transform_source, fatal=fatal)
1093                     # Sometimes stream-level manifest contains single media entry that
1094                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1095                     # At the same time parent's media entry in set-level manifest may
1096                     # contain it. We will copy it from parent in such cases.
1097                     if len(f4m_formats) == 1:
1098                         f = f4m_formats[0]
1099                         f.update({
1100                             'tbr': f.get('tbr') or tbr,
1101                             'width': f.get('width') or width,
1102                             'height': f.get('height') or height,
1103                             'format_id': f.get('format_id') if not tbr else format_id,
1104                         })
1105                     formats.extend(f4m_formats)
1106                     continue
1107                 elif ext == 'm3u8':
1108                     formats.extend(self._extract_m3u8_formats(
1109                         manifest_url, video_id, 'mp4', preference=preference,
1110                         m3u8_id=m3u8_id, fatal=fatal))
1111                     continue
1112             formats.append({
1113                 'format_id': format_id,
1114                 'url': manifest_url,
1115                 'ext': 'flv' if bootstrap_info is not None else None,
1116                 'tbr': tbr,
1117                 'width': width,
1118                 'height': height,
1119                 'preference': preference,
1120             })
1121         return formats
1122
1123     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1124         return {
1125             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1126             'url': m3u8_url,
1127             'ext': ext,
1128             'protocol': 'm3u8',
1129             'preference': preference - 1 if preference else -1,
1130             'resolution': 'multiple',
1131             'format_note': 'Quality selection URL',
1132         }
1133
1134     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1135                               entry_protocol='m3u8', preference=None,
1136                               m3u8_id=None, note=None, errnote=None,
1137                               fatal=True, live=False):
1138
1139         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1140
1141         format_url = lambda u: (
1142             u
1143             if re.match(r'^https?://', u)
1144             else compat_urlparse.urljoin(m3u8_url, u))
1145
1146         res = self._download_webpage_handle(
1147             m3u8_url, video_id,
1148             note=note or 'Downloading m3u8 information',
1149             errnote=errnote or 'Failed to download m3u8 information',
1150             fatal=fatal)
1151         if res is False:
1152             return []
1153         m3u8_doc, urlh = res
1154         m3u8_url = urlh.geturl()
1155
1156         # We should try extracting formats only from master playlists [1], i.e.
1157         # playlists that describe available qualities. On the other hand media
1158         # playlists [2] should be returned as is since they contain just the media
1159         # without qualities renditions.
1160         # Fortunately, master playlist can be easily distinguished from media
1161         # playlist based on particular tags availability. As of [1, 2] master
1162         # playlist tags MUST NOT appear in a media playist and vice versa.
1163         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1164         # and MUST NOT appear in master playlist thus we can clearly detect media
1165         # playlist with this criterion.
1166         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1167         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1168         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1169         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1170             return [{
1171                 'url': m3u8_url,
1172                 'format_id': m3u8_id,
1173                 'ext': ext,
1174                 'protocol': entry_protocol,
1175                 'preference': preference,
1176             }]
1177         last_info = None
1178         last_media = None
1179         for line in m3u8_doc.splitlines():
1180             if line.startswith('#EXT-X-STREAM-INF:'):
1181                 last_info = parse_m3u8_attributes(line)
1182             elif line.startswith('#EXT-X-MEDIA:'):
1183                 last_media = parse_m3u8_attributes(line)
1184             elif line.startswith('#') or not line.strip():
1185                 continue
1186             else:
1187                 if last_info is None:
1188                     formats.append({'url': format_url(line)})
1189                     continue
1190                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1191                 format_id = []
1192                 if m3u8_id:
1193                     format_id.append(m3u8_id)
1194                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None
1195                 # Despite specification does not mention NAME attribute for
1196                 # EXT-X-STREAM-INF it still sometimes may be present
1197                 stream_name = last_info.get('NAME') or last_media_name
1198                 # Bandwidth of live streams may differ over time thus making
1199                 # format_id unpredictable. So it's better to keep provided
1200                 # format_id intact.
1201                 if not live:
1202                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1203                 f = {
1204                     'format_id': '-'.join(format_id),
1205                     'url': format_url(line.strip()),
1206                     'tbr': tbr,
1207                     'ext': ext,
1208                     'protocol': entry_protocol,
1209                     'preference': preference,
1210                 }
1211                 resolution = last_info.get('RESOLUTION')
1212                 if resolution:
1213                     width_str, height_str = resolution.split('x')
1214                     f['width'] = int(width_str)
1215                     f['height'] = int(height_str)
1216                 codecs = last_info.get('CODECS')
1217                 if codecs:
1218                     vcodec, acodec = [None] * 2
1219                     va_codecs = codecs.split(',')
1220                     if len(va_codecs) == 1:
1221                         # Audio only entries usually come with single codec and
1222                         # no resolution. For more robustness we also check it to
1223                         # be mp4 audio.
1224                         if not resolution and va_codecs[0].startswith('mp4a'):
1225                             vcodec, acodec = 'none', va_codecs[0]
1226                         else:
1227                             vcodec = va_codecs[0]
1228                     else:
1229                         vcodec, acodec = va_codecs[:2]
1230                     f.update({
1231                         'acodec': acodec,
1232                         'vcodec': vcodec,
1233                     })
1234                 if last_media is not None:
1235                     f['m3u8_media'] = last_media
1236                     last_media = None
1237                 formats.append(f)
1238                 last_info = {}
1239         return formats
1240
1241     @staticmethod
1242     def _xpath_ns(path, namespace=None):
1243         if not namespace:
1244             return path
1245         out = []
1246         for c in path.split('/'):
1247             if not c or c == '.':
1248                 out.append(c)
1249             else:
1250                 out.append('{%s}%s' % (namespace, c))
1251         return '/'.join(out)
1252
1253     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1254         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1255
1256         if smil is False:
1257             assert not fatal
1258             return []
1259
1260         namespace = self._parse_smil_namespace(smil)
1261
1262         return self._parse_smil_formats(
1263             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1264
1265     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1266         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1267         if smil is False:
1268             return {}
1269         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1270
1271     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1272         return self._download_xml(
1273             smil_url, video_id, 'Downloading SMIL file',
1274             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1275
1276     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1277         namespace = self._parse_smil_namespace(smil)
1278
1279         formats = self._parse_smil_formats(
1280             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1281         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1282
1283         video_id = os.path.splitext(url_basename(smil_url))[0]
1284         title = None
1285         description = None
1286         upload_date = None
1287         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1288             name = meta.attrib.get('name')
1289             content = meta.attrib.get('content')
1290             if not name or not content:
1291                 continue
1292             if not title and name == 'title':
1293                 title = content
1294             elif not description and name in ('description', 'abstract'):
1295                 description = content
1296             elif not upload_date and name == 'date':
1297                 upload_date = unified_strdate(content)
1298
1299         thumbnails = [{
1300             'id': image.get('type'),
1301             'url': image.get('src'),
1302             'width': int_or_none(image.get('width')),
1303             'height': int_or_none(image.get('height')),
1304         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1305
1306         return {
1307             'id': video_id,
1308             'title': title or video_id,
1309             'description': description,
1310             'upload_date': upload_date,
1311             'thumbnails': thumbnails,
1312             'formats': formats,
1313             'subtitles': subtitles,
1314         }
1315
1316     def _parse_smil_namespace(self, smil):
1317         return self._search_regex(
1318             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1319
1320     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1321         base = smil_url
1322         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1323             b = meta.get('base') or meta.get('httpBase')
1324             if b:
1325                 base = b
1326                 break
1327
1328         formats = []
1329         rtmp_count = 0
1330         http_count = 0
1331         m3u8_count = 0
1332
1333         srcs = []
1334         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1335         for medium in media:
1336             src = medium.get('src')
1337             if not src or src in srcs:
1338                 continue
1339             srcs.append(src)
1340
1341             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1342             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1343             width = int_or_none(medium.get('width'))
1344             height = int_or_none(medium.get('height'))
1345             proto = medium.get('proto')
1346             ext = medium.get('ext')
1347             src_ext = determine_ext(src)
1348             streamer = medium.get('streamer') or base
1349
1350             if proto == 'rtmp' or streamer.startswith('rtmp'):
1351                 rtmp_count += 1
1352                 formats.append({
1353                     'url': streamer,
1354                     'play_path': src,
1355                     'ext': 'flv',
1356                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1357                     'tbr': bitrate,
1358                     'filesize': filesize,
1359                     'width': width,
1360                     'height': height,
1361                 })
1362                 if transform_rtmp_url:
1363                     streamer, src = transform_rtmp_url(streamer, src)
1364                     formats[-1].update({
1365                         'url': streamer,
1366                         'play_path': src,
1367                     })
1368                 continue
1369
1370             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1371             src_url = src_url.strip()
1372
1373             if proto == 'm3u8' or src_ext == 'm3u8':
1374                 m3u8_formats = self._extract_m3u8_formats(
1375                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1376                 if len(m3u8_formats) == 1:
1377                     m3u8_count += 1
1378                     m3u8_formats[0].update({
1379                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1380                         'tbr': bitrate,
1381                         'width': width,
1382                         'height': height,
1383                     })
1384                 formats.extend(m3u8_formats)
1385                 continue
1386
1387             if src_ext == 'f4m':
1388                 f4m_url = src_url
1389                 if not f4m_params:
1390                     f4m_params = {
1391                         'hdcore': '3.2.0',
1392                         'plugin': 'flowplayer-3.2.0.1',
1393                     }
1394                 f4m_url += '&' if '?' in f4m_url else '?'
1395                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1396                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1397                 continue
1398
1399             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1400                 http_count += 1
1401                 formats.append({
1402                     'url': src_url,
1403                     'ext': ext or src_ext or 'flv',
1404                     'format_id': 'http-%d' % (bitrate or http_count),
1405                     'tbr': bitrate,
1406                     'filesize': filesize,
1407                     'width': width,
1408                     'height': height,
1409                 })
1410                 continue
1411
1412         return formats
1413
1414     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1415         urls = []
1416         subtitles = {}
1417         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1418             src = textstream.get('src')
1419             if not src or src in urls:
1420                 continue
1421             urls.append(src)
1422             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1423             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1424             subtitles.setdefault(lang, []).append({
1425                 'url': src,
1426                 'ext': ext,
1427             })
1428         return subtitles
1429
1430     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1431         xspf = self._download_xml(
1432             playlist_url, playlist_id, 'Downloading xpsf playlist',
1433             'Unable to download xspf manifest', fatal=fatal)
1434         if xspf is False:
1435             return []
1436         return self._parse_xspf(xspf, playlist_id)
1437
1438     def _parse_xspf(self, playlist, playlist_id):
1439         NS_MAP = {
1440             'xspf': 'http://xspf.org/ns/0/',
1441             's1': 'http://static.streamone.nl/player/ns/0',
1442         }
1443
1444         entries = []
1445         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1446             title = xpath_text(
1447                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1448             description = xpath_text(
1449                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1450             thumbnail = xpath_text(
1451                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1452             duration = float_or_none(
1453                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1454
1455             formats = [{
1456                 'url': location.text,
1457                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1458                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1459                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1460             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1461             self._sort_formats(formats)
1462
1463             entries.append({
1464                 'id': playlist_id,
1465                 'title': title,
1466                 'description': description,
1467                 'thumbnail': thumbnail,
1468                 'duration': duration,
1469                 'formats': formats,
1470             })
1471         return entries
1472
1473     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1474         res = self._download_webpage_handle(
1475             mpd_url, video_id,
1476             note=note or 'Downloading MPD manifest',
1477             errnote=errnote or 'Failed to download MPD manifest',
1478             fatal=fatal)
1479         if res is False:
1480             return []
1481         mpd, urlh = res
1482         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1483
1484         return self._parse_mpd_formats(
1485             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1486
1487     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1488         if mpd_doc.get('type') == 'dynamic':
1489             return []
1490
1491         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1492
1493         def _add_ns(path):
1494             return self._xpath_ns(path, namespace)
1495
1496         def is_drm_protected(element):
1497             return element.find(_add_ns('ContentProtection')) is not None
1498
1499         def extract_multisegment_info(element, ms_parent_info):
1500             ms_info = ms_parent_info.copy()
1501             segment_list = element.find(_add_ns('SegmentList'))
1502             if segment_list is not None:
1503                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1504                 if segment_urls_e:
1505                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1506                 initialization = segment_list.find(_add_ns('Initialization'))
1507                 if initialization is not None:
1508                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1509             else:
1510                 segment_template = element.find(_add_ns('SegmentTemplate'))
1511                 if segment_template is not None:
1512                     start_number = segment_template.get('startNumber')
1513                     if start_number:
1514                         ms_info['start_number'] = int(start_number)
1515                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1516                     if segment_timeline is not None:
1517                         s_e = segment_timeline.findall(_add_ns('S'))
1518                         if s_e:
1519                             ms_info['total_number'] = 0
1520                             for s in s_e:
1521                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1522                     else:
1523                         timescale = segment_template.get('timescale')
1524                         if timescale:
1525                             ms_info['timescale'] = int(timescale)
1526                         segment_duration = segment_template.get('duration')
1527                         if segment_duration:
1528                             ms_info['segment_duration'] = int(segment_duration)
1529                     media_template = segment_template.get('media')
1530                     if media_template:
1531                         ms_info['media_template'] = media_template
1532                     initialization = segment_template.get('initialization')
1533                     if initialization:
1534                         ms_info['initialization_url'] = initialization
1535                     else:
1536                         initialization = segment_template.find(_add_ns('Initialization'))
1537                         if initialization is not None:
1538                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1539             return ms_info
1540
1541         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1542         formats = []
1543         for period in mpd_doc.findall(_add_ns('Period')):
1544             period_duration = parse_duration(period.get('duration')) or mpd_duration
1545             period_ms_info = extract_multisegment_info(period, {
1546                 'start_number': 1,
1547                 'timescale': 1,
1548             })
1549             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1550                 if is_drm_protected(adaptation_set):
1551                     continue
1552                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1553                 for representation in adaptation_set.findall(_add_ns('Representation')):
1554                     if is_drm_protected(representation):
1555                         continue
1556                     representation_attrib = adaptation_set.attrib.copy()
1557                     representation_attrib.update(representation.attrib)
1558                     # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1559                     mime_type = representation_attrib['mimeType']
1560                     content_type = mime_type.split('/')[0]
1561                     if content_type == 'text':
1562                         # TODO implement WebVTT downloading
1563                         pass
1564                     elif content_type == 'video' or content_type == 'audio':
1565                         base_url = ''
1566                         for element in (representation, adaptation_set, period, mpd_doc):
1567                             base_url_e = element.find(_add_ns('BaseURL'))
1568                             if base_url_e is not None:
1569                                 base_url = base_url_e.text + base_url
1570                                 if re.match(r'^https?://', base_url):
1571                                     break
1572                         if mpd_base_url and not re.match(r'^https?://', base_url):
1573                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1574                                 mpd_base_url += '/'
1575                             base_url = mpd_base_url + base_url
1576                         representation_id = representation_attrib.get('id')
1577                         lang = representation_attrib.get('lang')
1578                         url_el = representation.find(_add_ns('BaseURL'))
1579                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1580                         f = {
1581                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1582                             'url': base_url,
1583                             'ext': mimetype2ext(mime_type),
1584                             'width': int_or_none(representation_attrib.get('width')),
1585                             'height': int_or_none(representation_attrib.get('height')),
1586                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1587                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1588                             'fps': int_or_none(representation_attrib.get('frameRate')),
1589                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1590                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1591                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1592                             'format_note': 'DASH %s' % content_type,
1593                             'filesize': filesize,
1594                         }
1595                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1596                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1597                             if 'total_number' not in representation_ms_info and 'segment_duration':
1598                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1599                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1600                             media_template = representation_ms_info['media_template']
1601                             media_template = media_template.replace('$RepresentationID$', representation_id)
1602                             media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
1603                             media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
1604                             media_template.replace('$$', '$')
1605                             representation_ms_info['segment_urls'] = [
1606                                 media_template % {
1607                                     'Number': segment_number,
1608                                     'Bandwidth': representation_attrib.get('bandwidth')}
1609                                 for segment_number in range(
1610                                     representation_ms_info['start_number'],
1611                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1612                         if 'segment_urls' in representation_ms_info:
1613                             f.update({
1614                                 'segment_urls': representation_ms_info['segment_urls'],
1615                                 'protocol': 'http_dash_segments',
1616                             })
1617                             if 'initialization_url' in representation_ms_info:
1618                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1619                                 f.update({
1620                                     'initialization_url': initialization_url,
1621                                 })
1622                                 if not f.get('url'):
1623                                     f['url'] = initialization_url
1624                         try:
1625                             existing_format = next(
1626                                 fo for fo in formats
1627                                 if fo['format_id'] == representation_id)
1628                         except StopIteration:
1629                             full_info = formats_dict.get(representation_id, {}).copy()
1630                             full_info.update(f)
1631                             formats.append(full_info)
1632                         else:
1633                             existing_format.update(f)
1634                     else:
1635                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1636         return formats
1637
1638     def _live_title(self, name):
1639         """ Generate the title for a live video """
1640         now = datetime.datetime.now()
1641         now_str = now.strftime('%Y-%m-%d %H:%M')
1642         return name + ' ' + now_str
1643
1644     def _int(self, v, name, fatal=False, **kwargs):
1645         res = int_or_none(v, **kwargs)
1646         if 'get_attr' in kwargs:
1647             print(getattr(v, kwargs['get_attr']))
1648         if res is None:
1649             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1650             if fatal:
1651                 raise ExtractorError(msg)
1652             else:
1653                 self._downloader.report_warning(msg)
1654         return res
1655
1656     def _float(self, v, name, fatal=False, **kwargs):
1657         res = float_or_none(v, **kwargs)
1658         if res is None:
1659             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1660             if fatal:
1661                 raise ExtractorError(msg)
1662             else:
1663                 self._downloader.report_warning(msg)
1664         return res
1665
1666     def _set_cookie(self, domain, name, value, expire_time=None):
1667         cookie = compat_cookiejar.Cookie(
1668             0, name, value, None, None, domain, None,
1669             None, '/', True, False, expire_time, '', None, None, None)
1670         self._downloader.cookiejar.set_cookie(cookie)
1671
1672     def _get_cookies(self, url):
1673         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1674         req = sanitized_Request(url)
1675         self._downloader.cookiejar.add_cookie_header(req)
1676         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1677
1678     def get_testcases(self, include_onlymatching=False):
1679         t = getattr(self, '_TEST', None)
1680         if t:
1681             assert not hasattr(self, '_TESTS'), \
1682                 '%s has _TEST and _TESTS' % type(self).__name__
1683             tests = [t]
1684         else:
1685             tests = getattr(self, '_TESTS', [])
1686         for t in tests:
1687             if not include_onlymatching and t.get('only_matching', False):
1688                 continue
1689             t['name'] = type(self).__name__[:-len('IE')]
1690             yield t
1691
1692     def is_suitable(self, age_limit):
1693         """ Test whether the extractor is generally suitable for the given
1694         age limit (i.e. pornographic sites are not, all others usually are) """
1695
1696         any_restricted = False
1697         for tc in self.get_testcases(include_onlymatching=False):
1698             if 'playlist' in tc:
1699                 tc = tc['playlist'][0]
1700             is_restricted = age_restricted(
1701                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1702             if not is_restricted:
1703                 return True
1704             any_restricted = any_restricted or is_restricted
1705         return not any_restricted
1706
1707     def extract_subtitles(self, *args, **kwargs):
1708         if (self._downloader.params.get('writesubtitles', False) or
1709                 self._downloader.params.get('listsubtitles')):
1710             return self._get_subtitles(*args, **kwargs)
1711         return {}
1712
1713     def _get_subtitles(self, *args, **kwargs):
1714         raise NotImplementedError('This method must be implemented by subclasses')
1715
1716     @staticmethod
1717     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1718         """ Merge subtitle items for one language. Items with duplicated URLs
1719         will be dropped. """
1720         list1_urls = set([item['url'] for item in subtitle_list1])
1721         ret = list(subtitle_list1)
1722         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1723         return ret
1724
1725     @classmethod
1726     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1727         """ Merge two subtitle dictionaries, language by language. """
1728         ret = dict(subtitle_dict1)
1729         for lang in subtitle_dict2:
1730             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1731         return ret
1732
1733     def extract_automatic_captions(self, *args, **kwargs):
1734         if (self._downloader.params.get('writeautomaticsub', False) or
1735                 self._downloader.params.get('listsubtitles')):
1736             return self._get_automatic_captions(*args, **kwargs)
1737         return {}
1738
1739     def _get_automatic_captions(self, *args, **kwargs):
1740         raise NotImplementedError('This method must be implemented by subclasses')
1741
1742     def mark_watched(self, *args, **kwargs):
1743         if (self._downloader.params.get('mark_watched', False) and
1744                 (self._get_login_info()[0] is not None or
1745                     self._downloader.params.get('cookiefile') is not None)):
1746             self._mark_watched(*args, **kwargs)
1747
1748     def _mark_watched(self, *args, **kwargs):
1749         raise NotImplementedError('This method must be implemented by subclasses')
1750
1751     def geo_verification_headers(self):
1752         headers = {}
1753         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
1754         if geo_verification_proxy:
1755             headers['Ytdl-request-proxy'] = geo_verification_proxy
1756         return headers
1757
1758
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors of paged search queries.
    The URLs they accept have the form _SEARCH_KEY(|all|[0-9]):{query}
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results"""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')

        # No prefix asks for a single result, 'all' for as many as possible
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        count = int(prefix)
        if count <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (count, query))
        if count > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, count))
            count = self._MAX_RESULTS
        return self._get_n_results(query, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError(
            'This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY of this extractor"""
        return self._SEARCH_KEY