_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import ssl
  14 import sys
  15 import time
  16 import math
  17
  18 from ..compat import (
  19     compat_cookiejar_Cookie,
  20     compat_cookies,
  21     compat_etree_Element,
  22     compat_etree_fromstring,
  23     compat_getpass,
  24     compat_integer_types,
  25     compat_http_client,
  26     compat_os_name,
  27     compat_str,
  28     compat_urllib_error,
  29     compat_urllib_parse_unquote,
  30     compat_urllib_parse_urlencode,
  31     compat_urllib_request,
  32     compat_urlparse,
  33     compat_xml_parse_error,
  34 )
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     NO_DEFAULT,
  41     age_restricted,
  42     base_url,
  43     bug_reports_message,
  44     clean_html,
  45     compiled_regex_type,
  46     determine_ext,
  47     determine_protocol,
  48     dict_get,
  49     error_to_compat_str,
  50     ExtractorError,
  51     extract_attributes,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     GeoRestrictedError,
  55     GeoUtils,
  56     int_or_none,
  57     js_to_json,
  58     JSON_LD_RE,
  59     mimetype2ext,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     RegexNotFoundError,
  68     sanitized_Request,
  69     sanitize_filename,
  70     str_or_none,
  71     strip_or_none,
  72     unescapeHTML,
  73     unified_strdate,
  74     unified_timestamp,
  75     update_Request,
  76     update_url_query,
  77     urljoin,
  78     url_basename,
  79     url_or_none,
  80     xpath_element,
  81     xpath_text,
  82     xpath_with_ns,
  83 )
  84
  85
  86 class InfoExtractor(object):
  87     """Information Extractor class.
  88
  89     Information extractors are the classes that, given a URL, extract
  90     information about the video (or videos) the URL refers to. This
  91     information includes the real video URL, the video title, author and
  92     others. The information is stored in a dictionary which is then
  93     passed to the YoutubeDL. The YoutubeDL processes this
  94     information possibly downloading the video to the file system, among
  95     other possible outcomes.
  96
  97     The type field determines the type of the result.
  98     By far the most common value (and the default if _type is missing) is
  99     "video", which indicates a single video.
 100
 101     For a video, the dictionaries must include the following fields:
 102
 103     id:             Video identifier.
 104     title:          Video title, unescaped.
 105
 106     Additionally, it must contain either a formats entry or a url one:
 107
 108     formats:        A list of dictionaries for each format available, ordered
 109                     from worst to best quality.
 110
 111                     Potential fields:
 112                     * url        The mandatory URL representing the media:
 113                                    for plain file media - HTTP URL of this file,
 114                                    for RTMP - RTMP URL,
 115                                    for HLS - URL of the M3U8 media playlist,
 116                                    for HDS - URL of the F4M manifest,
 117                                    for DASH
 118                                      - HTTP URL to plain file media (in case of
 119                                        unfragmented media)
 120                                      - URL of the MPD manifest or base URL
 121                                        representing the media if MPD manifest
 122                                        is parsed from a string (in case of
 123                                        fragmented media)
 124                                    for MSS - URL of the ISM manifest.
 125                     * manifest_url
 126                                  The URL of the manifest file in case of
 127                                  fragmented media:
 128                                    for HLS - URL of the M3U8 master playlist,
 129                                    for HDS - URL of the F4M manifest,
 130                                    for DASH - URL of the MPD manifest,
 131                                    for MSS - URL of the ISM manifest.
 132                     * ext        Will be calculated from URL if missing
 133                     * format     A human-readable description of the format
 134                                  ("mp4 container with h264/opus").
  135                                  Calculated from the format_id, width, height
 136                                  and format_note fields if missing.
 137                     * format_id  A short description of the format
 138                                  ("mp4_h264_opus" or "19").
 139                                 Technically optional, but strongly recommended.
 140                     * format_note Additional info about the format
 141                                  ("3D" or "DASH video")
 142                     * width      Width of the video, if known
 143                     * height     Height of the video, if known
 144                     * resolution Textual description of width and height
 145                     * tbr        Average bitrate of audio and video in KBit/s
 146                     * abr        Average audio bitrate in KBit/s
 147                     * acodec     Name of the audio codec in use
 148                     * asr        Audio sampling rate in Hertz
 149                     * vbr        Average video bitrate in KBit/s
 150                     * fps        Frame rate
 151                     * vcodec     Name of the video codec in use
 152                     * container  Name of the container format
 153                     * filesize   The number of bytes, if known in advance
 154                     * filesize_approx  An estimate for the number of bytes
 155                     * player_url SWF Player URL (used for rtmpdump).
 156                     * protocol   The protocol that will be used for the actual
 157                                  download, lower-case.
 158                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 159                                  "m3u8", "m3u8_native" or "http_dash_segments".
 160                     * fragment_base_url
 161                                  Base URL for fragments. Each fragment's path
 162                                  value (if present) will be relative to
 163                                  this URL.
 164                     * fragments  A list of fragments of a fragmented media.
 165                                  Each fragment entry must contain either an url
 166                                  or a path. If an url is present it should be
 167                                  considered by a client. Otherwise both path and
 168                                  fragment_base_url must be present. Here is
 169                                  the list of all potential fields:
 170                                  * "url" - fragment's URL
 171                                  * "path" - fragment's path relative to
 172                                             fragment_base_url
 173                                  * "duration" (optional, int or float)
 174                                  * "filesize" (optional, int)
 175                     * preference Order number of this format. If this field is
 176                                  present and not None, the formats get sorted
 177                                  by this field, regardless of all other values.
 178                                  -1 for default (order by other properties),
 179                                  -2 or smaller for less than default.
 180                                  < -1000 to hide the format (if there is
 181                                     another one which is strictly better)
 182                     * language   Language code, e.g. "de" or "en-US".
 183                     * language_preference  Is this in the language mentioned in
 184                                  the URL?
 185                                  10 if it's what the URL is about,
 186                                  -1 for default (don't know),
 187                                  -10 otherwise, other values reserved for now.
 188                     * quality    Order number of the video quality of this
 189                                  format, irrespective of the file format.
 190                                  -1 for default (order by other properties),
 191                                  -2 or smaller for less than default.
 192                     * source_preference  Order number for this video source
 193                                   (quality takes higher priority)
 194                                  -1 for default (order by other properties),
 195                                  -2 or smaller for less than default.
 196                     * http_headers  A dictionary of additional HTTP headers
 197                                  to add to the request.
 198                     * stretched_ratio  If given and not 1, indicates that the
 199                                  video's pixels are not square.
 200                                  width : height ratio as float.
 201                     * no_resume  The server does not support resuming the
 202                                  (HTTP or RTMP) download. Boolean.
 203                     * downloader_options  A dictionary of downloader options as
 204                                  described in FileDownloader
 205
 206     url:            Final video URL.
 207     ext:            Video filename extension.
 208     format:         The video format, defaults to ext (used for --get-format)
 209     player_url:     SWF Player URL (used for rtmpdump).
 210
 211     The following fields are optional:
 212
 213     alt_title:      A secondary title of the video.
 214     display_id      An alternative identifier for the video, not necessarily
 215                     unique, but available before title. Typically, id is
 216                     something like "4234987", title "Dancing naked mole rats",
 217                     and display_id "dancing-naked-mole-rats"
 218     thumbnails:     A list of dictionaries, with the following entries:
 219                         * "id" (optional, string) - Thumbnail format ID
 220                         * "url"
 221                         * "preference" (optional, int) - quality of the image
 222                         * "width" (optional, int)
 223                         * "height" (optional, int)
 224                         * "resolution" (optional, string "{width}x{height}",
 225                                         deprecated)
 226                         * "filesize" (optional, int)
 227     thumbnail:      Full URL to a video thumbnail image.
 228     description:    Full video description.
 229     uploader:       Full name of the video uploader.
 230     license:        License name the video is licensed under.
 231     creator:        The creator of the video.
 232     release_date:   The date (YYYYMMDD) when the video was released.
 233     timestamp:      UNIX timestamp of the moment the video became available.
 234     upload_date:    Video upload date (YYYYMMDD).
 235                     If not explicitly set, calculated from timestamp.
 236     uploader_id:    Nickname or id of the video uploader.
 237     uploader_url:   Full URL to a personal webpage of the video uploader.
 238     channel:        Full name of the channel the video is uploaded on.
 239                     Note that channel fields may or may not repeat uploader
 240                     fields. This depends on a particular extractor.
 241     channel_id:     Id of the channel.
 242     channel_url:    Full URL to a channel webpage.
 243     location:       Physical location where the video was filmed.
 244     subtitles:      The available subtitles as a dictionary in the format
 245                     {tag: subformats}. "tag" is usually a language code, and
 246                     "subformats" is a list sorted from lower to higher
 247                     preference, each element is a dictionary with the "ext"
 248                     entry and one of:
 249                         * "data": The subtitles file contents
 250                         * "url": A URL pointing to the subtitles file
 251                     "ext" will be calculated from URL if missing
 252     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 253                     automatically generated captions
 254     duration:       Length of the video in seconds, as an integer or float.
 255     view_count:     How many users have watched the video on the platform.
 256     like_count:     Number of positive ratings of the video
 257     dislike_count:  Number of negative ratings of the video
 258     repost_count:   Number of reposts of the video
  259     average_rating: Average rating given by users, the scale used depends on the webpage
 260     comment_count:  Number of comments on the video
 261     comments:       A list of comments, each with one or more of the following
 262                     properties (all but one of text or html optional):
 263                         * "author" - human-readable name of the comment author
 264                         * "author_id" - user ID of the comment author
 265                         * "id" - Comment ID
 266                         * "html" - Comment as HTML
 267                         * "text" - Plain text of the comment
 268                         * "timestamp" - UNIX timestamp of comment
 269                         * "parent" - ID of the comment this one is replying to.
 270                                      Set to "root" to indicate that this is a
 271                                      comment to the original video.
 272     age_limit:      Age restriction for the video, as an integer (years)
 273     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 274                     should allow to get the same result again. (It will be set
 275                     by YoutubeDL if it's missing)
 276     categories:     A list of categories that the video falls in, for example
 277                     ["Sports", "Berlin"]
 278     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 279     is_live:        True, False, or None (=unknown). Whether this video is a
 280                     live stream that goes on instead of a fixed-length video.
 281     start_time:     Time in seconds where the reproduction should start, as
 282                     specified in the URL.
 283     end_time:       Time in seconds where the reproduction should end, as
 284                     specified in the URL.
 285     chapters:       A list of dictionaries, with the following entries:
 286                         * "start_time" - The start time of the chapter in seconds
 287                         * "end_time" - The end time of the chapter in seconds
 288                         * "title" (optional, string)
 289
 290     The following fields should only be used when the video belongs to some logical
 291     chapter or section:
 292
 293     chapter:        Name or title of the chapter the video belongs to.
 294     chapter_number: Number of the chapter the video belongs to, as an integer.
 295     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 296
 297     The following fields should only be used when the video is an episode of some
 298     series, programme or podcast:
 299
 300     series:         Title of the series or programme the video episode belongs to.
 301     season:         Title of the season the video episode belongs to.
 302     season_number:  Number of the season the video episode belongs to, as an integer.
 303     season_id:      Id of the season the video episode belongs to, as a unicode string.
 304     episode:        Title of the video episode. Unlike mandatory video title field,
 305                     this field should denote the exact title of the video episode
 306                     without any kind of decoration.
 307     episode_number: Number of the video episode within a season, as an integer.
 308     episode_id:     Id of the video episode, as a unicode string.
 309
 310     The following fields should only be used when the media is a track or a part of
 311     a music album:
 312
 313     track:          Title of the track.
 314     track_number:   Number of the track within an album or a disc, as an integer.
 315     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 316                     as a unicode string.
 317     artist:         Artist(s) of the track.
 318     genre:          Genre(s) of the track.
 319     album:          Title of the album the track belongs to.
 320     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 321     album_artist:   List of all artists appeared on the album (e.g.
 322                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 323                     and compilations).
 324     disc_number:    Number of the disc or other physical medium the track belongs to,
 325                     as an integer.
 326     release_year:   Year (YYYY) when the album was released.
 327
 328     Unless mentioned otherwise, the fields should be Unicode strings.
 329
 330     Unless mentioned otherwise, None is equivalent to absence of information.
 331
 332
 333     _type "playlist" indicates multiple videos.
 334     There must be a key "entries", which is a list, an iterable, or a PagedList
 335     object, each element of which is a valid dictionary by this specification.
 336
 337     Additionally, playlists can have "id", "title", "description", "uploader",
 338     "uploader_id", "uploader_url" attributes with the same semantics as videos
 339     (see above).
 340
 341
 342     _type "multi_video" indicates that there are multiple videos that
  343     form a single show, for example multiple acts of an opera or TV episode.
 344     It must have an entries key like a playlist and contain all the keys
 345     required for a video at the same time.
 346
 347
 348     _type "url" indicates that the video must be extracted from another
 349     location, possibly by a different extractor. Its only required key is:
 350     "url" - the next URL to extract.
 351     The key "ie_key" can be set to the class name (minus the trailing "IE",
 352     e.g. "Youtube") if the extractor class is known in advance.
 353     Additionally, the dictionary may have any properties of the resolved entity
 354     known in advance, for example "title" if the title of the referred video is
 355     known ahead of time.
 356
 357
 358     _type "url_transparent" entities have the same specification as "url", but
 359     indicate that the given additional information is more precise than the one
 360     associated with the resolved URL.
 361     This is useful when a site employs a video service that hosts the video and
 362     its technical metadata, but that video service does not embed a useful
 363     title, description etc.
 364
 365
 366     Subclasses of this one should re-define the _real_initialize() and
 367     _real_extract() methods and define a _VALID_URL regexp.
 368     Probably, they should also be added to the list of extractors.
 369
 370     _GEO_BYPASS attribute may be set to False in order to disable
 371     geo restriction bypass mechanisms for a particular extractor.
 372     Though it won't disable explicit geo restriction bypass based on
 373     country code provided with geo_bypass_country.
 374
 375     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 376     countries for this extractor. One of these countries will be used by
 377     geo restriction bypass mechanism right away in order to bypass
 378     geo restriction, of course, if the mechanism is not disabled.
 379
 380     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 381     IP blocks in CIDR notation for this extractor. One of these IP blocks
 382     will be used by geo restriction bypass mechanism similarly
 383     to _GEO_COUNTRIES.
 384
 385     Finally, the _WORKING attribute should be set to False for broken IEs
 386     in order to warn the users and skip the tests.
 387     """
 388
 389     _ready = False
 390     _downloader = None
 391     _x_forwarded_for_ip = None
 392     _GEO_BYPASS = True
 393     _GEO_COUNTRIES = None
 394     _GEO_IP_BLOCKS = None
 395     _WORKING = True
 396
 397     def __init__(self, downloader=None):
 398         """Constructor. Receives an optional downloader."""
 399         self._ready = False
 400         self._x_forwarded_for_ip = None
 401         self.set_downloader(downloader)
 402
 403     @classmethod
 404     def suitable(cls, url):
 405         """Receives a URL and returns True if suitable for this IE."""
 406
 407         # This does not use has/getattr intentionally - we want to know whether
 408         # we have cached the regexp for *this* class, whereas getattr would also
 409         # match the superclass
 410         if '_VALID_URL_RE' not in cls.__dict__:
 411             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 412         return cls._VALID_URL_RE.match(url) is not None
 413
 414     @classmethod
 415     def _match_id(cls, url):
 416         if '_VALID_URL_RE' not in cls.__dict__:
 417             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 418         m = cls._VALID_URL_RE.match(url)
 419         assert m
 420         return compat_str(m.group('id'))
 421
 422     @classmethod
 423     def working(cls):
 424         """Getter method for _WORKING."""
 425         return cls._WORKING
 426
 427     def initialize(self):
 428         """Initializes an instance (authentication, etc)."""
 429         self._initialize_geo_bypass({
 430             'countries': self._GEO_COUNTRIES,
 431             'ip_blocks': self._GEO_IP_BLOCKS,
 432         })
 433         if not self._ready:
 434             self._real_initialize()
 435             self._ready = True
 436
 437     def _initialize_geo_bypass(self, geo_bypass_context):
 438         """
 439         Initialize geo restriction bypass mechanism.
 440
 441         This method is used to initialize geo bypass mechanism based on faking
 442         X-Forwarded-For HTTP header. A random country from provided country list
 443         is selected and a random IP belonging to this country is generated. This
 444         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 445         HTTP requests.
 446
 447         This method will be used for initial geo bypass mechanism initialization
 448         during the instance initialization with _GEO_COUNTRIES and
 449         _GEO_IP_BLOCKS.
 450
 451         You may also manually call it from extractor's code if geo bypass
 452         information is not available beforehand (e.g. obtained during
 453         extraction) or due to some other reason. In this case you should pass
 454         this information in geo bypass context passed as first argument. It may
 455         contain following fields:
 456
 457         countries:  List of geo unrestricted countries (similar
 458                     to _GEO_COUNTRIES)
 459         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 460                     (similar to _GEO_IP_BLOCKS)
 461
 462         """
 463         if not self._x_forwarded_for_ip:
 464
 465             # Geo bypass mechanism is explicitly disabled by user
 466             if not self._downloader.params.get('geo_bypass', True):
 467                 return
 468
 469             if not geo_bypass_context:
 470                 geo_bypass_context = {}
 471
 472             # Backward compatibility: previously _initialize_geo_bypass
 473             # expected a list of countries, some 3rd party code may still use
 474             # it this way
 475             if isinstance(geo_bypass_context, (list, tuple)):
 476                 geo_bypass_context = {
 477                     'countries': geo_bypass_context,
 478                 }
 479
 480             # The whole point of geo bypass mechanism is to fake IP
 481             # as X-Forwarded-For HTTP header based on some IP block or
 482             # country code.
 483
 484             # Path 1: bypassing based on IP block in CIDR notation
 485
 486             # Explicit IP block specified by user, use it right away
 487             # regardless of whether extractor is geo bypassable or not
 488             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 489
 490             # Otherwise use random IP block from geo bypass context but only
 491             # if extractor is known as geo bypassable
 492             if not ip_block:
 493                 ip_blocks = geo_bypass_context.get('ip_blocks')
 494                 if self._GEO_BYPASS and ip_blocks:
 495                     ip_block = random.choice(ip_blocks)
 496
 497             if ip_block:
 498                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 499                 if self._downloader.params.get('verbose', False):
 500                     self._downloader.to_screen(
 501                         '[debug] Using fake IP %s as X-Forwarded-For.'
 502                         % self._x_forwarded_for_ip)
 503                 return
 504
 505             # Path 2: bypassing based on country code
 506
 507             # Explicit country code specified by user, use it right away
 508             # regardless of whether extractor is geo bypassable or not
 509             country = self._downloader.params.get('geo_bypass_country', None)
 510
 511             # Otherwise use random country code from geo bypass context but
 512             # only if extractor is known as geo bypassable
 513             if not country:
 514                 countries = geo_bypass_context.get('countries')
 515                 if self._GEO_BYPASS and countries:
 516                     country = random.choice(countries)
 517
 518             if country:
 519                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 520                 if self._downloader.params.get('verbose', False):
 521                     self._downloader.to_screen(
 522                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 523                         % (self._x_forwarded_for_ip, country.upper()))
 524
 525     def extract(self, url):
 526         """Extracts URL information and returns it in list of dicts."""
 527         try:
 528             for _ in range(2):
 529                 try:
 530                     self.initialize()
 531                     ie_result = self._real_extract(url)
 532                     if self._x_forwarded_for_ip:
 533                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 534                     return ie_result
 535                 except GeoRestrictedError as e:
 536                     if self.__maybe_fake_ip_and_retry(e.countries):
 537                         continue
 538                     raise
 539         except ExtractorError:
 540             raise
 541         except compat_http_client.IncompleteRead as e:
 542             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 543         except (KeyError, StopIteration) as e:
 544             raise ExtractorError('An extractor error has occurred.', cause=e)
 545
 546     def __maybe_fake_ip_and_retry(self, countries):
 547         if (not self._downloader.params.get('geo_bypass_country', None)
 548                 and self._GEO_BYPASS
 549                 and self._downloader.params.get('geo_bypass', True)
 550                 and not self._x_forwarded_for_ip
 551                 and countries):
 552             country_code = random.choice(countries)
 553             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 554             if self._x_forwarded_for_ip:
 555                 self.report_warning(
 556                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 557                     % (self._x_forwarded_for_ip, country_code.upper()))
 558                 return True
 559         return False
 560
 561     def set_downloader(self, downloader):
 562         """Sets the downloader for this IE."""
 563         self._downloader = downloader
 564
 565     def _real_initialize(self):
 566         """Real initialization process. Redefine in subclasses."""
 567         pass
 568
 569     def _real_extract(self, url):
 570         """Real extraction process. Redefine in subclasses."""
 571         pass
 572
 573     @classmethod
 574     def ie_key(cls):
 575         """A string for getting the InfoExtractor with get_info_extractor"""
 576         return compat_str(cls.__name__[:-2])
 577
 578     @property
 579     def IE_NAME(self):
 580         return compat_str(type(self).__name__[:-2])
 581
 582     @staticmethod
 583     def __can_accept_status_code(err, expected_status):
 584         assert isinstance(err, compat_urllib_error.HTTPError)
 585         if expected_status is None:
 586             return False
 587         if isinstance(expected_status, compat_integer_types):
 588             return err.code == expected_status
 589         elif isinstance(expected_status, (list, tuple)):
 590             return err.code in expected_status
 591         elif callable(expected_status):
 592             return expected_status(err.code) is True
 593         else:
 594             assert False
 595
 596     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 597         """
 598         Return the response handle.
 599
 600         See _download_webpage docstring for arguments specification.
 601         """
 602         if note is None:
 603             self.report_download_webpage(video_id)
 604         elif note is not False:
 605             if video_id is None:
 606                 self.to_screen('%s' % (note,))
 607             else:
 608                 self.to_screen('%s: %s' % (video_id, note))
 609
 610         # Some sites check X-Forwarded-For HTTP header in order to figure out
 611         # the origin of the client behind proxy. This allows bypassing geo
 612         # restriction by faking this header's value to IP that belongs to some
 613         # geo unrestricted country. We will do so once we encounter any
 614         # geo restriction error.
 615         if self._x_forwarded_for_ip:
 616             if 'X-Forwarded-For' not in headers:
 617                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 618
 619         if isinstance(url_or_request, compat_urllib_request.Request):
 620             url_or_request = update_Request(
 621                 url_or_request, data=data, headers=headers, query=query)
 622         else:
 623             if query:
 624                 url_or_request = update_url_query(url_or_request, query)
 625             if data is not None or headers:
 626                 url_or_request = sanitized_Request(url_or_request, data, headers)
 627         exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 628         if hasattr(ssl, 'CertificateError'):
 629             exceptions.append(ssl.CertificateError)
 630         try:
 631             return self._downloader.urlopen(url_or_request)
 632         except tuple(exceptions) as err:
 633             if isinstance(err, compat_urllib_error.HTTPError):
 634                 if self.__can_accept_status_code(err, expected_status):
 635                     # Retain reference to error to prevent file object from
 636                     # being closed before it can be read. Works around the
 637                     # effects of <https://bugs.python.org/issue15002>
 638                     # introduced in Python 3.4.1.
 639                     err.fp._error = err
 640                     return err.fp
 641
 642             if errnote is False:
 643                 return False
 644             if errnote is None:
 645                 errnote = 'Unable to download webpage'
 646
 647             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 648             if fatal:
 649                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 650             else:
 651                 self._downloader.report_warning(errmsg)
 652                 return False
 653
 654     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 655         """
 656         Return a tuple (page content as string, URL handle).
 657
 658         See _download_webpage docstring for arguments specification.
 659         """
 660         # Strip hashes from the URL (#1038)
 661         if isinstance(url_or_request, (compat_str, str)):
 662             url_or_request = url_or_request.partition('#')[0]
 663
 664         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 665         if urlh is False:
 666             assert not fatal
 667             return False
 668         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 669         return (content, urlh)
 670
 671     @staticmethod
 672     def _guess_encoding_from_content(content_type, webpage_bytes):
 673         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 674         if m:
 675             encoding = m.group(1)
 676         else:
 677             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 678                           webpage_bytes[:1024])
 679             if m:
 680                 encoding = m.group(1).decode('ascii')
 681             elif webpage_bytes.startswith(b'\xff\xfe'):
 682                 encoding = 'utf-16'
 683             else:
 684                 encoding = 'utf-8'
 685
 686         return encoding
 687
 688     def __check_blocked(self, content):
 689         first_block = content[:512]
 690         if ('<title>Access to this site is blocked</title>' in content
 691                 and 'Websense' in first_block):
 692             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 693             blocked_iframe = self._html_search_regex(
 694                 r'<iframe src="([^"]+)"', content,
 695                 'Websense information URL', default=None)
 696             if blocked_iframe:
 697                 msg += ' Visit %s for more details' % blocked_iframe
 698             raise ExtractorError(msg, expected=True)
 699         if '<title>The URL you requested has been blocked</title>' in first_block:
 700             msg = (
 701                 'Access to this webpage has been blocked by Indian censorship. '
 702                 'Use a VPN or proxy server (with --proxy) to route around it.')
 703             block_msg = self._html_search_regex(
 704                 r'</h1><p>(.*?)</p>',
 705                 content, 'block message', default=None)
 706             if block_msg:
 707                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 708             raise ExtractorError(msg, expected=True)
 709         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 710                 and 'blocklist.rkn.gov.ru' in content):
 711             raise ExtractorError(
 712                 'Access to this webpage has been blocked by decision of the Russian government. '
 713                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 714                 expected=True)
 715
 716     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 717         content_type = urlh.headers.get('Content-Type', '')
 718         webpage_bytes = urlh.read()
 719         if prefix is not None:
 720             webpage_bytes = prefix + webpage_bytes
 721         if not encoding:
 722             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 723         if self._downloader.params.get('dump_intermediate_pages', False):
 724             self.to_screen('Dumping request to ' + urlh.geturl())
 725             dump = base64.b64encode(webpage_bytes).decode('ascii')
 726             self._downloader.to_screen(dump)
 727         if self._downloader.params.get('write_pages', False):
 728             basen = '%s_%s' % (video_id, urlh.geturl())
 729             if len(basen) > 240:
 730                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 731                 basen = basen[:240 - len(h)] + h
 732             raw_filename = basen + '.dump'
 733             filename = sanitize_filename(raw_filename, restricted=True)
 734             self.to_screen('Saving request to ' + filename)
 735             # Working around MAX_PATH limitation on Windows (see
 736             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 737             if compat_os_name == 'nt':
 738                 absfilepath = os.path.abspath(filename)
 739                 if len(absfilepath) > 259:
 740                     filename = '\\\\?\\' + absfilepath
 741             with open(filename, 'wb') as outf:
 742                 outf.write(webpage_bytes)
 743
 744         try:
 745             content = webpage_bytes.decode(encoding, 'replace')
 746         except LookupError:
 747             content = webpage_bytes.decode('utf-8', 'replace')
 748
 749         self.__check_blocked(content)
 750
 751         return content
 752
 753     def _download_webpage(
 754             self, url_or_request, video_id, note=None, errnote=None,
 755             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 756             headers={}, query={}, expected_status=None):
 757         """
 758         Return the data of the page as a string.
 759
 760         Arguments:
 761         url_or_request -- plain text URL as a string or
 762             a compat_urllib_request.Requestobject
 763         video_id -- Video/playlist/item identifier (string)
 764
 765         Keyword arguments:
 766         note -- note printed before downloading (string)
 767         errnote -- note printed in case of an error (string)
 768         fatal -- flag denoting whether error should be considered fatal,
 769             i.e. whether it should cause ExtractionError to be raised,
 770             otherwise a warning will be reported and extraction continued
 771         tries -- number of tries
 772         timeout -- sleep interval between tries
 773         encoding -- encoding for a page content decoding, guessed automatically
 774             when not explicitly specified
 775         data -- POST data (bytes)
 776         headers -- HTTP headers (dict)
 777         query -- URL query (dict)
 778         expected_status -- allows to accept failed HTTP requests (non 2xx
 779             status code) by explicitly specifying a set of accepted status
 780             codes. Can be any of the following entities:
 781                 - an integer type specifying an exact failed status code to
 782                   accept
 783                 - a list or a tuple of integer types specifying a list of
 784                   failed status codes to accept
 785                 - a callable accepting an actual failed status code and
 786                   returning True if it should be accepted
 787             Note that this argument does not affect success status codes (2xx)
 788             which are always accepted.
 789         """
 790
 791         success = False
 792         try_count = 0
 793         while success is False:
 794             try:
 795                 res = self._download_webpage_handle(
 796                     url_or_request, video_id, note, errnote, fatal,
 797                     encoding=encoding, data=data, headers=headers, query=query,
 798                     expected_status=expected_status)
 799                 success = True
 800             except compat_http_client.IncompleteRead as e:
 801                 try_count += 1
 802                 if try_count >= tries:
 803                     raise e
 804                 self._sleep(timeout, video_id)
 805         if res is False:
 806             return res
 807         else:
 808             content, _ = res
 809             return content
 810
 811     def _download_xml_handle(
 812             self, url_or_request, video_id, note='Downloading XML',
 813             errnote='Unable to download XML', transform_source=None,
 814             fatal=True, encoding=None, data=None, headers={}, query={},
 815             expected_status=None):
 816         """
 817         Return a tuple (xml as an compat_etree_Element, URL handle).
 818
 819         See _download_webpage docstring for arguments specification.
 820         """
 821         res = self._download_webpage_handle(
 822             url_or_request, video_id, note, errnote, fatal=fatal,
 823             encoding=encoding, data=data, headers=headers, query=query,
 824             expected_status=expected_status)
 825         if res is False:
 826             return res
 827         xml_string, urlh = res
 828         return self._parse_xml(
 829             xml_string, video_id, transform_source=transform_source,
 830             fatal=fatal), urlh
 831
 832     def _download_xml(
 833             self, url_or_request, video_id,
 834             note='Downloading XML', errnote='Unable to download XML',
 835             transform_source=None, fatal=True, encoding=None,
 836             data=None, headers={}, query={}, expected_status=None):
 837         """
 838         Return the xml as an compat_etree_Element.
 839
 840         See _download_webpage docstring for arguments specification.
 841         """
 842         res = self._download_xml_handle(
 843             url_or_request, video_id, note=note, errnote=errnote,
 844             transform_source=transform_source, fatal=fatal, encoding=encoding,
 845             data=data, headers=headers, query=query,
 846             expected_status=expected_status)
 847         return res if res is False else res[0]
 848
 849     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 850         if transform_source:
 851             xml_string = transform_source(xml_string)
 852         try:
 853             return compat_etree_fromstring(xml_string.encode('utf-8'))
 854         except compat_xml_parse_error as ve:
 855             errmsg = '%s: Failed to parse XML ' % video_id
 856             if fatal:
 857                 raise ExtractorError(errmsg, cause=ve)
 858             else:
 859                 self.report_warning(errmsg + str(ve))
 860
 861     def _download_json_handle(
 862             self, url_or_request, video_id, note='Downloading JSON metadata',
 863             errnote='Unable to download JSON metadata', transform_source=None,
 864             fatal=True, encoding=None, data=None, headers={}, query={},
 865             expected_status=None):
 866         """
 867         Return a tuple (JSON object, URL handle).
 868
 869         See _download_webpage docstring for arguments specification.
 870         """
 871         res = self._download_webpage_handle(
 872             url_or_request, video_id, note, errnote, fatal=fatal,
 873             encoding=encoding, data=data, headers=headers, query=query,
 874             expected_status=expected_status)
 875         if res is False:
 876             return res
 877         json_string, urlh = res
 878         return self._parse_json(
 879             json_string, video_id, transform_source=transform_source,
 880             fatal=fatal), urlh
 881
 882     def _download_json(
 883             self, url_or_request, video_id, note='Downloading JSON metadata',
 884             errnote='Unable to download JSON metadata', transform_source=None,
 885             fatal=True, encoding=None, data=None, headers={}, query={},
 886             expected_status=None):
 887         """
 888         Return the JSON object as a dict.
 889
 890         See _download_webpage docstring for arguments specification.
 891         """
 892         res = self._download_json_handle(
 893             url_or_request, video_id, note=note, errnote=errnote,
 894             transform_source=transform_source, fatal=fatal, encoding=encoding,
 895             data=data, headers=headers, query=query,
 896             expected_status=expected_status)
 897         return res if res is False else res[0]
 898
 899     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 900         if transform_source:
 901             json_string = transform_source(json_string)
 902         try:
 903             return json.loads(json_string)
 904         except ValueError as ve:
 905             errmsg = '%s: Failed to parse JSON ' % video_id
 906             if fatal:
 907                 raise ExtractorError(errmsg, cause=ve)
 908             else:
 909                 self.report_warning(errmsg + str(ve))
 910
 911     def report_warning(self, msg, video_id=None):
 912         idstr = '' if video_id is None else '%s: ' % video_id
 913         self._downloader.report_warning(
 914             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 915
 916     def to_screen(self, msg):
 917         """Print msg to screen, prefixing it with '[ie_name]'"""
 918         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 919
 920     def report_extraction(self, id_or_name):
 921         """Report information extraction."""
 922         self.to_screen('%s: Extracting information' % id_or_name)
 923
 924     def report_download_webpage(self, video_id):
 925         """Report webpage download."""
 926         self.to_screen('%s: Downloading webpage' % video_id)
 927
 928     def report_age_confirmation(self):
 929         """Report attempt to confirm age."""
 930         self.to_screen('Confirming age')
 931
 932     def report_login(self):
 933         """Report attempt to log in."""
 934         self.to_screen('Logging in')
 935
 936     @staticmethod
 937     def raise_login_required(msg='This video is only available for registered users'):
 938         raise ExtractorError(
 939             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 940             expected=True)
 941
 942     @staticmethod
 943     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 944         raise GeoRestrictedError(msg, countries=countries)
 945
 946     # Methods for following #608
 947     @staticmethod
 948     def url_result(url, ie=None, video_id=None, video_title=None):
 949         """Returns a URL that points to a page that should be processed"""
 950         # TODO: ie should be the class used for getting the info
 951         video_info = {'_type': 'url',
 952                       'url': url,
 953                       'ie_key': ie}
 954         if video_id is not None:
 955             video_info['id'] = video_id
 956         if video_title is not None:
 957             video_info['title'] = video_title
 958         return video_info
 959
 960     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 961         urls = orderedSet(
 962             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 963             for m in matches)
 964         return self.playlist_result(
 965             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 966
 967     @staticmethod
 968     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 969         """Returns a playlist"""
 970         video_info = {'_type': 'playlist',
 971                       'entries': entries}
 972         if playlist_id:
 973             video_info['id'] = playlist_id
 974         if playlist_title:
 975             video_info['title'] = playlist_title
 976         if playlist_description:
 977             video_info['description'] = playlist_description
 978         return video_info
 979
 980     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 981         """
 982         Perform a regex search on the given string, using a single or a list of
 983         patterns returning the first matching group.
 984         In case of failure return a default value or raise a WARNING or a
 985         RegexNotFoundError, depending on fatal, specifying the field name.
 986         """
 987         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 988             mobj = re.search(pattern, string, flags)
 989         else:
 990             for p in pattern:
 991                 mobj = re.search(p, string, flags)
 992                 if mobj:
 993                     break
 994
 995         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 996             _name = '\033[0;34m%s\033[0m' % name
 997         else:
 998             _name = name
 999
1000         if mobj:
1001             if group is None:
1002                 # return the first matching group
1003                 return next(g for g in mobj.groups() if g is not None)
1004             else:
1005                 return mobj.group(group)
1006         elif default is not NO_DEFAULT:
1007             return default
1008         elif fatal:
1009             raise RegexNotFoundError('Unable to extract %s' % _name)
1010         else:
1011             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
1012             return None
1013
1014     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1015         """
1016         Like _search_regex, but strips HTML tags and unescapes entities.
1017         """
1018         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1019         if res:
1020             return clean_html(res).strip()
1021         else:
1022             return res
1023
1024     def _get_netrc_login_info(self, netrc_machine=None):
1025         username = None
1026         password = None
1027         netrc_machine = netrc_machine or self._NETRC_MACHINE
1028
1029         if self._downloader.params.get('usenetrc', False):
1030             try:
1031                 info = netrc.netrc().authenticators(netrc_machine)
1032                 if info is not None:
1033                     username = info[0]
1034                     password = info[2]
1035                 else:
1036                     raise netrc.NetrcParseError(
1037                         'No authenticators for %s' % netrc_machine)
1038             except (IOError, netrc.NetrcParseError) as err:
1039                 self._downloader.report_warning(
1040                     'parsing .netrc: %s' % error_to_compat_str(err))
1041
1042         return username, password
1043
1044     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1045         """
1046         Get the login info as (username, password)
1047         First look for the manually specified credentials using username_option
1048         and password_option as keys in params dictionary. If no such credentials
1049         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1050         value.
1051         If there's no info available, return (None, None)
1052         """
1053         if self._downloader is None:
1054             return (None, None)
1055
1056         downloader_params = self._downloader.params
1057
1058         # Attempt to use provided username and password or .netrc data
1059         if downloader_params.get(username_option) is not None:
1060             username = downloader_params[username_option]
1061             password = downloader_params[password_option]
1062         else:
1063             username, password = self._get_netrc_login_info(netrc_machine)
1064
1065         return username, password
1066
1067     def _get_tfa_info(self, note='two-factor verification code'):
1068         """
1069         Get the two-factor authentication info
1070         TODO - asking the user will be required for sms/phone verify
1071         currently just uses the command line option
1072         If there's no info available, return None
1073         """
1074         if self._downloader is None:
1075             return None
1076         downloader_params = self._downloader.params
1077
1078         if downloader_params.get('twofactor') is not None:
1079             return downloader_params['twofactor']
1080
1081         return compat_getpass('Type %s and press [Return]: ' % note)
1082
1083     # Helper functions for extracting OpenGraph info
1084     @staticmethod
1085     def _og_regexes(prop):
1086         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1087         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1088                        % {'prop': re.escape(prop)})
1089         template = r'<meta[^>]+?%s[^>]+?%s'
1090         return [
1091             template % (property_re, content_re),
1092             template % (content_re, property_re),
1093         ]
1094
1095     @staticmethod
1096     def _meta_regex(prop):
1097         return r'''(?isx)<meta
1098                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1099                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1100
1101     def _og_search_property(self, prop, html, name=None, **kargs):
1102         if not isinstance(prop, (list, tuple)):
1103             prop = [prop]
1104         if name is None:
1105             name = 'OpenGraph %s' % prop[0]
1106         og_regexes = []
1107         for p in prop:
1108             og_regexes.extend(self._og_regexes(p))
1109         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1110         if escaped is None:
1111             return None
1112         return unescapeHTML(escaped)
1113
1114     def _og_search_thumbnail(self, html, **kargs):
1115         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1116
1117     def _og_search_description(self, html, **kargs):
1118         return self._og_search_property('description', html, fatal=False, **kargs)
1119
1120     def _og_search_title(self, html, **kargs):
1121         return self._og_search_property('title', html, **kargs)
1122
1123     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1124         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1125         if secure:
1126             regexes = self._og_regexes('video:secure_url') + regexes
1127         return self._html_search_regex(regexes, html, name, **kargs)
1128
1129     def _og_search_url(self, html, **kargs):
1130         return self._og_search_property('url', html, **kargs)
1131
1132     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1133         if not isinstance(name, (list, tuple)):
1134             name = [name]
1135         if display_name is None:
1136             display_name = name[0]
1137         return self._html_search_regex(
1138             [self._meta_regex(n) for n in name],
1139             html, display_name, fatal=fatal, group='content', **kwargs)
1140
1141     def _dc_search_uploader(self, html):
1142         return self._html_search_meta('dc.creator', html, 'uploader')
1143
1144     def _rta_search(self, html):
1145         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1146         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1147                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1148                      html):
1149             return 18
1150         return 0
1151
1152     def _media_rating_search(self, html):
1153         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1154         rating = self._html_search_meta('rating', html)
1155
1156         if not rating:
1157             return None
1158
1159         RATING_TABLE = {
1160             'safe for kids': 0,
1161             'general': 8,
1162             '14 years': 14,
1163             'mature': 17,
1164             'restricted': 19,
1165         }
1166         return RATING_TABLE.get(rating.lower())
1167
1168     def _family_friendly_search(self, html):
1169         # See http://schema.org/VideoObject
1170         family_friendly = self._html_search_meta(
1171             'isFamilyFriendly', html, default=None)
1172
1173         if not family_friendly:
1174             return None
1175
1176         RATING_TABLE = {
1177             '1': 0,
1178             'true': 0,
1179             '0': 18,
1180             'false': 18,
1181         }
1182         return RATING_TABLE.get(family_friendly.lower())
1183
1184     def _twitter_search_player(self, html):
1185         return self._html_search_meta('twitter:player', html,
1186                                       'twitter card player')
1187
1188     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1189         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1190         default = kwargs.get('default', NO_DEFAULT)
1191         # JSON-LD may be malformed and thus `fatal` should be respected.
1192         # At the same time `default` may be passed that assumes `fatal=False`
1193         # for _search_regex. Let's simulate the same behavior here as well.
1194         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1195         json_ld = []
1196         for mobj in json_ld_list:
1197             json_ld_item = self._parse_json(
1198                 mobj.group('json_ld'), video_id, fatal=fatal)
1199             if not json_ld_item:
1200                 continue
1201             if isinstance(json_ld_item, dict):
1202                 json_ld.append(json_ld_item)
1203             elif isinstance(json_ld_item, (list, tuple)):
1204                 json_ld.extend(json_ld_item)
1205         if json_ld:
1206             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1207         if json_ld:
1208             return json_ld
1209         if default is not NO_DEFAULT:
1210             return default
1211         elif fatal:
1212             raise RegexNotFoundError('Unable to extract JSON-LD')
1213         else:
1214             self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1215             return {}
1216
1217     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1218         if isinstance(json_ld, compat_str):
1219             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1220         if not json_ld:
1221             return {}
1222         info = {}
1223         if not isinstance(json_ld, (list, tuple, dict)):
1224             return info
1225         if isinstance(json_ld, dict):
1226             json_ld = [json_ld]
1227
1228         INTERACTION_TYPE_MAP = {
1229             'CommentAction': 'comment',
1230             'AgreeAction': 'like',
1231             'DisagreeAction': 'dislike',
1232             'LikeAction': 'like',
1233             'DislikeAction': 'dislike',
1234             'ListenAction': 'view',
1235             'WatchAction': 'view',
1236             'ViewAction': 'view',
1237         }
1238
1239         def extract_interaction_statistic(e):
1240             interaction_statistic = e.get('interactionStatistic')
1241             if not isinstance(interaction_statistic, list):
1242                 return
1243             for is_e in interaction_statistic:
1244                 if not isinstance(is_e, dict):
1245                     continue
1246                 if is_e.get('@type') != 'InteractionCounter':
1247                     continue
1248                 interaction_type = is_e.get('interactionType')
1249                 if not isinstance(interaction_type, compat_str):
1250                     continue
1251                 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1252                 if interaction_count is None:
1253                     continue
1254                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1255                 if not count_kind:
1256                     continue
1257                 count_key = '%s_count' % count_kind
1258                 if info.get(count_key) is not None:
1259                     continue
1260                 info[count_key] = interaction_count
1261
1262         def extract_video_object(e):
1263             assert e['@type'] == 'VideoObject'
1264             info.update({
1265                 'url': url_or_none(e.get('contentUrl')),
1266                 'title': unescapeHTML(e.get('name')),
1267                 'description': unescapeHTML(e.get('description')),
1268                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1269                 'duration': parse_duration(e.get('duration')),
1270                 'timestamp': unified_timestamp(e.get('uploadDate')),
1271                 'uploader': str_or_none(e.get('author')),
1272                 'filesize': float_or_none(e.get('contentSize')),
1273                 'tbr': int_or_none(e.get('bitrate')),
1274                 'width': int_or_none(e.get('width')),
1275                 'height': int_or_none(e.get('height')),
1276                 'view_count': int_or_none(e.get('interactionCount')),
1277             })
1278             extract_interaction_statistic(e)
1279
1280         for e in json_ld:
1281             if '@context' in e:
1282                 item_type = e.get('@type')
1283                 if expected_type is not None and expected_type != item_type:
1284                     continue
1285                 if item_type in ('TVEpisode', 'Episode'):
1286                     episode_name = unescapeHTML(e.get('name'))
1287                     info.update({
1288                         'episode': episode_name,
1289                         'episode_number': int_or_none(e.get('episodeNumber')),
1290                         'description': unescapeHTML(e.get('description')),
1291                     })
1292                     if not info.get('title') and episode_name:
1293                         info['title'] = episode_name
1294                     part_of_season = e.get('partOfSeason')
1295                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1296                         info.update({
1297                             'season': unescapeHTML(part_of_season.get('name')),
1298                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1299                         })
1300                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1301                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1302                         info['series'] = unescapeHTML(part_of_series.get('name'))
1303                 elif item_type == 'Movie':
1304                     info.update({
1305                         'title': unescapeHTML(e.get('name')),
1306                         'description': unescapeHTML(e.get('description')),
1307                         'duration': parse_duration(e.get('duration')),
1308                         'timestamp': unified_timestamp(e.get('dateCreated')),
1309                     })
1310                 elif item_type in ('Article', 'NewsArticle'):
1311                     info.update({
1312                         'timestamp': parse_iso8601(e.get('datePublished')),
1313                         'title': unescapeHTML(e.get('headline')),
1314                         'description': unescapeHTML(e.get('articleBody')),
1315                     })
1316                 elif item_type == 'VideoObject':
1317                     extract_video_object(e)
1318                     if expected_type is None:
1319                         continue
1320                     else:
1321                         break
1322                 video = e.get('video')
1323                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1324                     extract_video_object(video)
1325                 if expected_type is None:
1326                     continue
1327                 else:
1328                     break
1329         return dict((k, v) for k, v in info.items() if v is not None)
1330
1331     @staticmethod
1332     def _hidden_inputs(html):
1333         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1334         hidden_inputs = {}
1335         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1336             attrs = extract_attributes(input)
1337             if not input:
1338                 continue
1339             if attrs.get('type') not in ('hidden', 'submit'):
1340                 continue
1341             name = attrs.get('name') or attrs.get('id')
1342             value = attrs.get('value')
1343             if name and value is not None:
1344                 hidden_inputs[name] = value
1345         return hidden_inputs
1346
1347     def _form_hidden_inputs(self, form_id, html):
1348         form = self._search_regex(
1349             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1350             html, '%s form' % form_id, group='form')
1351         return self._hidden_inputs(form)
1352
1353     def _sort_formats(self, formats, field_preference=None):
1354         if not formats:
1355             raise ExtractorError('No video formats found')
1356
1357         for f in formats:
1358             # Automatically determine tbr when missing based on abr and vbr (improves
1359             # formats sorting in some cases)
1360             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1361                 f['tbr'] = f['abr'] + f['vbr']
1362
1363         def _formats_key(f):
1364             # TODO remove the following workaround
1365             from ..utils import determine_ext
1366             if not f.get('ext') and 'url' in f:
1367                 f['ext'] = determine_ext(f['url'])
1368
1369             if isinstance(field_preference, (list, tuple)):
1370                 return tuple(
1371                     f.get(field)
1372                     if f.get(field) is not None
1373                     else ('' if field == 'format_id' else -1)
1374                     for field in field_preference)
1375
1376             preference = f.get('preference')
1377             if preference is None:
1378                 preference = 0
1379                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1380                     preference -= 0.5
1381
1382             protocol = f.get('protocol') or determine_protocol(f)
1383             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1384
1385             if f.get('vcodec') == 'none':  # audio only
1386                 preference -= 50
1387                 if self._downloader.params.get('prefer_free_formats'):
1388                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1389                 else:
1390                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1391                 ext_preference = 0
1392                 try:
1393                     audio_ext_preference = ORDER.index(f['ext'])
1394                 except ValueError:
1395                     audio_ext_preference = -1
1396             else:
1397                 if f.get('acodec') == 'none':  # video only
1398                     preference -= 40
1399                 if self._downloader.params.get('prefer_free_formats'):
1400                     ORDER = ['flv', 'mp4', 'webm']
1401                 else:
1402                     ORDER = ['webm', 'flv', 'mp4']
1403                 try:
1404                     ext_preference = ORDER.index(f['ext'])
1405                 except ValueError:
1406                     ext_preference = -1
1407                 audio_ext_preference = 0
1408
1409             return (
1410                 preference,
1411                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1412                 f.get('quality') if f.get('quality') is not None else -1,
1413                 f.get('tbr') if f.get('tbr') is not None else -1,
1414                 f.get('filesize') if f.get('filesize') is not None else -1,
1415                 f.get('vbr') if f.get('vbr') is not None else -1,
1416                 f.get('height') if f.get('height') is not None else -1,
1417                 f.get('width') if f.get('width') is not None else -1,
1418                 proto_preference,
1419                 ext_preference,
1420                 f.get('abr') if f.get('abr') is not None else -1,
1421                 audio_ext_preference,
1422                 f.get('fps') if f.get('fps') is not None else -1,
1423                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1424                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1425                 f.get('format_id') if f.get('format_id') is not None else '',
1426             )
1427         formats.sort(key=_formats_key)
1428
1429     def _check_formats(self, formats, video_id):
1430         if formats:
1431             formats[:] = filter(
1432                 lambda f: self._is_valid_url(
1433                     f['url'], video_id,
1434                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1435                 formats)
1436
1437     @staticmethod
1438     def _remove_duplicate_formats(formats):
1439         format_urls = set()
1440         unique_formats = []
1441         for f in formats:
1442             if f['url'] not in format_urls:
1443                 format_urls.add(f['url'])
1444                 unique_formats.append(f)
1445         formats[:] = unique_formats
1446
1447     def _is_valid_url(self, url, video_id, item='video', headers={}):
1448         url = self._proto_relative_url(url, scheme='http:')
1449         # For now assume non HTTP(S) URLs always valid
1450         if not (url.startswith('http://') or url.startswith('https://')):
1451             return True
1452         try:
1453             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1454             return True
1455         except ExtractorError:
1456             self.to_screen(
1457                 '%s: %s URL is invalid, skipping' % (video_id, item))
1458             return False
1459
1460     def http_scheme(self):
1461         """ Either "http:" or "https:", depending on the user's preferences """
1462         return (
1463             'http:'
1464             if self._downloader.params.get('prefer_insecure', False)
1465             else 'https:')
1466
1467     def _proto_relative_url(self, url, scheme=None):
1468         if url is None:
1469             return url
1470         if url.startswith('//'):
1471             if scheme is None:
1472                 scheme = self.http_scheme()
1473             return scheme + url
1474         else:
1475             return url
1476
1477     def _sleep(self, timeout, video_id, msg_template=None):
1478         if msg_template is None:
1479             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1480         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1481         self.to_screen(msg)
1482         time.sleep(timeout)
1483
1484     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1485                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1486                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1487         manifest = self._download_xml(
1488             manifest_url, video_id, 'Downloading f4m manifest',
1489             'Unable to download f4m manifest',
1490             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1491             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1492             transform_source=transform_source,
1493             fatal=fatal, data=data, headers=headers, query=query)
1494
1495         if manifest is False:
1496             return []
1497
1498         return self._parse_f4m_formats(
1499             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1500             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1501
1502     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1503                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1504                            fatal=True, m3u8_id=None):
1505         if not isinstance(manifest, compat_etree_Element) and not fatal:
1506             return []
1507
1508         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1509         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1510         if akamai_pv is not None and ';' in akamai_pv.text:
1511             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1512             if playerVerificationChallenge.strip() != '':
1513                 return []
1514
1515         formats = []
1516         manifest_version = '1.0'
1517         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1518         if not media_nodes:
1519             manifest_version = '2.0'
1520             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1521         # Remove unsupported DRM protected media from final formats
1522         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1523         media_nodes = remove_encrypted_media(media_nodes)
1524         if not media_nodes:
1525             return formats
1526
1527         manifest_base_url = get_base_url(manifest)
1528
1529         bootstrap_info = xpath_element(
1530             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1531             'bootstrap info', default=None)
1532
1533         vcodec = None
1534         mime_type = xpath_text(
1535             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1536             'base URL', default=None)
1537         if mime_type and mime_type.startswith('audio/'):
1538             vcodec = 'none'
1539
1540         for i, media_el in enumerate(media_nodes):
1541             tbr = int_or_none(media_el.attrib.get('bitrate'))
1542             width = int_or_none(media_el.attrib.get('width'))
1543             height = int_or_none(media_el.attrib.get('height'))
1544             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1545             # If <bootstrapInfo> is present, the specified f4m is a
1546             # stream-level manifest, and only set-level manifests may refer to
1547             # external resources.  See section 11.4 and section 4 of F4M spec
1548             if bootstrap_info is None:
1549                 media_url = None
1550                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1551                 if manifest_version == '2.0':
1552                     media_url = media_el.attrib.get('href')
1553                 if media_url is None:
1554                     media_url = media_el.attrib.get('url')
1555                 if not media_url:
1556                     continue
1557                 manifest_url = (
1558                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1559                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1560                 # If media_url is itself a f4m manifest do the recursive extraction
1561                 # since bitrates in parent manifest (this one) and media_url manifest
1562                 # may differ leading to inability to resolve the format by requested
1563                 # bitrate in f4m downloader
1564                 ext = determine_ext(manifest_url)
1565                 if ext == 'f4m':
1566                     f4m_formats = self._extract_f4m_formats(
1567                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1568                         transform_source=transform_source, fatal=fatal)
1569                     # Sometimes stream-level manifest contains single media entry that
1570                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1571                     # At the same time parent's media entry in set-level manifest may
1572                     # contain it. We will copy it from parent in such cases.
1573                     if len(f4m_formats) == 1:
1574                         f = f4m_formats[0]
1575                         f.update({
1576                             'tbr': f.get('tbr') or tbr,
1577                             'width': f.get('width') or width,
1578                             'height': f.get('height') or height,
1579                             'format_id': f.get('format_id') if not tbr else format_id,
1580                             'vcodec': vcodec,
1581                         })
1582                     formats.extend(f4m_formats)
1583                     continue
1584                 elif ext == 'm3u8':
1585                     formats.extend(self._extract_m3u8_formats(
1586                         manifest_url, video_id, 'mp4', preference=preference,
1587                         m3u8_id=m3u8_id, fatal=fatal))
1588                     continue
1589             formats.append({
1590                 'format_id': format_id,
1591                 'url': manifest_url,
1592                 'manifest_url': manifest_url,
1593                 'ext': 'flv' if bootstrap_info is not None else None,
1594                 'protocol': 'f4m',
1595                 'tbr': tbr,
1596                 'width': width,
1597                 'height': height,
1598                 'vcodec': vcodec,
1599                 'preference': preference,
1600             })
1601         return formats
1602
1603     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1604         return {
1605             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1606             'url': m3u8_url,
1607             'ext': ext,
1608             'protocol': 'm3u8',
1609             'preference': preference - 100 if preference else -100,
1610             'resolution': 'multiple',
1611             'format_note': 'Quality selection URL',
1612         }
1613
1614     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1615                               entry_protocol='m3u8', preference=None,
1616                               m3u8_id=None, note=None, errnote=None,
1617                               fatal=True, live=False, data=None, headers={},
1618                               query={}):
1619         res = self._download_webpage_handle(
1620             m3u8_url, video_id,
1621             note=note or 'Downloading m3u8 information',
1622             errnote=errnote or 'Failed to download m3u8 information',
1623             fatal=fatal, data=data, headers=headers, query=query)
1624
1625         if res is False:
1626             return []
1627
1628         m3u8_doc, urlh = res
1629         m3u8_url = urlh.geturl()
1630
1631         return self._parse_m3u8_formats(
1632             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1633             preference=preference, m3u8_id=m3u8_id, live=live)
1634
1635     def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1636                             entry_protocol='m3u8', preference=None,
1637                             m3u8_id=None, live=False):
1638         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1639             return []
1640
1641         if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
1642             return []
1643
1644         formats = []
1645
1646         format_url = lambda u: (
1647             u
1648             if re.match(r'^https?://', u)
1649             else compat_urlparse.urljoin(m3u8_url, u))
1650
1651         # References:
1652         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1653         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1654         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1655
1656         # We should try extracting formats only from master playlists [1, 4.3.4],
1657         # i.e. playlists that describe available qualities. On the other hand
1658         # media playlists [1, 4.3.3] should be returned as is since they contain
1659         # just the media without qualities renditions.
1660         # Fortunately, master playlist can be easily distinguished from media
1661         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1662         # master playlist tags MUST NOT appear in a media playist and vice versa.
1663         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1664         # media playlist and MUST NOT appear in master playlist thus we can
1665         # clearly detect media playlist with this criterion.
1666
1667         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1668             return [{
1669                 'url': m3u8_url,
1670                 'format_id': m3u8_id,
1671                 'ext': ext,
1672                 'protocol': entry_protocol,
1673                 'preference': preference,
1674             }]
1675
1676         groups = {}
1677         last_stream_inf = {}
1678
1679         def extract_media(x_media_line):
1680             media = parse_m3u8_attributes(x_media_line)
1681             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1682             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1683             if not (media_type and group_id and name):
1684                 return
1685             groups.setdefault(group_id, []).append(media)
1686             if media_type not in ('VIDEO', 'AUDIO'):
1687                 return
1688             media_url = media.get('URI')
1689             if media_url:
1690                 format_id = []
1691                 for v in (m3u8_id, group_id, name):
1692                     if v:
1693                         format_id.append(v)
1694                 f = {
1695                     'format_id': '-'.join(format_id),
1696                     'url': format_url(media_url),
1697                     'manifest_url': m3u8_url,
1698                     'language': media.get('LANGUAGE'),
1699                     'ext': ext,
1700                     'protocol': entry_protocol,
1701                     'preference': preference,
1702                 }
1703                 if media_type == 'AUDIO':
1704                     f['vcodec'] = 'none'
1705                 formats.append(f)
1706
1707         def build_stream_name():
1708             # Despite specification does not mention NAME attribute for
1709             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1710             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1711             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1712             stream_name = last_stream_inf.get('NAME')
1713             if stream_name:
1714                 return stream_name
1715             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1716             # from corresponding rendition group
1717             stream_group_id = last_stream_inf.get('VIDEO')
1718             if not stream_group_id:
1719                 return
1720             stream_group = groups.get(stream_group_id)
1721             if not stream_group:
1722                 return stream_group_id
1723             rendition = stream_group[0]
1724             return rendition.get('NAME') or stream_group_id
1725
1726         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
1727         # chance to detect video only formats when EXT-X-STREAM-INF tags
1728         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
1729         for line in m3u8_doc.splitlines():
1730             if line.startswith('#EXT-X-MEDIA:'):
1731                 extract_media(line)
1732
1733         for line in m3u8_doc.splitlines():
1734             if line.startswith('#EXT-X-STREAM-INF:'):
1735                 last_stream_inf = parse_m3u8_attributes(line)
1736             elif line.startswith('#') or not line.strip():
1737                 continue
1738             else:
1739                 tbr = float_or_none(
1740                     last_stream_inf.get('AVERAGE-BANDWIDTH')
1741                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
1742                 format_id = []
1743                 if m3u8_id:
1744                     format_id.append(m3u8_id)
1745                 stream_name = build_stream_name()
1746                 # Bandwidth of live streams may differ over time thus making
1747                 # format_id unpredictable. So it's better to keep provided
1748                 # format_id intact.
1749                 if not live:
1750                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1751                 manifest_url = format_url(line.strip())
1752                 f = {
1753                     'format_id': '-'.join(format_id),
1754                     'url': manifest_url,
1755                     'manifest_url': m3u8_url,
1756                     'tbr': tbr,
1757                     'ext': ext,
1758                     'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1759                     'protocol': entry_protocol,
1760                     'preference': preference,
1761                 }
1762                 resolution = last_stream_inf.get('RESOLUTION')
1763                 if resolution:
1764                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1765                     if mobj:
1766                         f['width'] = int(mobj.group('width'))
1767                         f['height'] = int(mobj.group('height'))
1768                 # Unified Streaming Platform
1769                 mobj = re.search(
1770                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1771                 if mobj:
1772                     abr, vbr = mobj.groups()
1773                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1774                     f.update({
1775                         'vbr': vbr,
1776                         'abr': abr,
1777                     })
1778                 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1779                 f.update(codecs)
1780                 audio_group_id = last_stream_inf.get('AUDIO')
1781                 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1782                 # references a rendition group MUST have a CODECS attribute.
1783                 # However, this is not always respected, for example, [2]
1784                 # contains EXT-X-STREAM-INF tag which references AUDIO
1785                 # rendition group but does not have CODECS and despite
1786                 # referencing an audio group it represents a complete
1787                 # (with audio and video) format. So, for such cases we will
1788                 # ignore references to rendition groups and treat them
1789                 # as complete formats.
1790                 if audio_group_id and codecs and f.get('vcodec') != 'none':
1791                     audio_group = groups.get(audio_group_id)
1792                     if audio_group and audio_group[0].get('URI'):
1793                         # TODO: update acodec for audio only formats with
1794                         # the same GROUP-ID
1795                         f['acodec'] = 'none'
1796                 formats.append(f)
1797
1798                 # for DailyMotion
1799                 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
1800                 if progressive_uri:
1801                     http_f = f.copy()
1802                     del http_f['manifest_url']
1803                     http_f.update({
1804                         'format_id': f['format_id'].replace('hls-', 'http-'),
1805                         'protocol': 'http',
1806                         'url': progressive_uri,
1807                     })
1808                     formats.append(http_f)
1809
1810                 last_stream_inf = {}
1811         return formats
1812
1813     @staticmethod
1814     def _xpath_ns(path, namespace=None):
1815         if not namespace:
1816             return path
1817         out = []
1818         for c in path.split('/'):
1819             if not c or c == '.':
1820                 out.append(c)
1821             else:
1822                 out.append('{%s}%s' % (namespace, c))
1823         return '/'.join(out)
1824
1825     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1826         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1827
1828         if smil is False:
1829             assert not fatal
1830             return []
1831
1832         namespace = self._parse_smil_namespace(smil)
1833
1834         return self._parse_smil_formats(
1835             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1836
1837     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1838         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1839         if smil is False:
1840             return {}
1841         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1842
1843     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1844         return self._download_xml(
1845             smil_url, video_id, 'Downloading SMIL file',
1846             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1847
1848     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1849         namespace = self._parse_smil_namespace(smil)
1850
1851         formats = self._parse_smil_formats(
1852             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1853         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1854
1855         video_id = os.path.splitext(url_basename(smil_url))[0]
1856         title = None
1857         description = None
1858         upload_date = None
1859         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1860             name = meta.attrib.get('name')
1861             content = meta.attrib.get('content')
1862             if not name or not content:
1863                 continue
1864             if not title and name == 'title':
1865                 title = content
1866             elif not description and name in ('description', 'abstract'):
1867                 description = content
1868             elif not upload_date and name == 'date':
1869                 upload_date = unified_strdate(content)
1870
1871         thumbnails = [{
1872             'id': image.get('type'),
1873             'url': image.get('src'),
1874             'width': int_or_none(image.get('width')),
1875             'height': int_or_none(image.get('height')),
1876         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1877
1878         return {
1879             'id': video_id,
1880             'title': title or video_id,
1881             'description': description,
1882             'upload_date': upload_date,
1883             'thumbnails': thumbnails,
1884             'formats': formats,
1885             'subtitles': subtitles,
1886         }
1887
1888     def _parse_smil_namespace(self, smil):
1889         return self._search_regex(
1890             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1891
1892     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1893         base = smil_url
1894         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1895             b = meta.get('base') or meta.get('httpBase')
1896             if b:
1897                 base = b
1898                 break
1899
1900         formats = []
1901         rtmp_count = 0
1902         http_count = 0
1903         m3u8_count = 0
1904
1905         srcs = []
1906         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1907         for medium in media:
1908             src = medium.get('src')
1909             if not src or src in srcs:
1910                 continue
1911             srcs.append(src)
1912
1913             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1914             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1915             width = int_or_none(medium.get('width'))
1916             height = int_or_none(medium.get('height'))
1917             proto = medium.get('proto')
1918             ext = medium.get('ext')
1919             src_ext = determine_ext(src)
1920             streamer = medium.get('streamer') or base
1921
1922             if proto == 'rtmp' or streamer.startswith('rtmp'):
1923                 rtmp_count += 1
1924                 formats.append({
1925                     'url': streamer,
1926                     'play_path': src,
1927                     'ext': 'flv',
1928                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1929                     'tbr': bitrate,
1930                     'filesize': filesize,
1931                     'width': width,
1932                     'height': height,
1933                 })
1934                 if transform_rtmp_url:
1935                     streamer, src = transform_rtmp_url(streamer, src)
1936                     formats[-1].update({
1937                         'url': streamer,
1938                         'play_path': src,
1939                     })
1940                 continue
1941
1942             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1943             src_url = src_url.strip()
1944
1945             if proto == 'm3u8' or src_ext == 'm3u8':
1946                 m3u8_formats = self._extract_m3u8_formats(
1947                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1948                 if len(m3u8_formats) == 1:
1949                     m3u8_count += 1
1950                     m3u8_formats[0].update({
1951                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1952                         'tbr': bitrate,
1953                         'width': width,
1954                         'height': height,
1955                     })
1956                 formats.extend(m3u8_formats)
1957             elif src_ext == 'f4m':
1958                 f4m_url = src_url
1959                 if not f4m_params:
1960                     f4m_params = {
1961                         'hdcore': '3.2.0',
1962                         'plugin': 'flowplayer-3.2.0.1',
1963                     }
1964                 f4m_url += '&' if '?' in f4m_url else '?'
1965                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1966                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1967             elif src_ext == 'mpd':
1968                 formats.extend(self._extract_mpd_formats(
1969                     src_url, video_id, mpd_id='dash', fatal=False))
1970             elif re.search(r'\.ism/[Mm]anifest', src_url):
1971                 formats.extend(self._extract_ism_formats(
1972                     src_url, video_id, ism_id='mss', fatal=False))
1973             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1974                 http_count += 1
1975                 formats.append({
1976                     'url': src_url,
1977                     'ext': ext or src_ext or 'flv',
1978                     'format_id': 'http-%d' % (bitrate or http_count),
1979                     'tbr': bitrate,
1980                     'filesize': filesize,
1981                     'width': width,
1982                     'height': height,
1983                 })
1984
1985         return formats
1986
1987     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1988         urls = []
1989         subtitles = {}
1990         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1991             src = textstream.get('src')
1992             if not src or src in urls:
1993                 continue
1994             urls.append(src)
1995             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1996             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1997             subtitles.setdefault(lang, []).append({
1998                 'url': src,
1999                 'ext': ext,
2000             })
2001         return subtitles
2002
2003     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2004         xspf = self._download_xml(
2005             xspf_url, playlist_id, 'Downloading xpsf playlist',
2006             'Unable to download xspf manifest', fatal=fatal)
2007         if xspf is False:
2008             return []
2009         return self._parse_xspf(
2010             xspf, playlist_id, xspf_url=xspf_url,
2011             xspf_base_url=base_url(xspf_url))
2012
2013     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2014         NS_MAP = {
2015             'xspf': 'http://xspf.org/ns/0/',
2016             's1': 'http://static.streamone.nl/player/ns/0',
2017         }
2018
2019         entries = []
2020         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2021             title = xpath_text(
2022                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2023             description = xpath_text(
2024                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2025             thumbnail = xpath_text(
2026                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2027             duration = float_or_none(
2028                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2029
2030             formats = []
2031             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2032                 format_url = urljoin(xspf_base_url, location.text)
2033                 if not format_url:
2034                     continue
2035                 formats.append({
2036                     'url': format_url,
2037                     'manifest_url': xspf_url,
2038                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2039                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2040                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2041                 })
2042             self._sort_formats(formats)
2043
2044             entries.append({
2045                 'id': playlist_id,
2046                 'title': title,
2047                 'description': description,
2048                 'thumbnail': thumbnail,
2049                 'duration': duration,
2050                 'formats': formats,
2051             })
2052         return entries
2053
2054     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}):
2055         res = self._download_xml_handle(
2056             mpd_url, video_id,
2057             note=note or 'Downloading MPD manifest',
2058             errnote=errnote or 'Failed to download MPD manifest',
2059             fatal=fatal, data=data, headers=headers, query=query)
2060         if res is False:
2061             return []
2062         mpd_doc, urlh = res
2063         if mpd_doc is None:
2064             return []
2065         mpd_base_url = base_url(urlh.geturl())
2066
2067         return self._parse_mpd_formats(
2068             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
2069             formats_dict=formats_dict, mpd_url=mpd_url)
2070
2071     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
2072         """
2073         Parse formats from MPD manifest.
2074         References:
2075          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2076             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2077          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2078         """
2079         if mpd_doc.get('type') == 'dynamic':
2080             return []
2081
2082         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2083
2084         def _add_ns(path):
2085             return self._xpath_ns(path, namespace)
2086
2087         def is_drm_protected(element):
2088             return element.find(_add_ns('ContentProtection')) is not None
2089
2090         def extract_multisegment_info(element, ms_parent_info):
2091             ms_info = ms_parent_info.copy()
2092
2093             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2094             # common attributes and elements.  We will only extract relevant
2095             # for us.
2096             def extract_common(source):
2097                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2098                 if segment_timeline is not None:
2099                     s_e = segment_timeline.findall(_add_ns('S'))
2100                     if s_e:
2101                         ms_info['total_number'] = 0
2102                         ms_info['s'] = []
2103                         for s in s_e:
2104                             r = int(s.get('r', 0))
2105                             ms_info['total_number'] += 1 + r
2106                             ms_info['s'].append({
2107                                 't': int(s.get('t', 0)),
2108                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2109                                 'd': int(s.attrib['d']),
2110                                 'r': r,
2111                             })
2112                 start_number = source.get('startNumber')
2113                 if start_number:
2114                     ms_info['start_number'] = int(start_number)
2115                 timescale = source.get('timescale')
2116                 if timescale:
2117                     ms_info['timescale'] = int(timescale)
2118                 segment_duration = source.get('duration')
2119                 if segment_duration:
2120                     ms_info['segment_duration'] = float(segment_duration)
2121
2122             def extract_Initialization(source):
2123                 initialization = source.find(_add_ns('Initialization'))
2124                 if initialization is not None:
2125                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2126
2127             segment_list = element.find(_add_ns('SegmentList'))
2128             if segment_list is not None:
2129                 extract_common(segment_list)
2130                 extract_Initialization(segment_list)
2131                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2132                 if segment_urls_e:
2133                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2134             else:
2135                 segment_template = element.find(_add_ns('SegmentTemplate'))
2136                 if segment_template is not None:
2137                     extract_common(segment_template)
2138                     media = segment_template.get('media')
2139                     if media:
2140                         ms_info['media'] = media
2141                     initialization = segment_template.get('initialization')
2142                     if initialization:
2143                         ms_info['initialization'] = initialization
2144                     else:
2145                         extract_Initialization(segment_template)
2146             return ms_info
2147
2148         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2149         formats = []
2150         for period in mpd_doc.findall(_add_ns('Period')):
2151             period_duration = parse_duration(period.get('duration')) or mpd_duration
2152             period_ms_info = extract_multisegment_info(period, {
2153                 'start_number': 1,
2154                 'timescale': 1,
2155             })
2156             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2157                 if is_drm_protected(adaptation_set):
2158                     continue
2159                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2160                 for representation in adaptation_set.findall(_add_ns('Representation')):
2161                     if is_drm_protected(representation):
2162                         continue
2163                     representation_attrib = adaptation_set.attrib.copy()
2164                     representation_attrib.update(representation.attrib)
2165                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2166                     mime_type = representation_attrib['mimeType']
2167                     content_type = mime_type.split('/')[0]
2168                     if content_type == 'text':
2169                         # TODO implement WebVTT downloading
2170                         pass
2171                     elif content_type in ('video', 'audio'):
2172                         base_url = ''
2173                         for element in (representation, adaptation_set, period, mpd_doc):
2174                             base_url_e = element.find(_add_ns('BaseURL'))
2175                             if base_url_e is not None:
2176                                 base_url = base_url_e.text + base_url
2177                                 if re.match(r'^https?://', base_url):
2178                                     break
2179                         if mpd_base_url and not re.match(r'^https?://', base_url):
2180                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2181                                 mpd_base_url += '/'
2182                             base_url = mpd_base_url + base_url
2183                         representation_id = representation_attrib.get('id')
2184                         lang = representation_attrib.get('lang')
2185                         url_el = representation.find(_add_ns('BaseURL'))
2186                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2187                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2188                         f = {
2189                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2190                             'manifest_url': mpd_url,
2191                             'ext': mimetype2ext(mime_type),
2192                             'width': int_or_none(representation_attrib.get('width')),
2193                             'height': int_or_none(representation_attrib.get('height')),
2194                             'tbr': float_or_none(bandwidth, 1000),
2195                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2196                             'fps': int_or_none(representation_attrib.get('frameRate')),
2197                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2198                             'format_note': 'DASH %s' % content_type,
2199                             'filesize': filesize,
2200                             'container': mimetype2ext(mime_type) + '_dash',
2201                         }
2202                         f.update(parse_codecs(representation_attrib.get('codecs')))
2203                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2204
2205                         def prepare_template(template_name, identifiers):
2206                             tmpl = representation_ms_info[template_name]
2207                             # First of, % characters outside $...$ templates
2208                             # must be escaped by doubling for proper processing
2209                             # by % operator string formatting used further (see
2210                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
2211                             t = ''
2212                             in_template = False
2213                             for c in tmpl:
2214                                 t += c
2215                                 if c == '$':
2216                                     in_template = not in_template
2217                                 elif c == '%' and not in_template:
2218                                     t += c
2219                             # Next, $...$ templates are translated to their
2220                             # %(...) counterparts to be used with % operator
2221                             t = t.replace('$RepresentationID$', representation_id)
2222                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2223                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2224                             t.replace('$$', '$')
2225                             return t
2226
2227                         # @initialization is a regular template like @media one
2228                         # so it should be handled just the same way (see
2229                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
2230                         if 'initialization' in representation_ms_info:
2231                             initialization_template = prepare_template(
2232                                 'initialization',
2233                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2234                                 # $Time$ shall not be included for @initialization thus
2235                                 # only $Bandwidth$ remains
2236                                 ('Bandwidth', ))
2237                             representation_ms_info['initialization_url'] = initialization_template % {
2238                                 'Bandwidth': bandwidth,
2239                             }
2240
2241                         def location_key(location):
2242                             return 'url' if re.match(r'^https?://', location) else 'path'
2243
2244                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2245
2246                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2247                             media_location_key = location_key(media_template)
2248
2249                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2250                             # can't be used at the same time
2251                             if '%(Number' in media_template and 's' not in representation_ms_info:
2252                                 segment_duration = None
2253                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2254                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2255                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2256                                 representation_ms_info['fragments'] = [{
2257                                     media_location_key: media_template % {
2258                                         'Number': segment_number,
2259                                         'Bandwidth': bandwidth,
2260                                     },
2261                                     'duration': segment_duration,
2262                                 } for segment_number in range(
2263                                     representation_ms_info['start_number'],
2264                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2265                             else:
2266                                 # $Number*$ or $Time$ in media template with S list available
2267                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2268                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2269                                 representation_ms_info['fragments'] = []
2270                                 segment_time = 0
2271                                 segment_d = None
2272                                 segment_number = representation_ms_info['start_number']
2273
2274                                 def add_segment_url():
2275                                     segment_url = media_template % {
2276                                         'Time': segment_time,
2277                                         'Bandwidth': bandwidth,
2278                                         'Number': segment_number,
2279                                     }
2280                                     representation_ms_info['fragments'].append({
2281                                         media_location_key: segment_url,
2282                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2283                                     })
2284
2285                                 for num, s in enumerate(representation_ms_info['s']):
2286                                     segment_time = s.get('t') or segment_time
2287                                     segment_d = s['d']
2288                                     add_segment_url()
2289                                     segment_number += 1
2290                                     for r in range(s.get('r', 0)):
2291                                         segment_time += segment_d
2292                                         add_segment_url()
2293                                         segment_number += 1
2294                                     segment_time += segment_d
2295                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2296                             # No media template
2297                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2298                             # or any YouTube dashsegments video
2299                             fragments = []
2300                             segment_index = 0
2301                             timescale = representation_ms_info['timescale']
2302                             for s in representation_ms_info['s']:
2303                                 duration = float_or_none(s['d'], timescale)
2304                                 for r in range(s.get('r', 0) + 1):
2305                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2306                                     fragments.append({
2307                                         location_key(segment_uri): segment_uri,
2308                                         'duration': duration,
2309                                     })
2310                                     segment_index += 1
2311                             representation_ms_info['fragments'] = fragments
2312                         elif 'segment_urls' in representation_ms_info:
2313                             # Segment URLs with no SegmentTimeline
2314                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2315                             # https://github.com/ytdl-org/youtube-dl/pull/14844
2316                             fragments = []
2317                             segment_duration = float_or_none(
2318                                 representation_ms_info['segment_duration'],
2319                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2320                             for segment_url in representation_ms_info['segment_urls']:
2321                                 fragment = {
2322                                     location_key(segment_url): segment_url,
2323                                 }
2324                                 if segment_duration:
2325                                     fragment['duration'] = segment_duration
2326                                 fragments.append(fragment)
2327                             representation_ms_info['fragments'] = fragments
2328                         # If there is a fragments key available then we correctly recognized fragmented media.
2329                         # Otherwise we will assume unfragmented media with direct access. Technically, such
2330                         # assumption is not necessarily correct since we may simply have no support for
2331                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2332                         if 'fragments' in representation_ms_info:
2333                             f.update({
2334                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2335                                 'url': mpd_url or base_url,
2336                                 'fragment_base_url': base_url,
2337                                 'fragments': [],
2338                                 'protocol': 'http_dash_segments',
2339                             })
2340                             if 'initialization_url' in representation_ms_info:
2341                                 initialization_url = representation_ms_info['initialization_url']
2342                                 if not f.get('url'):
2343                                     f['url'] = initialization_url
2344                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2345                             f['fragments'].extend(representation_ms_info['fragments'])
2346                         else:
2347                             # Assuming direct URL to unfragmented media.
2348                             f['url'] = base_url
2349
2350                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2351                         # is not necessarily unique within a Period thus formats with
2352                         # the same `format_id` are quite possible. There are numerous examples
2353                         # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
2354                         # https://github.com/ytdl-org/youtube-dl/issues/13919)
2355                         full_info = formats_dict.get(representation_id, {}).copy()
2356                         full_info.update(f)
2357                         formats.append(full_info)
2358                     else:
2359                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2360         return formats
2361
2362     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2363         res = self._download_xml_handle(
2364             ism_url, video_id,
2365             note=note or 'Downloading ISM manifest',
2366             errnote=errnote or 'Failed to download ISM manifest',
2367             fatal=fatal, data=data, headers=headers, query=query)
2368         if res is False:
2369             return []
2370         ism_doc, urlh = res
2371         if ism_doc is None:
2372             return []
2373
2374         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2375
2376     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2377         """
2378         Parse formats from ISM manifest.
2379         References:
2380          1. [MS-SSTR]: Smooth Streaming Protocol,
2381             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2382         """
2383         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2384             return []
2385
2386         duration = int(ism_doc.attrib['Duration'])
2387         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2388
2389         formats = []
2390         for stream in ism_doc.findall('StreamIndex'):
2391             stream_type = stream.get('Type')
2392             if stream_type not in ('video', 'audio'):
2393                 continue
2394             url_pattern = stream.attrib['Url']
2395             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2396             stream_name = stream.get('Name')
2397             for track in stream.findall('QualityLevel'):
2398                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2399                 # TODO: add support for WVC1 and WMAP
2400                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2401                     self.report_warning('%s is not a supported codec' % fourcc)
2402                     continue
2403                 tbr = int(track.attrib['Bitrate']) // 1000
2404                 # [1] does not mention Width and Height attributes. However,
2405                 # they're often present while MaxWidth and MaxHeight are
2406                 # missing, so should be used as fallbacks
2407                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2408                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2409                 sampling_rate = int_or_none(track.get('SamplingRate'))
2410
2411                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2412                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2413
2414                 fragments = []
2415                 fragment_ctx = {
2416                     'time': 0,
2417                 }
2418                 stream_fragments = stream.findall('c')
2419                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2420                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2421                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2422                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2423                     if not fragment_ctx['duration']:
2424                         try:
2425                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2426                         except IndexError:
2427                             next_fragment_time = duration
2428                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2429                     for _ in range(fragment_repeat):
2430                         fragments.append({
2431                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2432                             'duration': fragment_ctx['duration'] / stream_timescale,
2433                         })
2434                         fragment_ctx['time'] += fragment_ctx['duration']
2435
2436                 format_id = []
2437                 if ism_id:
2438                     format_id.append(ism_id)
2439                 if stream_name:
2440                     format_id.append(stream_name)
2441                 format_id.append(compat_str(tbr))
2442
2443                 formats.append({
2444                     'format_id': '-'.join(format_id),
2445                     'url': ism_url,
2446                     'manifest_url': ism_url,
2447                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2448                     'width': width,
2449                     'height': height,
2450                     'tbr': tbr,
2451                     'asr': sampling_rate,
2452                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2453                     'acodec': 'none' if stream_type == 'video' else fourcc,
2454                     'protocol': 'ism',
2455                     'fragments': fragments,
2456                     '_download_params': {
2457                         'duration': duration,
2458                         'timescale': stream_timescale,
2459                         'width': width or 0,
2460                         'height': height or 0,
2461                         'fourcc': fourcc,
2462                         'codec_private_data': track.get('CodecPrivateData'),
2463                         'sampling_rate': sampling_rate,
2464                         'channels': int_or_none(track.get('Channels', 2)),
2465                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2466                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2467                     },
2468                 })
2469         return formats
2470
2471     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2472         def absolute_url(item_url):
2473             return urljoin(base_url, item_url)
2474
2475         def parse_content_type(content_type):
2476             if not content_type:
2477                 return {}
2478             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2479             if ctr:
2480                 mimetype, codecs = ctr.groups()
2481                 f = parse_codecs(codecs)
2482                 f['ext'] = mimetype2ext(mimetype)
2483                 return f
2484             return {}
2485
2486         def _media_formats(src, cur_media_type, type_info={}):
2487             full_url = absolute_url(src)
2488             ext = type_info.get('ext') or determine_ext(full_url)
2489             if ext == 'm3u8':
2490                 is_plain_url = False
2491                 formats = self._extract_m3u8_formats(
2492                     full_url, video_id, ext='mp4',
2493                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2494                     preference=preference, fatal=False)
2495             elif ext == 'mpd':
2496                 is_plain_url = False
2497                 formats = self._extract_mpd_formats(
2498                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2499             else:
2500                 is_plain_url = True
2501                 formats = [{
2502                     'url': full_url,
2503                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2504                 }]
2505             return is_plain_url, formats
2506
2507         entries = []
2508         # amp-video and amp-audio are very similar to their HTML5 counterparts
2509         # so we wll include them right here (see
2510         # https://www.ampproject.org/docs/reference/components/amp-video)
2511         media_tags = [(media_tag, media_type, '')
2512                       for media_tag, media_type
2513                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2514         media_tags.extend(re.findall(
2515             # We only allow video|audio followed by a whitespace or '>'.
2516             # Allowing more characters may end up in significant slow down (see
2517             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2518             # http://www.porntrex.com/maps/videositemap.xml).
2519             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2520         for media_tag, media_type, media_content in media_tags:
2521             media_info = {
2522                 'formats': [],
2523                 'subtitles': {},
2524             }
2525             media_attributes = extract_attributes(media_tag)
2526             src = strip_or_none(media_attributes.get('src'))
2527             if src:
2528                 _, formats = _media_formats(src, media_type)
2529                 media_info['formats'].extend(formats)
2530             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2531             if media_content:
2532                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2533                     s_attr = extract_attributes(source_tag)
2534                     # data-video-src and data-src are non standard but seen
2535                     # several times in the wild
2536                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2537                     if not src:
2538                         continue
2539                     f = parse_content_type(s_attr.get('type'))
2540                     is_plain_url, formats = _media_formats(src, media_type, f)
2541                     if is_plain_url:
2542                         # width, height, res, label and title attributes are
2543                         # all not standard but seen several times in the wild
2544                         labels = [
2545                             s_attr.get(lbl)
2546                             for lbl in ('label', 'title')
2547                             if str_or_none(s_attr.get(lbl))
2548                         ]
2549                         width = int_or_none(s_attr.get('width'))
2550                         height = (int_or_none(s_attr.get('height'))
2551                                   or int_or_none(s_attr.get('res')))
2552                         if not width or not height:
2553                             for lbl in labels:
2554                                 resolution = parse_resolution(lbl)
2555                                 if not resolution:
2556                                     continue
2557                                 width = width or resolution.get('width')
2558                                 height = height or resolution.get('height')
2559                         for lbl in labels:
2560                             tbr = parse_bitrate(lbl)
2561                             if tbr:
2562                                 break
2563                         else:
2564                             tbr = None
2565                         f.update({
2566                             'width': width,
2567                             'height': height,
2568                             'tbr': tbr,
2569                             'format_id': s_attr.get('label') or s_attr.get('title'),
2570                         })
2571                         f.update(formats[0])
2572                         media_info['formats'].append(f)
2573                     else:
2574                         media_info['formats'].extend(formats)
2575                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2576                     track_attributes = extract_attributes(track_tag)
2577                     kind = track_attributes.get('kind')
2578                     if not kind or kind in ('subtitles', 'captions'):
2579                         src = strip_or_none(track_attributes.get('src'))
2580                         if not src:
2581                             continue
2582                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2583                         media_info['subtitles'].setdefault(lang, []).append({
2584                             'url': absolute_url(src),
2585                         })
2586             for f in media_info['formats']:
2587                 f.setdefault('http_headers', {})['Referer'] = base_url
2588             if media_info['formats'] or media_info['subtitles']:
2589                 entries.append(media_info)
2590         return entries
2591
2592     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2593         formats = []
2594         hdcore_sign = 'hdcore=3.7.0'
2595         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2596         hds_host = hosts.get('hds')
2597         if hds_host:
2598             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2599         if 'hdcore=' not in f4m_url:
2600             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2601         f4m_formats = self._extract_f4m_formats(
2602             f4m_url, video_id, f4m_id='hds', fatal=False)
2603         for entry in f4m_formats:
2604             entry.update({'extra_param_to_segment_url': hdcore_sign})
2605         formats.extend(f4m_formats)
2606         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2607         hls_host = hosts.get('hls')
2608         if hls_host:
2609             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2610         formats.extend(self._extract_m3u8_formats(
2611             m3u8_url, video_id, 'mp4', 'm3u8_native',
2612             m3u8_id='hls', fatal=False))
2613         return formats
2614
2615     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2616         query = compat_urlparse.urlparse(url).query
2617         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2618         mobj = re.search(
2619             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2620         url_base = mobj.group('url')
2621         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2622         formats = []
2623
2624         def manifest_url(manifest):
2625             m_url = '%s/%s' % (http_base_url, manifest)
2626             if query:
2627                 m_url += '?%s' % query
2628             return m_url
2629
2630         if 'm3u8' not in skip_protocols:
2631             formats.extend(self._extract_m3u8_formats(
2632                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2633                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2634         if 'f4m' not in skip_protocols:
2635             formats.extend(self._extract_f4m_formats(
2636                 manifest_url('manifest.f4m'),
2637                 video_id, f4m_id='hds', fatal=False))
2638         if 'dash' not in skip_protocols:
2639             formats.extend(self._extract_mpd_formats(
2640                 manifest_url('manifest.mpd'),
2641                 video_id, mpd_id='dash', fatal=False))
2642         if re.search(r'(?:/smil:|\.smil)', url_base):
2643             if 'smil' not in skip_protocols:
2644                 rtmp_formats = self._extract_smil_formats(
2645                     manifest_url('jwplayer.smil'),
2646                     video_id, fatal=False)
2647                 for rtmp_format in rtmp_formats:
2648                     rtsp_format = rtmp_format.copy()
2649                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2650                     del rtsp_format['play_path']
2651                     del rtsp_format['ext']
2652                     rtsp_format.update({
2653                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2654                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2655                         'protocol': 'rtsp',
2656                     })
2657                     formats.extend([rtmp_format, rtsp_format])
2658         else:
2659             for protocol in ('rtmp', 'rtsp'):
2660                 if protocol not in skip_protocols:
2661                     formats.append({
2662                         'url': '%s:%s' % (protocol, url_base),
2663                         'format_id': protocol,
2664                         'protocol': protocol,
2665                     })
2666         return formats
2667
2668     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2669         mobj = re.search(
2670             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2671             webpage)
2672         if mobj:
2673             try:
2674                 jwplayer_data = self._parse_json(mobj.group('options'),
2675                                                  video_id=video_id,
2676                                                  transform_source=transform_source)
2677             except ExtractorError:
2678                 pass
2679             else:
2680                 if isinstance(jwplayer_data, dict):
2681                     return jwplayer_data
2682
2683     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2684         jwplayer_data = self._find_jwplayer_data(
2685             webpage, video_id, transform_source=js_to_json)
2686         return self._parse_jwplayer_data(
2687             jwplayer_data, video_id, *args, **kwargs)
2688
2689     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2690                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2691         # JWPlayer backward compatibility: flattened playlists
2692         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2693         if 'playlist' not in jwplayer_data:
2694             jwplayer_data = {'playlist': [jwplayer_data]}
2695
2696         entries = []
2697
2698         # JWPlayer backward compatibility: single playlist item
2699         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2700         if not isinstance(jwplayer_data['playlist'], list):
2701             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2702
2703         for video_data in jwplayer_data['playlist']:
2704             # JWPlayer backward compatibility: flattened sources
2705             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2706             if 'sources' not in video_data:
2707                 video_data['sources'] = [video_data]
2708
2709             this_video_id = video_id or video_data['mediaid']
2710
2711             formats = self._parse_jwplayer_formats(
2712                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2713                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2714
2715             subtitles = {}
2716             tracks = video_data.get('tracks')
2717             if tracks and isinstance(tracks, list):
2718                 for track in tracks:
2719                     if not isinstance(track, dict):
2720                         continue
2721                     track_kind = track.get('kind')
2722                     if not track_kind or not isinstance(track_kind, compat_str):
2723                         continue
2724                     if track_kind.lower() not in ('captions', 'subtitles'):
2725                         continue
2726                     track_url = urljoin(base_url, track.get('file'))
2727                     if not track_url:
2728                         continue
2729                     subtitles.setdefault(track.get('label') or 'en', []).append({
2730                         'url': self._proto_relative_url(track_url)
2731                     })
2732
2733             entry = {
2734                 'id': this_video_id,
2735                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2736                 'description': clean_html(video_data.get('description')),
2737                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
2738                 'timestamp': int_or_none(video_data.get('pubdate')),
2739                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2740                 'subtitles': subtitles,
2741             }
2742             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2743             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2744                 entry.update({
2745                     '_type': 'url_transparent',
2746                     'url': formats[0]['url'],
2747                 })
2748             else:
2749                 self._sort_formats(formats)
2750                 entry['formats'] = formats
2751             entries.append(entry)
2752         if len(entries) == 1:
2753             return entries[0]
2754         else:
2755             return self.playlist_result(entries)
2756
2757     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2758                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2759         urls = []
2760         formats = []
2761         for source in jwplayer_sources_data:
2762             if not isinstance(source, dict):
2763                 continue
2764             source_url = urljoin(
2765                 base_url, self._proto_relative_url(source.get('file')))
2766             if not source_url or source_url in urls:
2767                 continue
2768             urls.append(source_url)
2769             source_type = source.get('type') or ''
2770             ext = mimetype2ext(source_type) or determine_ext(source_url)
2771             if source_type == 'hls' or ext == 'm3u8':
2772                 formats.extend(self._extract_m3u8_formats(
2773                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2774                     m3u8_id=m3u8_id, fatal=False))
2775             elif source_type == 'dash' or ext == 'mpd':
2776                 formats.extend(self._extract_mpd_formats(
2777                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2778             elif ext == 'smil':
2779                 formats.extend(self._extract_smil_formats(
2780                     source_url, video_id, fatal=False))
2781             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2782             elif source_type.startswith('audio') or ext in (
2783                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2784                 formats.append({
2785                     'url': source_url,
2786                     'vcodec': 'none',
2787                     'ext': ext,
2788                 })
2789             else:
2790                 height = int_or_none(source.get('height'))
2791                 if height is None:
2792                     # Often no height is provided but there is a label in
2793                     # format like "1080p", "720p SD", or 1080.
2794                     height = int_or_none(self._search_regex(
2795                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2796                         'height', default=None))
2797                 a_format = {
2798                     'url': source_url,
2799                     'width': int_or_none(source.get('width')),
2800                     'height': height,
2801                     'tbr': int_or_none(source.get('bitrate')),
2802                     'ext': ext,
2803                 }
2804                 if source_url.startswith('rtmp'):
2805                     a_format['ext'] = 'flv'
2806                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2807                     # of jwplayer.flash.swf
2808                     rtmp_url_parts = re.split(
2809                         r'((?:mp4|mp3|flv):)', source_url, 1)
2810                     if len(rtmp_url_parts) == 3:
2811                         rtmp_url, prefix, play_path = rtmp_url_parts
2812                         a_format.update({
2813                             'url': rtmp_url,
2814                             'play_path': prefix + play_path,
2815                         })
2816                     if rtmp_params:
2817                         a_format.update(rtmp_params)
2818                 formats.append(a_format)
2819         return formats
2820
2821     def _live_title(self, name):
2822         """ Generate the title for a live video """
2823         now = datetime.datetime.now()
2824         now_str = now.strftime('%Y-%m-%d %H:%M')
2825         return name + ' ' + now_str
2826
2827     def _int(self, v, name, fatal=False, **kwargs):
2828         res = int_or_none(v, **kwargs)
2829         if 'get_attr' in kwargs:
2830             print(getattr(v, kwargs['get_attr']))
2831         if res is None:
2832             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2833             if fatal:
2834                 raise ExtractorError(msg)
2835             else:
2836                 self._downloader.report_warning(msg)
2837         return res
2838
2839     def _float(self, v, name, fatal=False, **kwargs):
2840         res = float_or_none(v, **kwargs)
2841         if res is None:
2842             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2843             if fatal:
2844                 raise ExtractorError(msg)
2845             else:
2846                 self._downloader.report_warning(msg)
2847         return res
2848
2849     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2850                     path='/', secure=False, discard=False, rest={}, **kwargs):
2851         cookie = compat_cookiejar_Cookie(
2852             0, name, value, port, port is not None, domain, True,
2853             domain.startswith('.'), path, True, secure, expire_time,
2854             discard, None, None, rest)
2855         self._downloader.cookiejar.set_cookie(cookie)
2856
2857     def _get_cookies(self, url):
2858         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2859         req = sanitized_Request(url)
2860         self._downloader.cookiejar.add_cookie_header(req)
2861         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2862
2863     def _apply_first_set_cookie_header(self, url_handle, cookie):
2864         """
2865         Apply first Set-Cookie header instead of the last. Experimental.
2866
2867         Some sites (e.g. [1-3]) may serve two cookies under the same name
2868         in Set-Cookie header and expect the first (old) one to be set rather
2869         than second (new). However, as of RFC6265 the newer one cookie
2870         should be set into cookie store what actually happens.
2871         We will workaround this issue by resetting the cookie to
2872         the first one manually.
2873         1. https://new.vk.com/
2874         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
2875         3. https://learning.oreilly.com/
2876         """
2877         for header, cookies in url_handle.headers.items():
2878             if header.lower() != 'set-cookie':
2879                 continue
2880             if sys.version_info[0] >= 3:
2881                 cookies = cookies.encode('iso-8859-1')
2882             cookies = cookies.decode('utf-8')
2883             cookie_value = re.search(
2884                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
2885             if cookie_value:
2886                 value, domain = cookie_value.groups()
2887                 self._set_cookie(domain, cookie, value)
2888                 break
2889
2890     def get_testcases(self, include_onlymatching=False):
2891         t = getattr(self, '_TEST', None)
2892         if t:
2893             assert not hasattr(self, '_TESTS'), \
2894                 '%s has _TEST and _TESTS' % type(self).__name__
2895             tests = [t]
2896         else:
2897             tests = getattr(self, '_TESTS', [])
2898         for t in tests:
2899             if not include_onlymatching and t.get('only_matching', False):
2900                 continue
2901             t['name'] = type(self).__name__[:-len('IE')]
2902             yield t
2903
2904     def is_suitable(self, age_limit):
2905         """ Test whether the extractor is generally suitable for the given
2906         age limit (i.e. pornographic sites are not, all others usually are) """
2907
2908         any_restricted = False
2909         for tc in self.get_testcases(include_onlymatching=False):
2910             if tc.get('playlist', []):
2911                 tc = tc['playlist'][0]
2912             is_restricted = age_restricted(
2913                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2914             if not is_restricted:
2915                 return True
2916             any_restricted = any_restricted or is_restricted
2917         return not any_restricted
2918
2919     def extract_subtitles(self, *args, **kwargs):
2920         if (self._downloader.params.get('writesubtitles', False)
2921                 or self._downloader.params.get('listsubtitles')):
2922             return self._get_subtitles(*args, **kwargs)
2923         return {}
2924
2925     def _get_subtitles(self, *args, **kwargs):
2926         raise NotImplementedError('This method must be implemented by subclasses')
2927
2928     @staticmethod
2929     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2930         """ Merge subtitle items for one language. Items with duplicated URLs
2931         will be dropped. """
2932         list1_urls = set([item['url'] for item in subtitle_list1])
2933         ret = list(subtitle_list1)
2934         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2935         return ret
2936
2937     @classmethod
2938     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2939         """ Merge two subtitle dictionaries, language by language. """
2940         ret = dict(subtitle_dict1)
2941         for lang in subtitle_dict2:
2942             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2943         return ret
2944
2945     def extract_automatic_captions(self, *args, **kwargs):
2946         if (self._downloader.params.get('writeautomaticsub', False)
2947                 or self._downloader.params.get('listsubtitles')):
2948             return self._get_automatic_captions(*args, **kwargs)
2949         return {}
2950
2951     def _get_automatic_captions(self, *args, **kwargs):
2952         raise NotImplementedError('This method must be implemented by subclasses')
2953
2954     def mark_watched(self, *args, **kwargs):
2955         if (self._downloader.params.get('mark_watched', False)
2956                 and (self._get_login_info()[0] is not None
2957                      or self._downloader.params.get('cookiefile') is not None)):
2958             self._mark_watched(*args, **kwargs)
2959
2960     def _mark_watched(self, *args, **kwargs):
2961         raise NotImplementedError('This method must be implemented by subclasses')
2962
2963     def geo_verification_headers(self):
2964         headers = {}
2965         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2966         if geo_verification_proxy:
2967             headers['Ytdl-request-proxy'] = geo_verification_proxy
2968         return headers
2969
2970     def _generic_id(self, url):
2971         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2972
2973     def _generic_title(self, url):
2974         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2975
2976
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|N):{query} where N is a
    positive number of results; an empty prefix requests a single result
    and "all" requests _MAX_RESULTS of them.
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's search URLs"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search URL and fetch the requested number of results"""
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = parsed.group('prefix')
        query = parsed.group('query')
        if prefix == '':
            wanted = 1
        elif prefix == 'all':
            wanted = self._MAX_RESULTS
        else:
            wanted = int(prefix)
            if wanted <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
            if wanted > self._MAX_RESULTS:
                # Clamp over-sized requests instead of failing
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
                wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor"""
        return self._SEARCH_KEY