git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30     compat_xml_parse_error,
  31 )
  32 from ..downloader.f4m import (
  33     get_base_url,
  34     remove_encrypted_media,
  35 )
  36 from ..utils import (
  37     NO_DEFAULT,
  38     age_restricted,
  39     base_url,
  40     bug_reports_message,
  41     clean_html,
  42     compiled_regex_type,
  43     determine_ext,
  44     determine_protocol,
  45     error_to_compat_str,
  46     ExtractorError,
  47     extract_attributes,
  48     fix_xml_ampersands,
  49     float_or_none,
  50     GeoRestrictedError,
  51     GeoUtils,
  52     int_or_none,
  53     js_to_json,
  54     mimetype2ext,
  55     orderedSet,
  56     parse_codecs,
  57     parse_duration,
  58     parse_iso8601,
  59     parse_m3u8_attributes,
  60     RegexNotFoundError,
  61     sanitized_Request,
  62     sanitize_filename,
  63     unescapeHTML,
  64     unified_strdate,
  65     unified_timestamp,
  66     update_Request,
  67     update_url_query,
  68     urljoin,
  69     url_basename,
  70     xpath_element,
  71     xpath_text,
  72     xpath_with_ns,
  73 )
  74
  75
  76 class InfoExtractor(object):
  77     """Information Extractor class.
  78
  79     Information extractors are the classes that, given a URL, extract
  80     information about the video (or videos) the URL refers to. This
  81     information includes the real video URL, the video title, author and
  82     others. The information is stored in a dictionary which is then
  83     passed to the YoutubeDL. The YoutubeDL processes this
  84     information possibly downloading the video to the file system, among
  85     other possible outcomes.
  86
  87     The type field determines the type of the result.
  88     By far the most common value (and the default if _type is missing) is
  89     "video", which indicates a single video.
  90
  91     For a video, the dictionaries must include the following fields:
  92
  93     id:             Video identifier.
  94     title:          Video title, unescaped.
  95
  96     Additionally, it must contain either a formats entry or a url one:
  97
  98     formats:        A list of dictionaries for each format available, ordered
  99                     from worst to best quality.
 100
 101                     Potential fields:
 102                     * url        Mandatory. The URL of the video file
 103                     * manifest_url
 104                                  The URL of the manifest file in case of
 105                                  fragmented media (DASH, hls, hds)
 106                     * ext        Will be calculated from URL if missing
 107                     * format     A human-readable description of the format
 108                                  ("mp4 container with h264/opus").
 109                                  Calculated from the format_id, width, height.
 110                                  and format_note fields if missing.
 111                     * format_id  A short description of the format
 112                                  ("mp4_h264_opus" or "19").
 113                                 Technically optional, but strongly recommended.
 114                     * format_note Additional info about the format
 115                                  ("3D" or "DASH video")
 116                     * width      Width of the video, if known
 117                     * height     Height of the video, if known
 118                     * resolution Textual description of width and height
 119                     * tbr        Average bitrate of audio and video in KBit/s
 120                     * abr        Average audio bitrate in KBit/s
 121                     * acodec     Name of the audio codec in use
 122                     * asr        Audio sampling rate in Hertz
 123                     * vbr        Average video bitrate in KBit/s
 124                     * fps        Frame rate
 125                     * vcodec     Name of the video codec in use
 126                     * container  Name of the container format
 127                     * filesize   The number of bytes, if known in advance
 128                     * filesize_approx  An estimate for the number of bytes
 129                     * player_url SWF Player URL (used for rtmpdump).
 130                     * protocol   The protocol that will be used for the actual
 131                                  download, lower-case.
 132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 133                                  "m3u8", "m3u8_native" or "http_dash_segments".
 134                     * fragment_base_url
 135                                  Base URL for fragments. Each fragment's path
 136                                  value (if present) will be relative to
 137                                  this URL.
 138                     * fragments  A list of fragments of a fragmented media.
 139                                  Each fragment entry must contain either an url
 140                                  or a path. If an url is present it should be
 141                                  considered by a client. Otherwise both path and
 142                                  fragment_base_url must be present. Here is
 143                                  the list of all potential fields:
 144                                  * "url" - fragment's URL
 145                                  * "path" - fragment's path relative to
 146                                             fragment_base_url
 147                                  * "duration" (optional, int or float)
 148                                  * "filesize" (optional, int)
 149                     * preference Order number of this format. If this field is
 150                                  present and not None, the formats get sorted
 151                                  by this field, regardless of all other values.
 152                                  -1 for default (order by other properties),
 153                                  -2 or smaller for less than default.
 154                                  < -1000 to hide the format (if there is
 155                                     another one which is strictly better)
 156                     * language   Language code, e.g. "de" or "en-US".
 157                     * language_preference  Is this in the language mentioned in
 158                                  the URL?
 159                                  10 if it's what the URL is about,
 160                                  -1 for default (don't know),
 161                                  -10 otherwise, other values reserved for now.
 162                     * quality    Order number of the video quality of this
 163                                  format, irrespective of the file format.
 164                                  -1 for default (order by other properties),
 165                                  -2 or smaller for less than default.
 166                     * source_preference  Order number for this video source
 167                                   (quality takes higher priority)
 168                                  -1 for default (order by other properties),
 169                                  -2 or smaller for less than default.
 170                     * http_headers  A dictionary of additional HTTP headers
 171                                  to add to the request.
 172                     * stretched_ratio  If given and not 1, indicates that the
 173                                  video's pixels are not square.
 174                                  width : height ratio as float.
 175                     * no_resume  The server does not support resuming the
 176                                  (HTTP or RTMP) download. Boolean.
 177                     * downloader_options  A dictionary of downloader options as
 178                                  described in FileDownloader
 179
 180     url:            Final video URL.
 181     ext:            Video filename extension.
 182     format:         The video format, defaults to ext (used for --get-format)
 183     player_url:     SWF Player URL (used for rtmpdump).
 184
 185     The following fields are optional:
 186
 187     alt_title:      A secondary title of the video.
 188     display_id      An alternative identifier for the video, not necessarily
 189                     unique, but available before title. Typically, id is
 190                     something like "4234987", title "Dancing naked mole rats",
 191                     and display_id "dancing-naked-mole-rats"
 192     thumbnails:     A list of dictionaries, with the following entries:
 193                         * "id" (optional, string) - Thumbnail format ID
 194                         * "url"
 195                         * "preference" (optional, int) - quality of the image
 196                         * "width" (optional, int)
 197                         * "height" (optional, int)
 198                         * "resolution" (optional, string "{width}x{height"},
 199                                         deprecated)
 200                         * "filesize" (optional, int)
 201     thumbnail:      Full URL to a video thumbnail image.
 202     description:    Full video description.
 203     uploader:       Full name of the video uploader.
 204     license:        License name the video is licensed under.
 205     creator:        The creator of the video.
 206     release_date:   The date (YYYYMMDD) when the video was released.
 207     timestamp:      UNIX timestamp of the moment the video became available.
 208     upload_date:    Video upload date (YYYYMMDD).
 209                     If not explicitly set, calculated from timestamp.
 210     uploader_id:    Nickname or id of the video uploader.
 211     uploader_url:   Full URL to a personal webpage of the video uploader.
 212     location:       Physical location where the video was filmed.
 213     subtitles:      The available subtitles as a dictionary in the format
 214                     {tag: subformats}. "tag" is usually a language code, and
 215                     "subformats" is a list sorted from lower to higher
 216                     preference, each element is a dictionary with the "ext"
 217                     entry and one of:
 218                         * "data": The subtitles file contents
 219                         * "url": A URL pointing to the subtitles file
 220                     "ext" will be calculated from URL if missing
 221     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 222                     automatically generated captions
 223     duration:       Length of the video in seconds, as an integer or float.
 224     view_count:     How many users have watched the video on the platform.
 225     like_count:     Number of positive ratings of the video
 226     dislike_count:  Number of negative ratings of the video
 227     repost_count:   Number of reposts of the video
 228     average_rating: Average rating give by users, the scale used depends on the webpage
 229     comment_count:  Number of comments on the video
 230     comments:       A list of comments, each with one or more of the following
 231                     properties (all but one of text or html optional):
 232                         * "author" - human-readable name of the comment author
 233                         * "author_id" - user ID of the comment author
 234                         * "id" - Comment ID
 235                         * "html" - Comment as HTML
 236                         * "text" - Plain text of the comment
 237                         * "timestamp" - UNIX timestamp of comment
 238                         * "parent" - ID of the comment this one is replying to.
 239                                      Set to "root" to indicate that this is a
 240                                      comment to the original video.
 241     age_limit:      Age restriction for the video, as an integer (years)
 242     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 243                     should allow to get the same result again. (It will be set
 244                     by YoutubeDL if it's missing)
 245     categories:     A list of categories that the video falls in, for example
 246                     ["Sports", "Berlin"]
 247     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 248     is_live:        True, False, or None (=unknown). Whether this video is a
 249                     live stream that goes on instead of a fixed-length video.
 250     start_time:     Time in seconds where the reproduction should start, as
 251                     specified in the URL.
 252     end_time:       Time in seconds where the reproduction should end, as
 253                     specified in the URL.
 254     chapters:       A list of dictionaries, with the following entries:
 255                         * "start_time" - The start time of the chapter in seconds
 256                         * "end_time" - The end time of the chapter in seconds
 257                         * "title" (optional, string)
 258
 259     The following fields should only be used when the video belongs to some logical
 260     chapter or section:
 261
 262     chapter:        Name or title of the chapter the video belongs to.
 263     chapter_number: Number of the chapter the video belongs to, as an integer.
 264     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 265
 266     The following fields should only be used when the video is an episode of some
 267     series, programme or podcast:
 268
 269     series:         Title of the series or programme the video episode belongs to.
 270     season:         Title of the season the video episode belongs to.
 271     season_number:  Number of the season the video episode belongs to, as an integer.
 272     season_id:      Id of the season the video episode belongs to, as a unicode string.
 273     episode:        Title of the video episode. Unlike mandatory video title field,
 274                     this field should denote the exact title of the video episode
 275                     without any kind of decoration.
 276     episode_number: Number of the video episode within a season, as an integer.
 277     episode_id:     Id of the video episode, as a unicode string.
 278
 279     The following fields should only be used when the media is a track or a part of
 280     a music album:
 281
 282     track:          Title of the track.
 283     track_number:   Number of the track within an album or a disc, as an integer.
 284     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 285                     as a unicode string.
 286     artist:         Artist(s) of the track.
 287     genre:          Genre(s) of the track.
 288     album:          Title of the album the track belongs to.
 289     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 290     album_artist:   List of all artists appeared on the album (e.g.
 291                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 292                     and compilations).
 293     disc_number:    Number of the disc or other physical medium the track belongs to,
 294                     as an integer.
 295     release_year:   Year (YYYY) when the album was released.
 296
 297     Unless mentioned otherwise, the fields should be Unicode strings.
 298
 299     Unless mentioned otherwise, None is equivalent to absence of information.
 300
 301
 302     _type "playlist" indicates multiple videos.
 303     There must be a key "entries", which is a list, an iterable, or a PagedList
 304     object, each element of which is a valid dictionary by this specification.
 305
 306     Additionally, playlists can have "id", "title", "description", "uploader",
 307     "uploader_id", "uploader_url" attributes with the same semantics as videos
 308     (see above).
 309
 310
 311     _type "multi_video" indicates that there are multiple videos that
 312     form a single show, for examples multiple acts of an opera or TV episode.
 313     It must have an entries key like a playlist and contain all the keys
 314     required for a video at the same time.
 315
 316
 317     _type "url" indicates that the video must be extracted from another
 318     location, possibly by a different extractor. Its only required key is:
 319     "url" - the next URL to extract.
 320     The key "ie_key" can be set to the class name (minus the trailing "IE",
 321     e.g. "Youtube") if the extractor class is known in advance.
 322     Additionally, the dictionary may have any properties of the resolved entity
 323     known in advance, for example "title" if the title of the referred video is
 324     known ahead of time.
 325
 326
 327     _type "url_transparent" entities have the same specification as "url", but
 328     indicate that the given additional information is more precise than the one
 329     associated with the resolved URL.
 330     This is useful when a site employs a video service that hosts the video and
 331     its technical metadata, but that video service does not embed a useful
 332     title, description etc.
 333
 334
 335     Subclasses of this one should re-define the _real_initialize() and
 336     _real_extract() methods and define a _VALID_URL regexp.
 337     Probably, they should also be added to the list of extractors.
 338
 339     _GEO_BYPASS attribute may be set to False in order to disable
 340     geo restriction bypass mechanisms for a particular extractor.
 341     Though it won't disable explicit geo restriction bypass based on
 342     country code provided with geo_bypass_country. (experimental)
 343
 344     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 345     countries for this extractor. One of these countries will be used by
 346     geo restriction bypass mechanism right away in order to bypass
 347     geo restriction, of course, if the mechanism is not disabled. (experimental)
 348
 349     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 350     IP blocks in CIDR notation for this extractor. One of these IP blocks
 351     will be used by geo restriction bypass mechanism similarly
 352     to _GEO_COUNTRIES. (experimental)
 353
 354     NB: both these geo attributes are experimental and may change in future
 355     or be completely removed.
 356
 357     Finally, the _WORKING attribute should be set to False for broken IEs
 358     in order to warn the users and skip the tests.
 359     """
 360
 361     _ready = False
 362     _downloader = None
 363     _x_forwarded_for_ip = None
 364     _GEO_BYPASS = True
 365     _GEO_COUNTRIES = None
 366     _GEO_IP_BLOCKS = None
 367     _WORKING = True
 368
 369     def __init__(self, downloader=None):
 370         """Constructor. Receives an optional downloader."""
 371         self._ready = False
 372         self._x_forwarded_for_ip = None
 373         self.set_downloader(downloader)
 374
 375     @classmethod
 376     def suitable(cls, url):
 377         """Receives a URL and returns True if suitable for this IE."""
 378
 379         # This does not use has/getattr intentionally - we want to know whether
 380         # we have cached the regexp for *this* class, whereas getattr would also
 381         # match the superclass
 382         if '_VALID_URL_RE' not in cls.__dict__:
 383             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 384         return cls._VALID_URL_RE.match(url) is not None
 385
 386     @classmethod
 387     def _match_id(cls, url):
 388         if '_VALID_URL_RE' not in cls.__dict__:
 389             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 390         m = cls._VALID_URL_RE.match(url)
 391         assert m
 392         return compat_str(m.group('id'))
 393
 394     @classmethod
 395     def working(cls):
 396         """Getter method for _WORKING."""
 397         return cls._WORKING
 398
 399     def initialize(self):
 400         """Initializes an instance (authentication, etc)."""
 401         self._initialize_geo_bypass({
 402             'countries': self._GEO_COUNTRIES,
 403             'ip_blocks': self._GEO_IP_BLOCKS,
 404         })
 405         if not self._ready:
 406             self._real_initialize()
 407             self._ready = True
 408
 409     def _initialize_geo_bypass(self, geo_bypass_context):
 410         """
 411         Initialize geo restriction bypass mechanism.
 412
 413         This method is used to initialize geo bypass mechanism based on faking
 414         X-Forwarded-For HTTP header. A random country from provided country list
 415         is selected and a random IP belonging to this country is generated. This
 416         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 417         HTTP requests.
 418
 419         This method will be used for initial geo bypass mechanism initialization
 420         during the instance initialization with _GEO_COUNTRIES and
 421         _GEO_IP_BLOCKS.
 422
 423         You may also manually call it from extractor's code if geo bypass
 424         information is not available beforehand (e.g. obtained during
 425         extraction) or due to some other reason. In this case you should pass
 426         this information in geo bypass context passed as first argument. It may
 427         contain following fields:
 428
 429         countries:  List of geo unrestricted countries (similar
 430                     to _GEO_COUNTRIES)
 431         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 432                     (similar to _GEO_IP_BLOCKS)
 433
 434         """
 435         if not self._x_forwarded_for_ip:
 436
 437             # Geo bypass mechanism is explicitly disabled by user
 438             if not self._downloader.params.get('geo_bypass', True):
 439                 return
 440
 441             if not geo_bypass_context:
 442                 geo_bypass_context = {}
 443
 444             # Backward compatibility: previously _initialize_geo_bypass
 445             # expected a list of countries, some 3rd party code may still use
 446             # it this way
 447             if isinstance(geo_bypass_context, (list, tuple)):
 448                 geo_bypass_context = {
 449                     'countries': geo_bypass_context,
 450                 }
 451
 452             # The whole point of geo bypass mechanism is to fake IP
 453             # as X-Forwarded-For HTTP header based on some IP block or
 454             # country code.
 455
 456             # Path 1: bypassing based on IP block in CIDR notation
 457
 458             # Explicit IP block specified by user, use it right away
 459             # regardless of whether extractor is geo bypassable or not
 460             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 461
 462             # Otherwise use random IP block from geo bypass context but only
 463             # if extractor is known as geo bypassable
 464             if not ip_block:
 465                 ip_blocks = geo_bypass_context.get('ip_blocks')
 466                 if self._GEO_BYPASS and ip_blocks:
 467                     ip_block = random.choice(ip_blocks)
 468
 469             if ip_block:
 470                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 471                 if self._downloader.params.get('verbose', False):
 472                     self._downloader.to_screen(
 473                         '[debug] Using fake IP %s as X-Forwarded-For.'
 474                         % self._x_forwarded_for_ip)
 475                 return
 476
 477             # Path 2: bypassing based on country code
 478
 479             # Explicit country code specified by user, use it right away
 480             # regardless of whether extractor is geo bypassable or not
 481             country = self._downloader.params.get('geo_bypass_country', None)
 482
 483             # Otherwise use random country code from geo bypass context but
 484             # only if extractor is known as geo bypassable
 485             if not country:
 486                 countries = geo_bypass_context.get('countries')
 487                 if self._GEO_BYPASS and countries:
 488                     country = random.choice(countries)
 489
 490             if country:
 491                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 492                 if self._downloader.params.get('verbose', False):
 493                     self._downloader.to_screen(
 494                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 495                         % (self._x_forwarded_for_ip, country.upper()))
 496
 497     def extract(self, url):
 498         """Extracts URL information and returns it in list of dicts."""
 499         try:
 500             for _ in range(2):
 501                 try:
 502                     self.initialize()
 503                     ie_result = self._real_extract(url)
 504                     if self._x_forwarded_for_ip:
 505                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 506                     return ie_result
 507                 except GeoRestrictedError as e:
 508                     if self.__maybe_fake_ip_and_retry(e.countries):
 509                         continue
 510                     raise
 511         except ExtractorError:
 512             raise
 513         except compat_http_client.IncompleteRead as e:
 514             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 515         except (KeyError, StopIteration) as e:
 516             raise ExtractorError('An extractor error has occurred.', cause=e)
 517
 518     def __maybe_fake_ip_and_retry(self, countries):
 519         if (not self._downloader.params.get('geo_bypass_country', None) and
 520                 self._GEO_BYPASS and
 521                 self._downloader.params.get('geo_bypass', True) and
 522                 not self._x_forwarded_for_ip and
 523                 countries):
 524             country_code = random.choice(countries)
 525             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 526             if self._x_forwarded_for_ip:
 527                 self.report_warning(
 528                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 529                     % (self._x_forwarded_for_ip, country_code.upper()))
 530                 return True
 531         return False
 532
 533     def set_downloader(self, downloader):
 534         """Sets the downloader for this IE."""
 535         self._downloader = downloader
 536
 537     def _real_initialize(self):
 538         """Real initialization process. Redefine in subclasses."""
 539         pass
 540
 541     def _real_extract(self, url):
 542         """Real extraction process. Redefine in subclasses."""
 543         pass
 544
 545     @classmethod
 546     def ie_key(cls):
 547         """A string for getting the InfoExtractor with get_info_extractor"""
 548         return compat_str(cls.__name__[:-2])
 549
 550     @property
 551     def IE_NAME(self):
 552         return compat_str(type(self).__name__[:-2])
 553
 554     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 555         """ Returns the response handle """
 556         if note is None:
 557             self.report_download_webpage(video_id)
 558         elif note is not False:
 559             if video_id is None:
 560                 self.to_screen('%s' % (note,))
 561             else:
 562                 self.to_screen('%s: %s' % (video_id, note))
 563
 564         # Some sites check X-Forwarded-For HTTP header in order to figure out
 565         # the origin of the client behind proxy. This allows bypassing geo
 566         # restriction by faking this header's value to IP that belongs to some
 567         # geo unrestricted country. We will do so once we encounter any
 568         # geo restriction error.
 569         if self._x_forwarded_for_ip:
 570             if 'X-Forwarded-For' not in headers:
 571                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 572
 573         if isinstance(url_or_request, compat_urllib_request.Request):
 574             url_or_request = update_Request(
 575                 url_or_request, data=data, headers=headers, query=query)
 576         else:
 577             if query:
 578                 url_or_request = update_url_query(url_or_request, query)
 579             if data is not None or headers:
 580                 url_or_request = sanitized_Request(url_or_request, data, headers)
 581         try:
 582             return self._downloader.urlopen(url_or_request)
 583         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 584             if errnote is False:
 585                 return False
 586             if errnote is None:
 587                 errnote = 'Unable to download webpage'
 588
 589             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 590             if fatal:
 591                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 592             else:
 593                 self._downloader.report_warning(errmsg)
 594                 return False
 595
 596     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 597         """ Returns a tuple (page content as string, URL handle) """
 598         # Strip hashes from the URL (#1038)
 599         if isinstance(url_or_request, (compat_str, str)):
 600             url_or_request = url_or_request.partition('#')[0]
 601
 602         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 603         if urlh is False:
 604             assert not fatal
 605             return False
 606         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 607         return (content, urlh)
 608
 609     @staticmethod
 610     def _guess_encoding_from_content(content_type, webpage_bytes):
 611         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 612         if m:
 613             encoding = m.group(1)
 614         else:
 615             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 616                           webpage_bytes[:1024])
 617             if m:
 618                 encoding = m.group(1).decode('ascii')
 619             elif webpage_bytes.startswith(b'\xff\xfe'):
 620                 encoding = 'utf-16'
 621             else:
 622                 encoding = 'utf-8'
 623
 624         return encoding
 625
 626     def __check_blocked(self, content):
 627         first_block = content[:512]
 628         if ('<title>Access to this site is blocked</title>' in content and
 629                 'Websense' in first_block):
 630             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 631             blocked_iframe = self._html_search_regex(
 632                 r'<iframe src="([^"]+)"', content,
 633                 'Websense information URL', default=None)
 634             if blocked_iframe:
 635                 msg += ' Visit %s for more details' % blocked_iframe
 636             raise ExtractorError(msg, expected=True)
 637         if '<title>The URL you requested has been blocked</title>' in first_block:
 638             msg = (
 639                 'Access to this webpage has been blocked by Indian censorship. '
 640                 'Use a VPN or proxy server (with --proxy) to route around it.')
 641             block_msg = self._html_search_regex(
 642                 r'</h1><p>(.*?)</p>',
 643                 content, 'block message', default=None)
 644             if block_msg:
 645                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 646             raise ExtractorError(msg, expected=True)
 647         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 648                 'blocklist.rkn.gov.ru' in content):
 649             raise ExtractorError(
 650                 'Access to this webpage has been blocked by decision of the Russian government. '
 651                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 652                 expected=True)
 653
 654     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 655         content_type = urlh.headers.get('Content-Type', '')
 656         webpage_bytes = urlh.read()
 657         if prefix is not None:
 658             webpage_bytes = prefix + webpage_bytes
 659         if not encoding:
 660             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 661         if self._downloader.params.get('dump_intermediate_pages', False):
 662             self.to_screen('Dumping request to ' + urlh.geturl())
 663             dump = base64.b64encode(webpage_bytes).decode('ascii')
 664             self._downloader.to_screen(dump)
 665         if self._downloader.params.get('write_pages', False):
 666             basen = '%s_%s' % (video_id, urlh.geturl())
 667             if len(basen) > 240:
 668                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 669                 basen = basen[:240 - len(h)] + h
 670             raw_filename = basen + '.dump'
 671             filename = sanitize_filename(raw_filename, restricted=True)
 672             self.to_screen('Saving request to ' + filename)
 673             # Working around MAX_PATH limitation on Windows (see
 674             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 675             if compat_os_name == 'nt':
 676                 absfilepath = os.path.abspath(filename)
 677                 if len(absfilepath) > 259:
 678                     filename = '\\\\?\\' + absfilepath
 679             with open(filename, 'wb') as outf:
 680                 outf.write(webpage_bytes)
 681
 682         try:
 683             content = webpage_bytes.decode(encoding, 'replace')
 684         except LookupError:
 685             content = webpage_bytes.decode('utf-8', 'replace')
 686
 687         self.__check_blocked(content)
 688
 689         return content
 690
 691     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 692         """ Returns the data of the page as a string """
 693         success = False
 694         try_count = 0
 695         while success is False:
 696             try:
 697                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 698                 success = True
 699             except compat_http_client.IncompleteRead as e:
 700                 try_count += 1
 701                 if try_count >= tries:
 702                     raise e
 703                 self._sleep(timeout, video_id)
 704         if res is False:
 705             return res
 706         else:
 707             content, _ = res
 708             return content
 709
 710     def _download_xml_handle(
 711             self, url_or_request, video_id, note='Downloading XML',
 712             errnote='Unable to download XML', transform_source=None,
 713             fatal=True, encoding=None, data=None, headers={}, query={}):
 714         """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
 715         res = self._download_webpage_handle(
 716             url_or_request, video_id, note, errnote, fatal=fatal,
 717             encoding=encoding, data=data, headers=headers, query=query)
 718         if res is False:
 719             return res
 720         xml_string, urlh = res
 721         return self._parse_xml(
 722             xml_string, video_id, transform_source=transform_source,
 723             fatal=fatal), urlh
 724
 725     def _download_xml(self, url_or_request, video_id,
 726                       note='Downloading XML', errnote='Unable to download XML',
 727                       transform_source=None, fatal=True, encoding=None,
 728                       data=None, headers={}, query={}):
 729         """Return the xml as an xml.etree.ElementTree.Element"""
 730         res = self._download_xml_handle(
 731             url_or_request, video_id, note=note, errnote=errnote,
 732             transform_source=transform_source, fatal=fatal, encoding=encoding,
 733             data=data, headers=headers, query=query)
 734         return res if res is False else res[0]
 735
 736     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 737         if transform_source:
 738             xml_string = transform_source(xml_string)
 739         try:
 740             return compat_etree_fromstring(xml_string.encode('utf-8'))
 741         except compat_xml_parse_error as ve:
 742             errmsg = '%s: Failed to parse XML ' % video_id
 743             if fatal:
 744                 raise ExtractorError(errmsg, cause=ve)
 745             else:
 746                 self.report_warning(errmsg + str(ve))
 747
 748     def _download_json_handle(
 749             self, url_or_request, video_id, note='Downloading JSON metadata',
 750             errnote='Unable to download JSON metadata', transform_source=None,
 751             fatal=True, encoding=None, data=None, headers={}, query={}):
 752         """Return a tuple (JSON object, URL handle)"""
 753         res = self._download_webpage_handle(
 754             url_or_request, video_id, note, errnote, fatal=fatal,
 755             encoding=encoding, data=data, headers=headers, query=query)
 756         if res is False:
 757             return res
 758         json_string, urlh = res
 759         return self._parse_json(
 760             json_string, video_id, transform_source=transform_source,
 761             fatal=fatal), urlh
 762
 763     def _download_json(
 764             self, url_or_request, video_id, note='Downloading JSON metadata',
 765             errnote='Unable to download JSON metadata', transform_source=None,
 766             fatal=True, encoding=None, data=None, headers={}, query={}):
 767         res = self._download_json_handle(
 768             url_or_request, video_id, note=note, errnote=errnote,
 769             transform_source=transform_source, fatal=fatal, encoding=encoding,
 770             data=data, headers=headers, query=query)
 771         return res if res is False else res[0]
 772
 773     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 774         if transform_source:
 775             json_string = transform_source(json_string)
 776         try:
 777             return json.loads(json_string)
 778         except ValueError as ve:
 779             errmsg = '%s: Failed to parse JSON ' % video_id
 780             if fatal:
 781                 raise ExtractorError(errmsg, cause=ve)
 782             else:
 783                 self.report_warning(errmsg + str(ve))
 784
 785     def report_warning(self, msg, video_id=None):
 786         idstr = '' if video_id is None else '%s: ' % video_id
 787         self._downloader.report_warning(
 788             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 789
 790     def to_screen(self, msg):
 791         """Print msg to screen, prefixing it with '[ie_name]'"""
 792         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 793
 794     def report_extraction(self, id_or_name):
 795         """Report information extraction."""
 796         self.to_screen('%s: Extracting information' % id_or_name)
 797
 798     def report_download_webpage(self, video_id):
 799         """Report webpage download."""
 800         self.to_screen('%s: Downloading webpage' % video_id)
 801
 802     def report_age_confirmation(self):
 803         """Report attempt to confirm age."""
 804         self.to_screen('Confirming age')
 805
 806     def report_login(self):
 807         """Report attempt to log in."""
 808         self.to_screen('Logging in')
 809
 810     @staticmethod
 811     def raise_login_required(msg='This video is only available for registered users'):
 812         raise ExtractorError(
 813             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 814             expected=True)
 815
 816     @staticmethod
 817     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 818         raise GeoRestrictedError(msg, countries=countries)
 819
 820     # Methods for following #608
 821     @staticmethod
 822     def url_result(url, ie=None, video_id=None, video_title=None):
 823         """Returns a URL that points to a page that should be processed"""
 824         # TODO: ie should be the class used for getting the info
 825         video_info = {'_type': 'url',
 826                       'url': url,
 827                       'ie_key': ie}
 828         if video_id is not None:
 829             video_info['id'] = video_id
 830         if video_title is not None:
 831             video_info['title'] = video_title
 832         return video_info
 833
 834     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 835         urls = orderedSet(
 836             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 837             for m in matches)
 838         return self.playlist_result(
 839             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 840
 841     @staticmethod
 842     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 843         """Returns a playlist"""
 844         video_info = {'_type': 'playlist',
 845                       'entries': entries}
 846         if playlist_id:
 847             video_info['id'] = playlist_id
 848         if playlist_title:
 849             video_info['title'] = playlist_title
 850         if playlist_description:
 851             video_info['description'] = playlist_description
 852         return video_info
 853
 854     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 855         """
 856         Perform a regex search on the given string, using a single or a list of
 857         patterns returning the first matching group.
 858         In case of failure return a default value or raise a WARNING or a
 859         RegexNotFoundError, depending on fatal, specifying the field name.
 860         """
 861         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 862             mobj = re.search(pattern, string, flags)
 863         else:
 864             for p in pattern:
 865                 mobj = re.search(p, string, flags)
 866                 if mobj:
 867                     break
 868
 869         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 870             _name = '\033[0;34m%s\033[0m' % name
 871         else:
 872             _name = name
 873
 874         if mobj:
 875             if group is None:
 876                 # return the first matching group
 877                 return next(g for g in mobj.groups() if g is not None)
 878             else:
 879                 return mobj.group(group)
 880         elif default is not NO_DEFAULT:
 881             return default
 882         elif fatal:
 883             raise RegexNotFoundError('Unable to extract %s' % _name)
 884         else:
 885             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 886             return None
 887
 888     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 889         """
 890         Like _search_regex, but strips HTML tags and unescapes entities.
 891         """
 892         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 893         if res:
 894             return clean_html(res).strip()
 895         else:
 896             return res
 897
 898     def _get_netrc_login_info(self, netrc_machine=None):
 899         username = None
 900         password = None
 901         netrc_machine = netrc_machine or self._NETRC_MACHINE
 902
 903         if self._downloader.params.get('usenetrc', False):
 904             try:
 905                 info = netrc.netrc().authenticators(netrc_machine)
 906                 if info is not None:
 907                     username = info[0]
 908                     password = info[2]
 909                 else:
 910                     raise netrc.NetrcParseError(
 911                         'No authenticators for %s' % netrc_machine)
 912             except (IOError, netrc.NetrcParseError) as err:
 913                 self._downloader.report_warning(
 914                     'parsing .netrc: %s' % error_to_compat_str(err))
 915
 916         return username, password
 917
 918     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 919         """
 920         Get the login info as (username, password)
 921         First look for the manually specified credentials using username_option
 922         and password_option as keys in params dictionary. If no such credentials
 923         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 924         value.
 925         If there's no info available, return (None, None)
 926         """
 927         if self._downloader is None:
 928             return (None, None)
 929
 930         downloader_params = self._downloader.params
 931
 932         # Attempt to use provided username and password or .netrc data
 933         if downloader_params.get(username_option) is not None:
 934             username = downloader_params[username_option]
 935             password = downloader_params[password_option]
 936         else:
 937             username, password = self._get_netrc_login_info(netrc_machine)
 938
 939         return username, password
 940
 941     def _get_tfa_info(self, note='two-factor verification code'):
 942         """
 943         Get the two-factor authentication info
 944         TODO - asking the user will be required for sms/phone verify
 945         currently just uses the command line option
 946         If there's no info available, return None
 947         """
 948         if self._downloader is None:
 949             return None
 950         downloader_params = self._downloader.params
 951
 952         if downloader_params.get('twofactor') is not None:
 953             return downloader_params['twofactor']
 954
 955         return compat_getpass('Type %s and press [Return]: ' % note)
 956
 957     # Helper functions for extracting OpenGraph info
 958     @staticmethod
 959     def _og_regexes(prop):
 960         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 961         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 962                        % {'prop': re.escape(prop)})
 963         template = r'<meta[^>]+?%s[^>]+?%s'
 964         return [
 965             template % (property_re, content_re),
 966             template % (content_re, property_re),
 967         ]
 968
 969     @staticmethod
 970     def _meta_regex(prop):
 971         return r'''(?isx)<meta
 972                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 973                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 974
 975     def _og_search_property(self, prop, html, name=None, **kargs):
 976         if not isinstance(prop, (list, tuple)):
 977             prop = [prop]
 978         if name is None:
 979             name = 'OpenGraph %s' % prop[0]
 980         og_regexes = []
 981         for p in prop:
 982             og_regexes.extend(self._og_regexes(p))
 983         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 984         if escaped is None:
 985             return None
 986         return unescapeHTML(escaped)
 987
 988     def _og_search_thumbnail(self, html, **kargs):
 989         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 990
 991     def _og_search_description(self, html, **kargs):
 992         return self._og_search_property('description', html, fatal=False, **kargs)
 993
 994     def _og_search_title(self, html, **kargs):
 995         return self._og_search_property('title', html, **kargs)
 996
 997     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 998         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 999         if secure:
1000             regexes = self._og_regexes('video:secure_url') + regexes
1001         return self._html_search_regex(regexes, html, name, **kargs)
1002
1003     def _og_search_url(self, html, **kargs):
1004         return self._og_search_property('url', html, **kargs)
1005
1006     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1007         if not isinstance(name, (list, tuple)):
1008             name = [name]
1009         if display_name is None:
1010             display_name = name[0]
1011         return self._html_search_regex(
1012             [self._meta_regex(n) for n in name],
1013             html, display_name, fatal=fatal, group='content', **kwargs)
1014
1015     def _dc_search_uploader(self, html):
1016         return self._html_search_meta('dc.creator', html, 'uploader')
1017
1018     def _rta_search(self, html):
1019         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1020         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1021                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1022                      html):
1023             return 18
1024         return 0
1025
1026     def _media_rating_search(self, html):
1027         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1028         rating = self._html_search_meta('rating', html)
1029
1030         if not rating:
1031             return None
1032
1033         RATING_TABLE = {
1034             'safe for kids': 0,
1035             'general': 8,
1036             '14 years': 14,
1037             'mature': 17,
1038             'restricted': 19,
1039         }
1040         return RATING_TABLE.get(rating.lower())
1041
1042     def _family_friendly_search(self, html):
1043         # See http://schema.org/VideoObject
1044         family_friendly = self._html_search_meta(
1045             'isFamilyFriendly', html, default=None)
1046
1047         if not family_friendly:
1048             return None
1049
1050         RATING_TABLE = {
1051             '1': 0,
1052             'true': 0,
1053             '0': 18,
1054             'false': 18,
1055         }
1056         return RATING_TABLE.get(family_friendly.lower())
1057
1058     def _twitter_search_player(self, html):
1059         return self._html_search_meta('twitter:player', html,
1060                                       'twitter card player')
1061
1062     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1063         json_ld = self._search_regex(
1064             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
1065             html, 'JSON-LD', group='json_ld', **kwargs)
1066         default = kwargs.get('default', NO_DEFAULT)
1067         if not json_ld:
1068             return default if default is not NO_DEFAULT else {}
1069         # JSON-LD may be malformed and thus `fatal` should be respected.
1070         # At the same time `default` may be passed that assumes `fatal=False`
1071         # for _search_regex. Let's simulate the same behavior here as well.
1072         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1073         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1074
1075     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1076         if isinstance(json_ld, compat_str):
1077             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1078         if not json_ld:
1079             return {}
1080         info = {}
1081         if not isinstance(json_ld, (list, tuple, dict)):
1082             return info
1083         if isinstance(json_ld, dict):
1084             json_ld = [json_ld]
1085
1086         INTERACTION_TYPE_MAP = {
1087             'CommentAction': 'comment',
1088             'AgreeAction': 'like',
1089             'DisagreeAction': 'dislike',
1090             'LikeAction': 'like',
1091             'DislikeAction': 'dislike',
1092             'ListenAction': 'view',
1093             'WatchAction': 'view',
1094             'ViewAction': 'view',
1095         }
1096
1097         def extract_interaction_statistic(e):
1098             interaction_statistic = e.get('interactionStatistic')
1099             if not isinstance(interaction_statistic, list):
1100                 return
1101             for is_e in interaction_statistic:
1102                 if not isinstance(is_e, dict):
1103                     continue
1104                 if is_e.get('@type') != 'InteractionCounter':
1105                     continue
1106                 interaction_type = is_e.get('interactionType')
1107                 if not isinstance(interaction_type, compat_str):
1108                     continue
1109                 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1110                 if interaction_count is None:
1111                     continue
1112                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1113                 if not count_kind:
1114                     continue
1115                 count_key = '%s_count' % count_kind
1116                 if info.get(count_key) is not None:
1117                     continue
1118                 info[count_key] = interaction_count
1119
1120         def extract_video_object(e):
1121             assert e['@type'] == 'VideoObject'
1122             info.update({
1123                 'url': e.get('contentUrl'),
1124                 'title': unescapeHTML(e.get('name')),
1125                 'description': unescapeHTML(e.get('description')),
1126                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1127                 'duration': parse_duration(e.get('duration')),
1128                 'timestamp': unified_timestamp(e.get('uploadDate')),
1129                 'filesize': float_or_none(e.get('contentSize')),
1130                 'tbr': int_or_none(e.get('bitrate')),
1131                 'width': int_or_none(e.get('width')),
1132                 'height': int_or_none(e.get('height')),
1133                 'view_count': int_or_none(e.get('interactionCount')),
1134             })
1135             extract_interaction_statistic(e)
1136
1137         for e in json_ld:
1138             if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1139                 item_type = e.get('@type')
1140                 if expected_type is not None and expected_type != item_type:
1141                     return info
1142                 if item_type in ('TVEpisode', 'Episode'):
1143                     info.update({
1144                         'episode': unescapeHTML(e.get('name')),
1145                         'episode_number': int_or_none(e.get('episodeNumber')),
1146                         'description': unescapeHTML(e.get('description')),
1147                     })
1148                     part_of_season = e.get('partOfSeason')
1149                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1150                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1151                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1152                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1153                         info['series'] = unescapeHTML(part_of_series.get('name'))
1154                 elif item_type in ('Article', 'NewsArticle'):
1155                     info.update({
1156                         'timestamp': parse_iso8601(e.get('datePublished')),
1157                         'title': unescapeHTML(e.get('headline')),
1158                         'description': unescapeHTML(e.get('articleBody')),
1159                     })
1160                 elif item_type == 'VideoObject':
1161                     extract_video_object(e)
1162                     continue
1163                 video = e.get('video')
1164                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1165                     extract_video_object(video)
1166                 break
1167         return dict((k, v) for k, v in info.items() if v is not None)
1168
1169     @staticmethod
1170     def _hidden_inputs(html):
1171         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1172         hidden_inputs = {}
1173         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1174             attrs = extract_attributes(input)
1175             if not input:
1176                 continue
1177             if attrs.get('type') not in ('hidden', 'submit'):
1178                 continue
1179             name = attrs.get('name') or attrs.get('id')
1180             value = attrs.get('value')
1181             if name and value is not None:
1182                 hidden_inputs[name] = value
1183         return hidden_inputs
1184
1185     def _form_hidden_inputs(self, form_id, html):
1186         form = self._search_regex(
1187             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1188             html, '%s form' % form_id, group='form')
1189         return self._hidden_inputs(form)
1190
1191     def _sort_formats(self, formats, field_preference=None):
1192         if not formats:
1193             raise ExtractorError('No video formats found')
1194
1195         for f in formats:
1196             # Automatically determine tbr when missing based on abr and vbr (improves
1197             # formats sorting in some cases)
1198             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1199                 f['tbr'] = f['abr'] + f['vbr']
1200
1201         def _formats_key(f):
1202             # TODO remove the following workaround
1203             from ..utils import determine_ext
1204             if not f.get('ext') and 'url' in f:
1205                 f['ext'] = determine_ext(f['url'])
1206
1207             if isinstance(field_preference, (list, tuple)):
1208                 return tuple(
1209                     f.get(field)
1210                     if f.get(field) is not None
1211                     else ('' if field == 'format_id' else -1)
1212                     for field in field_preference)
1213
1214             preference = f.get('preference')
1215             if preference is None:
1216                 preference = 0
1217                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1218                     preference -= 0.5
1219
1220             protocol = f.get('protocol') or determine_protocol(f)
1221             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1222
1223             if f.get('vcodec') == 'none':  # audio only
1224                 preference -= 50
1225                 if self._downloader.params.get('prefer_free_formats'):
1226                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1227                 else:
1228                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1229                 ext_preference = 0
1230                 try:
1231                     audio_ext_preference = ORDER.index(f['ext'])
1232                 except ValueError:
1233                     audio_ext_preference = -1
1234             else:
1235                 if f.get('acodec') == 'none':  # video only
1236                     preference -= 40
1237                 if self._downloader.params.get('prefer_free_formats'):
1238                     ORDER = ['flv', 'mp4', 'webm']
1239                 else:
1240                     ORDER = ['webm', 'flv', 'mp4']
1241                 try:
1242                     ext_preference = ORDER.index(f['ext'])
1243                 except ValueError:
1244                     ext_preference = -1
1245                 audio_ext_preference = 0
1246
1247             return (
1248                 preference,
1249                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1250                 f.get('quality') if f.get('quality') is not None else -1,
1251                 f.get('tbr') if f.get('tbr') is not None else -1,
1252                 f.get('filesize') if f.get('filesize') is not None else -1,
1253                 f.get('vbr') if f.get('vbr') is not None else -1,
1254                 f.get('height') if f.get('height') is not None else -1,
1255                 f.get('width') if f.get('width') is not None else -1,
1256                 proto_preference,
1257                 ext_preference,
1258                 f.get('abr') if f.get('abr') is not None else -1,
1259                 audio_ext_preference,
1260                 f.get('fps') if f.get('fps') is not None else -1,
1261                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1262                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1263                 f.get('format_id') if f.get('format_id') is not None else '',
1264             )
1265         formats.sort(key=_formats_key)
1266
1267     def _check_formats(self, formats, video_id):
1268         if formats:
1269             formats[:] = filter(
1270                 lambda f: self._is_valid_url(
1271                     f['url'], video_id,
1272                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1273                 formats)
1274
1275     @staticmethod
1276     def _remove_duplicate_formats(formats):
1277         format_urls = set()
1278         unique_formats = []
1279         for f in formats:
1280             if f['url'] not in format_urls:
1281                 format_urls.add(f['url'])
1282                 unique_formats.append(f)
1283         formats[:] = unique_formats
1284
1285     def _is_valid_url(self, url, video_id, item='video', headers={}):
1286         url = self._proto_relative_url(url, scheme='http:')
1287         # For now assume non HTTP(S) URLs always valid
1288         if not (url.startswith('http://') or url.startswith('https://')):
1289             return True
1290         try:
1291             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1292             return True
1293         except ExtractorError as e:
1294             if isinstance(e.cause, compat_urllib_error.URLError):
1295                 self.to_screen(
1296                     '%s: %s URL is invalid, skipping' % (video_id, item))
1297                 return False
1298             raise
1299
1300     def http_scheme(self):
1301         """ Either "http:" or "https:", depending on the user's preferences """
1302         return (
1303             'http:'
1304             if self._downloader.params.get('prefer_insecure', False)
1305             else 'https:')
1306
1307     def _proto_relative_url(self, url, scheme=None):
1308         if url is None:
1309             return url
1310         if url.startswith('//'):
1311             if scheme is None:
1312                 scheme = self.http_scheme()
1313             return scheme + url
1314         else:
1315             return url
1316
1317     def _sleep(self, timeout, video_id, msg_template=None):
1318         if msg_template is None:
1319             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1320         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1321         self.to_screen(msg)
1322         time.sleep(timeout)
1323
1324     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1325                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1326                              fatal=True, m3u8_id=None):
1327         manifest = self._download_xml(
1328             manifest_url, video_id, 'Downloading f4m manifest',
1329             'Unable to download f4m manifest',
1330             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1331             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1332             transform_source=transform_source,
1333             fatal=fatal)
1334
1335         if manifest is False:
1336             return []
1337
1338         return self._parse_f4m_formats(
1339             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1340             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1341
1342     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1343                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1344                            fatal=True, m3u8_id=None):
1345         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1346         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1347         if akamai_pv is not None and ';' in akamai_pv.text:
1348             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1349             if playerVerificationChallenge.strip() != '':
1350                 return []
1351
1352         formats = []
1353         manifest_version = '1.0'
1354         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1355         if not media_nodes:
1356             manifest_version = '2.0'
1357             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1358         # Remove unsupported DRM protected media from final formats
1359         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1360         media_nodes = remove_encrypted_media(media_nodes)
1361         if not media_nodes:
1362             return formats
1363
1364         manifest_base_url = get_base_url(manifest)
1365
1366         bootstrap_info = xpath_element(
1367             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1368             'bootstrap info', default=None)
1369
1370         vcodec = None
1371         mime_type = xpath_text(
1372             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1373             'base URL', default=None)
1374         if mime_type and mime_type.startswith('audio/'):
1375             vcodec = 'none'
1376
1377         for i, media_el in enumerate(media_nodes):
1378             tbr = int_or_none(media_el.attrib.get('bitrate'))
1379             width = int_or_none(media_el.attrib.get('width'))
1380             height = int_or_none(media_el.attrib.get('height'))
1381             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1382             # If <bootstrapInfo> is present, the specified f4m is a
1383             # stream-level manifest, and only set-level manifests may refer to
1384             # external resources.  See section 11.4 and section 4 of F4M spec
1385             if bootstrap_info is None:
1386                 media_url = None
1387                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1388                 if manifest_version == '2.0':
1389                     media_url = media_el.attrib.get('href')
1390                 if media_url is None:
1391                     media_url = media_el.attrib.get('url')
1392                 if not media_url:
1393                     continue
1394                 manifest_url = (
1395                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1396                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1397                 # If media_url is itself a f4m manifest do the recursive extraction
1398                 # since bitrates in parent manifest (this one) and media_url manifest
1399                 # may differ leading to inability to resolve the format by requested
1400                 # bitrate in f4m downloader
1401                 ext = determine_ext(manifest_url)
1402                 if ext == 'f4m':
1403                     f4m_formats = self._extract_f4m_formats(
1404                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1405                         transform_source=transform_source, fatal=fatal)
1406                     # Sometimes stream-level manifest contains single media entry that
1407                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1408                     # At the same time parent's media entry in set-level manifest may
1409                     # contain it. We will copy it from parent in such cases.
1410                     if len(f4m_formats) == 1:
1411                         f = f4m_formats[0]
1412                         f.update({
1413                             'tbr': f.get('tbr') or tbr,
1414                             'width': f.get('width') or width,
1415                             'height': f.get('height') or height,
1416                             'format_id': f.get('format_id') if not tbr else format_id,
1417                             'vcodec': vcodec,
1418                         })
1419                     formats.extend(f4m_formats)
1420                     continue
1421                 elif ext == 'm3u8':
1422                     formats.extend(self._extract_m3u8_formats(
1423                         manifest_url, video_id, 'mp4', preference=preference,
1424                         m3u8_id=m3u8_id, fatal=fatal))
1425                     continue
1426             formats.append({
1427                 'format_id': format_id,
1428                 'url': manifest_url,
1429                 'manifest_url': manifest_url,
1430                 'ext': 'flv' if bootstrap_info is not None else None,
1431                 'protocol': 'f4m',
1432                 'tbr': tbr,
1433                 'width': width,
1434                 'height': height,
1435                 'vcodec': vcodec,
1436                 'preference': preference,
1437             })
1438         return formats
1439
1440     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1441         return {
1442             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1443             'url': m3u8_url,
1444             'ext': ext,
1445             'protocol': 'm3u8',
1446             'preference': preference - 100 if preference else -100,
1447             'resolution': 'multiple',
1448             'format_note': 'Quality selection URL',
1449         }
1450
1451     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1452                               entry_protocol='m3u8', preference=None,
1453                               m3u8_id=None, note=None, errnote=None,
1454                               fatal=True, live=False):
1455         res = self._download_webpage_handle(
1456             m3u8_url, video_id,
1457             note=note or 'Downloading m3u8 information',
1458             errnote=errnote or 'Failed to download m3u8 information',
1459             fatal=fatal)
1460
1461         if res is False:
1462             return []
1463
1464         m3u8_doc, urlh = res
1465         m3u8_url = urlh.geturl()
1466
1467         return self._parse_m3u8_formats(
1468             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1469             preference=preference, m3u8_id=m3u8_id, live=live)
1470
1471     def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1472                             entry_protocol='m3u8', preference=None,
1473                             m3u8_id=None, live=False):
1474         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1475             return []
1476
1477         if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
1478             return []
1479
1480         formats = []
1481
1482         format_url = lambda u: (
1483             u
1484             if re.match(r'^https?://', u)
1485             else compat_urlparse.urljoin(m3u8_url, u))
1486
1487         # References:
1488         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1489         # 2. https://github.com/rg3/youtube-dl/issues/12211
1490
1491         # We should try extracting formats only from master playlists [1, 4.3.4],
1492         # i.e. playlists that describe available qualities. On the other hand
1493         # media playlists [1, 4.3.3] should be returned as is since they contain
1494         # just the media without qualities renditions.
1495         # Fortunately, master playlist can be easily distinguished from media
1496         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1497         # master playlist tags MUST NOT appear in a media playist and vice versa.
1498         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1499         # media playlist and MUST NOT appear in master playlist thus we can
1500         # clearly detect media playlist with this criterion.
1501
1502         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1503             return [{
1504                 'url': m3u8_url,
1505                 'format_id': m3u8_id,
1506                 'ext': ext,
1507                 'protocol': entry_protocol,
1508                 'preference': preference,
1509             }]
1510
1511         groups = {}
1512         last_stream_inf = {}
1513
1514         def extract_media(x_media_line):
1515             media = parse_m3u8_attributes(x_media_line)
1516             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1517             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1518             if not (media_type and group_id and name):
1519                 return
1520             groups.setdefault(group_id, []).append(media)
1521             if media_type not in ('VIDEO', 'AUDIO'):
1522                 return
1523             media_url = media.get('URI')
1524             if media_url:
1525                 format_id = []
1526                 for v in (m3u8_id, group_id, name):
1527                     if v:
1528                         format_id.append(v)
1529                 f = {
1530                     'format_id': '-'.join(format_id),
1531                     'url': format_url(media_url),
1532                     'manifest_url': m3u8_url,
1533                     'language': media.get('LANGUAGE'),
1534                     'ext': ext,
1535                     'protocol': entry_protocol,
1536                     'preference': preference,
1537                 }
1538                 if media_type == 'AUDIO':
1539                     f['vcodec'] = 'none'
1540                 formats.append(f)
1541
1542         def build_stream_name():
1543             # Despite specification does not mention NAME attribute for
1544             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1545             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1546             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1547             stream_name = last_stream_inf.get('NAME')
1548             if stream_name:
1549                 return stream_name
1550             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1551             # from corresponding rendition group
1552             stream_group_id = last_stream_inf.get('VIDEO')
1553             if not stream_group_id:
1554                 return
1555             stream_group = groups.get(stream_group_id)
1556             if not stream_group:
1557                 return stream_group_id
1558             rendition = stream_group[0]
1559             return rendition.get('NAME') or stream_group_id
1560
1561         for line in m3u8_doc.splitlines():
1562             if line.startswith('#EXT-X-STREAM-INF:'):
1563                 last_stream_inf = parse_m3u8_attributes(line)
1564             elif line.startswith('#EXT-X-MEDIA:'):
1565                 extract_media(line)
1566             elif line.startswith('#') or not line.strip():
1567                 continue
1568             else:
1569                 tbr = float_or_none(
1570                     last_stream_inf.get('AVERAGE-BANDWIDTH') or
1571                     last_stream_inf.get('BANDWIDTH'), scale=1000)
1572                 format_id = []
1573                 if m3u8_id:
1574                     format_id.append(m3u8_id)
1575                 stream_name = build_stream_name()
1576                 # Bandwidth of live streams may differ over time thus making
1577                 # format_id unpredictable. So it's better to keep provided
1578                 # format_id intact.
1579                 if not live:
1580                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1581                 manifest_url = format_url(line.strip())
1582                 f = {
1583                     'format_id': '-'.join(format_id),
1584                     'url': manifest_url,
1585                     'manifest_url': m3u8_url,
1586                     'tbr': tbr,
1587                     'ext': ext,
1588                     'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1589                     'protocol': entry_protocol,
1590                     'preference': preference,
1591                 }
1592                 resolution = last_stream_inf.get('RESOLUTION')
1593                 if resolution:
1594                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1595                     if mobj:
1596                         f['width'] = int(mobj.group('width'))
1597                         f['height'] = int(mobj.group('height'))
1598                 # Unified Streaming Platform
1599                 mobj = re.search(
1600                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1601                 if mobj:
1602                     abr, vbr = mobj.groups()
1603                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1604                     f.update({
1605                         'vbr': vbr,
1606                         'abr': abr,
1607                     })
1608                 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1609                 f.update(codecs)
1610                 audio_group_id = last_stream_inf.get('AUDIO')
1611                 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1612                 # references a rendition group MUST have a CODECS attribute.
1613                 # However, this is not always respected, for example, [2]
1614                 # contains EXT-X-STREAM-INF tag which references AUDIO
1615                 # rendition group but does not have CODECS and despite
1616                 # referencing audio group an audio group, it represents
1617                 # a complete (with audio and video) format. So, for such cases
1618                 # we will ignore references to rendition groups and treat them
1619                 # as complete formats.
1620                 if audio_group_id and codecs and f.get('vcodec') != 'none':
1621                     audio_group = groups.get(audio_group_id)
1622                     if audio_group and audio_group[0].get('URI'):
1623                         # TODO: update acodec for audio only formats with
1624                         # the same GROUP-ID
1625                         f['acodec'] = 'none'
1626                 formats.append(f)
1627                 last_stream_inf = {}
1628         return formats
1629
1630     @staticmethod
1631     def _xpath_ns(path, namespace=None):
1632         if not namespace:
1633             return path
1634         out = []
1635         for c in path.split('/'):
1636             if not c or c == '.':
1637                 out.append(c)
1638             else:
1639                 out.append('{%s}%s' % (namespace, c))
1640         return '/'.join(out)
1641
1642     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1643         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1644
1645         if smil is False:
1646             assert not fatal
1647             return []
1648
1649         namespace = self._parse_smil_namespace(smil)
1650
1651         return self._parse_smil_formats(
1652             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1653
1654     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1655         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1656         if smil is False:
1657             return {}
1658         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1659
1660     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1661         return self._download_xml(
1662             smil_url, video_id, 'Downloading SMIL file',
1663             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1664
1665     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1666         namespace = self._parse_smil_namespace(smil)
1667
1668         formats = self._parse_smil_formats(
1669             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1670         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1671
1672         video_id = os.path.splitext(url_basename(smil_url))[0]
1673         title = None
1674         description = None
1675         upload_date = None
1676         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1677             name = meta.attrib.get('name')
1678             content = meta.attrib.get('content')
1679             if not name or not content:
1680                 continue
1681             if not title and name == 'title':
1682                 title = content
1683             elif not description and name in ('description', 'abstract'):
1684                 description = content
1685             elif not upload_date and name == 'date':
1686                 upload_date = unified_strdate(content)
1687
1688         thumbnails = [{
1689             'id': image.get('type'),
1690             'url': image.get('src'),
1691             'width': int_or_none(image.get('width')),
1692             'height': int_or_none(image.get('height')),
1693         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1694
1695         return {
1696             'id': video_id,
1697             'title': title or video_id,
1698             'description': description,
1699             'upload_date': upload_date,
1700             'thumbnails': thumbnails,
1701             'formats': formats,
1702             'subtitles': subtitles,
1703         }
1704
1705     def _parse_smil_namespace(self, smil):
1706         return self._search_regex(
1707             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1708
1709     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1710         base = smil_url
1711         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1712             b = meta.get('base') or meta.get('httpBase')
1713             if b:
1714                 base = b
1715                 break
1716
1717         formats = []
1718         rtmp_count = 0
1719         http_count = 0
1720         m3u8_count = 0
1721
1722         srcs = []
1723         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1724         for medium in media:
1725             src = medium.get('src')
1726             if not src or src in srcs:
1727                 continue
1728             srcs.append(src)
1729
1730             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1731             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1732             width = int_or_none(medium.get('width'))
1733             height = int_or_none(medium.get('height'))
1734             proto = medium.get('proto')
1735             ext = medium.get('ext')
1736             src_ext = determine_ext(src)
1737             streamer = medium.get('streamer') or base
1738
1739             if proto == 'rtmp' or streamer.startswith('rtmp'):
1740                 rtmp_count += 1
1741                 formats.append({
1742                     'url': streamer,
1743                     'play_path': src,
1744                     'ext': 'flv',
1745                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1746                     'tbr': bitrate,
1747                     'filesize': filesize,
1748                     'width': width,
1749                     'height': height,
1750                 })
1751                 if transform_rtmp_url:
1752                     streamer, src = transform_rtmp_url(streamer, src)
1753                     formats[-1].update({
1754                         'url': streamer,
1755                         'play_path': src,
1756                     })
1757                 continue
1758
1759             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1760             src_url = src_url.strip()
1761
1762             if proto == 'm3u8' or src_ext == 'm3u8':
1763                 m3u8_formats = self._extract_m3u8_formats(
1764                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1765                 if len(m3u8_formats) == 1:
1766                     m3u8_count += 1
1767                     m3u8_formats[0].update({
1768                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1769                         'tbr': bitrate,
1770                         'width': width,
1771                         'height': height,
1772                     })
1773                 formats.extend(m3u8_formats)
1774                 continue
1775
1776             if src_ext == 'f4m':
1777                 f4m_url = src_url
1778                 if not f4m_params:
1779                     f4m_params = {
1780                         'hdcore': '3.2.0',
1781                         'plugin': 'flowplayer-3.2.0.1',
1782                     }
1783                 f4m_url += '&' if '?' in f4m_url else '?'
1784                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1785                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1786                 continue
1787
1788             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1789                 http_count += 1
1790                 formats.append({
1791                     'url': src_url,
1792                     'ext': ext or src_ext or 'flv',
1793                     'format_id': 'http-%d' % (bitrate or http_count),
1794                     'tbr': bitrate,
1795                     'filesize': filesize,
1796                     'width': width,
1797                     'height': height,
1798                 })
1799                 continue
1800
1801         return formats
1802
1803     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1804         urls = []
1805         subtitles = {}
1806         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1807             src = textstream.get('src')
1808             if not src or src in urls:
1809                 continue
1810             urls.append(src)
1811             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1812             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1813             subtitles.setdefault(lang, []).append({
1814                 'url': src,
1815                 'ext': ext,
1816             })
1817         return subtitles
1818
1819     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1820         xspf = self._download_xml(
1821             xspf_url, playlist_id, 'Downloading xpsf playlist',
1822             'Unable to download xspf manifest', fatal=fatal)
1823         if xspf is False:
1824             return []
1825         return self._parse_xspf(
1826             xspf, playlist_id, xspf_url=xspf_url,
1827             xspf_base_url=base_url(xspf_url))
1828
1829     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1830         NS_MAP = {
1831             'xspf': 'http://xspf.org/ns/0/',
1832             's1': 'http://static.streamone.nl/player/ns/0',
1833         }
1834
1835         entries = []
1836         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1837             title = xpath_text(
1838                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1839             description = xpath_text(
1840                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1841             thumbnail = xpath_text(
1842                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1843             duration = float_or_none(
1844                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1845
1846             formats = []
1847             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1848                 format_url = urljoin(xspf_base_url, location.text)
1849                 if not format_url:
1850                     continue
1851                 formats.append({
1852                     'url': format_url,
1853                     'manifest_url': xspf_url,
1854                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1855                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1856                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1857                 })
1858             self._sort_formats(formats)
1859
1860             entries.append({
1861                 'id': playlist_id,
1862                 'title': title,
1863                 'description': description,
1864                 'thumbnail': thumbnail,
1865                 'duration': duration,
1866                 'formats': formats,
1867             })
1868         return entries
1869
1870     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1871         res = self._download_xml_handle(
1872             mpd_url, video_id,
1873             note=note or 'Downloading MPD manifest',
1874             errnote=errnote or 'Failed to download MPD manifest',
1875             fatal=fatal)
1876         if res is False:
1877             return []
1878         mpd_doc, urlh = res
1879         mpd_base_url = base_url(urlh.geturl())
1880
1881         return self._parse_mpd_formats(
1882             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1883             formats_dict=formats_dict, mpd_url=mpd_url)
1884
1885     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1886         """
1887         Parse formats from MPD manifest.
1888         References:
1889          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1890             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1891          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1892         """
1893         if mpd_doc.get('type') == 'dynamic':
1894             return []
1895
1896         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1897
1898         def _add_ns(path):
1899             return self._xpath_ns(path, namespace)
1900
1901         def is_drm_protected(element):
1902             return element.find(_add_ns('ContentProtection')) is not None
1903
1904         def extract_multisegment_info(element, ms_parent_info):
1905             ms_info = ms_parent_info.copy()
1906
1907             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1908             # common attributes and elements.  We will only extract relevant
1909             # for us.
1910             def extract_common(source):
1911                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1912                 if segment_timeline is not None:
1913                     s_e = segment_timeline.findall(_add_ns('S'))
1914                     if s_e:
1915                         ms_info['total_number'] = 0
1916                         ms_info['s'] = []
1917                         for s in s_e:
1918                             r = int(s.get('r', 0))
1919                             ms_info['total_number'] += 1 + r
1920                             ms_info['s'].append({
1921                                 't': int(s.get('t', 0)),
1922                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1923                                 'd': int(s.attrib['d']),
1924                                 'r': r,
1925                             })
1926                 start_number = source.get('startNumber')
1927                 if start_number:
1928                     ms_info['start_number'] = int(start_number)
1929                 timescale = source.get('timescale')
1930                 if timescale:
1931                     ms_info['timescale'] = int(timescale)
1932                 segment_duration = source.get('duration')
1933                 if segment_duration:
1934                     ms_info['segment_duration'] = float(segment_duration)
1935
1936             def extract_Initialization(source):
1937                 initialization = source.find(_add_ns('Initialization'))
1938                 if initialization is not None:
1939                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1940
1941             segment_list = element.find(_add_ns('SegmentList'))
1942             if segment_list is not None:
1943                 extract_common(segment_list)
1944                 extract_Initialization(segment_list)
1945                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1946                 if segment_urls_e:
1947                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1948             else:
1949                 segment_template = element.find(_add_ns('SegmentTemplate'))
1950                 if segment_template is not None:
1951                     extract_common(segment_template)
1952                     media = segment_template.get('media')
1953                     if media:
1954                         ms_info['media'] = media
1955                     initialization = segment_template.get('initialization')
1956                     if initialization:
1957                         ms_info['initialization'] = initialization
1958                     else:
1959                         extract_Initialization(segment_template)
1960             return ms_info
1961
1962         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1963         formats = []
1964         for period in mpd_doc.findall(_add_ns('Period')):
1965             period_duration = parse_duration(period.get('duration')) or mpd_duration
1966             period_ms_info = extract_multisegment_info(period, {
1967                 'start_number': 1,
1968                 'timescale': 1,
1969             })
1970             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1971                 if is_drm_protected(adaptation_set):
1972                     continue
1973                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1974                 for representation in adaptation_set.findall(_add_ns('Representation')):
1975                     if is_drm_protected(representation):
1976                         continue
1977                     representation_attrib = adaptation_set.attrib.copy()
1978                     representation_attrib.update(representation.attrib)
1979                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1980                     mime_type = representation_attrib['mimeType']
1981                     content_type = mime_type.split('/')[0]
1982                     if content_type == 'text':
1983                         # TODO implement WebVTT downloading
1984                         pass
1985                     elif content_type in ('video', 'audio'):
1986                         base_url = ''
1987                         for element in (representation, adaptation_set, period, mpd_doc):
1988                             base_url_e = element.find(_add_ns('BaseURL'))
1989                             if base_url_e is not None:
1990                                 base_url = base_url_e.text + base_url
1991                                 if re.match(r'^https?://', base_url):
1992                                     break
1993                         if mpd_base_url and not re.match(r'^https?://', base_url):
1994                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1995                                 mpd_base_url += '/'
1996                             base_url = mpd_base_url + base_url
1997                         representation_id = representation_attrib.get('id')
1998                         lang = representation_attrib.get('lang')
1999                         url_el = representation.find(_add_ns('BaseURL'))
2000                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2001                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2002                         f = {
2003                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2004                             'url': base_url,
2005                             'manifest_url': mpd_url,
2006                             'ext': mimetype2ext(mime_type),
2007                             'width': int_or_none(representation_attrib.get('width')),
2008                             'height': int_or_none(representation_attrib.get('height')),
2009                             'tbr': float_or_none(bandwidth, 1000),
2010                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2011                             'fps': int_or_none(representation_attrib.get('frameRate')),
2012                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2013                             'format_note': 'DASH %s' % content_type,
2014                             'filesize': filesize,
2015                             'container': mimetype2ext(mime_type) + '_dash',
2016                         }
2017                         f.update(parse_codecs(representation_attrib.get('codecs')))
2018                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2019
2020                         def prepare_template(template_name, identifiers):
2021                             t = representation_ms_info[template_name]
2022                             t = t.replace('$RepresentationID$', representation_id)
2023                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2024                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2025                             t.replace('$$', '$')
2026                             return t
2027
2028                         # @initialization is a regular template like @media one
2029                         # so it should be handled just the same way (see
2030                         # https://github.com/rg3/youtube-dl/issues/11605)
2031                         if 'initialization' in representation_ms_info:
2032                             initialization_template = prepare_template(
2033                                 'initialization',
2034                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2035                                 # $Time$ shall not be included for @initialization thus
2036                                 # only $Bandwidth$ remains
2037                                 ('Bandwidth', ))
2038                             representation_ms_info['initialization_url'] = initialization_template % {
2039                                 'Bandwidth': bandwidth,
2040                             }
2041
2042                         def location_key(location):
2043                             return 'url' if re.match(r'^https?://', location) else 'path'
2044
2045                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2046
2047                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2048                             media_location_key = location_key(media_template)
2049
2050                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2051                             # can't be used at the same time
2052                             if '%(Number' in media_template and 's' not in representation_ms_info:
2053                                 segment_duration = None
2054                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2055                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2056                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2057                                 representation_ms_info['fragments'] = [{
2058                                     media_location_key: media_template % {
2059                                         'Number': segment_number,
2060                                         'Bandwidth': bandwidth,
2061                                     },
2062                                     'duration': segment_duration,
2063                                 } for segment_number in range(
2064                                     representation_ms_info['start_number'],
2065                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2066                             else:
2067                                 # $Number*$ or $Time$ in media template with S list available
2068                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2069                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2070                                 representation_ms_info['fragments'] = []
2071                                 segment_time = 0
2072                                 segment_d = None
2073                                 segment_number = representation_ms_info['start_number']
2074
2075                                 def add_segment_url():
2076                                     segment_url = media_template % {
2077                                         'Time': segment_time,
2078                                         'Bandwidth': bandwidth,
2079                                         'Number': segment_number,
2080                                     }
2081                                     representation_ms_info['fragments'].append({
2082                                         media_location_key: segment_url,
2083                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2084                                     })
2085
2086                                 for num, s in enumerate(representation_ms_info['s']):
2087                                     segment_time = s.get('t') or segment_time
2088                                     segment_d = s['d']
2089                                     add_segment_url()
2090                                     segment_number += 1
2091                                     for r in range(s.get('r', 0)):
2092                                         segment_time += segment_d
2093                                         add_segment_url()
2094                                         segment_number += 1
2095                                     segment_time += segment_d
2096                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2097                             # No media template
2098                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2099                             # or any YouTube dashsegments video
2100                             fragments = []
2101                             segment_index = 0
2102                             timescale = representation_ms_info['timescale']
2103                             for s in representation_ms_info['s']:
2104                                 duration = float_or_none(s['d'], timescale)
2105                                 for r in range(s.get('r', 0) + 1):
2106                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2107                                     fragments.append({
2108                                         location_key(segment_uri): segment_uri,
2109                                         'duration': duration,
2110                                     })
2111                                     segment_index += 1
2112                             representation_ms_info['fragments'] = fragments
2113                         elif 'segment_urls' in representation_ms_info:
2114                             # Segment URLs with no SegmentTimeline
2115                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2116                             # https://github.com/rg3/youtube-dl/pull/14844
2117                             fragments = []
2118                             segment_duration = float_or_none(
2119                                 representation_ms_info['segment_duration'],
2120                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2121                             for segment_url in representation_ms_info['segment_urls']:
2122                                 fragment = {
2123                                     location_key(segment_url): segment_url,
2124                                 }
2125                                 if segment_duration:
2126                                     fragment['duration'] = segment_duration
2127                                 fragments.append(fragment)
2128                             representation_ms_info['fragments'] = fragments
2129                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2130                         # No fragments key is present in this case.
2131                         if 'fragments' in representation_ms_info:
2132                             f.update({
2133                                 'fragment_base_url': base_url,
2134                                 'fragments': [],
2135                                 'protocol': 'http_dash_segments',
2136                             })
2137                             if 'initialization_url' in representation_ms_info:
2138                                 initialization_url = representation_ms_info['initialization_url']
2139                                 if not f.get('url'):
2140                                     f['url'] = initialization_url
2141                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2142                             f['fragments'].extend(representation_ms_info['fragments'])
2143                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2144                         # is not necessarily unique within a Period thus formats with
2145                         # the same `format_id` are quite possible. There are numerous examples
2146                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2147                         # https://github.com/rg3/youtube-dl/issues/13919)
2148                         full_info = formats_dict.get(representation_id, {}).copy()
2149                         full_info.update(f)
2150                         formats.append(full_info)
2151                     else:
2152                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2153         return formats
2154
2155     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2156         res = self._download_xml_handle(
2157             ism_url, video_id,
2158             note=note or 'Downloading ISM manifest',
2159             errnote=errnote or 'Failed to download ISM manifest',
2160             fatal=fatal)
2161         if res is False:
2162             return []
2163         ism_doc, urlh = res
2164
2165         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2166
2167     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2168         """
2169         Parse formats from ISM manifest.
2170         References:
2171          1. [MS-SSTR]: Smooth Streaming Protocol,
2172             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2173         """
2174         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2175             return []
2176
2177         duration = int(ism_doc.attrib['Duration'])
2178         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2179
2180         formats = []
2181         for stream in ism_doc.findall('StreamIndex'):
2182             stream_type = stream.get('Type')
2183             if stream_type not in ('video', 'audio'):
2184                 continue
2185             url_pattern = stream.attrib['Url']
2186             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2187             stream_name = stream.get('Name')
2188             for track in stream.findall('QualityLevel'):
2189                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2190                 # TODO: add support for WVC1 and WMAP
2191                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2192                     self.report_warning('%s is not a supported codec' % fourcc)
2193                     continue
2194                 tbr = int(track.attrib['Bitrate']) // 1000
2195                 # [1] does not mention Width and Height attributes. However,
2196                 # they're often present while MaxWidth and MaxHeight are
2197                 # missing, so should be used as fallbacks
2198                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2199                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2200                 sampling_rate = int_or_none(track.get('SamplingRate'))
2201
2202                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2203                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2204
2205                 fragments = []
2206                 fragment_ctx = {
2207                     'time': 0,
2208                 }
2209                 stream_fragments = stream.findall('c')
2210                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2211                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2212                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2213                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2214                     if not fragment_ctx['duration']:
2215                         try:
2216                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2217                         except IndexError:
2218                             next_fragment_time = duration
2219                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2220                     for _ in range(fragment_repeat):
2221                         fragments.append({
2222                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2223                             'duration': fragment_ctx['duration'] / stream_timescale,
2224                         })
2225                         fragment_ctx['time'] += fragment_ctx['duration']
2226
2227                 format_id = []
2228                 if ism_id:
2229                     format_id.append(ism_id)
2230                 if stream_name:
2231                     format_id.append(stream_name)
2232                 format_id.append(compat_str(tbr))
2233
2234                 formats.append({
2235                     'format_id': '-'.join(format_id),
2236                     'url': ism_url,
2237                     'manifest_url': ism_url,
2238                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2239                     'width': width,
2240                     'height': height,
2241                     'tbr': tbr,
2242                     'asr': sampling_rate,
2243                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2244                     'acodec': 'none' if stream_type == 'video' else fourcc,
2245                     'protocol': 'ism',
2246                     'fragments': fragments,
2247                     '_download_params': {
2248                         'duration': duration,
2249                         'timescale': stream_timescale,
2250                         'width': width or 0,
2251                         'height': height or 0,
2252                         'fourcc': fourcc,
2253                         'codec_private_data': track.get('CodecPrivateData'),
2254                         'sampling_rate': sampling_rate,
2255                         'channels': int_or_none(track.get('Channels', 2)),
2256                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2257                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2258                     },
2259                 })
2260         return formats
2261
2262     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2263         def absolute_url(item_url):
2264             return urljoin(base_url, item_url)
2265
2266         def parse_content_type(content_type):
2267             if not content_type:
2268                 return {}
2269             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2270             if ctr:
2271                 mimetype, codecs = ctr.groups()
2272                 f = parse_codecs(codecs)
2273                 f['ext'] = mimetype2ext(mimetype)
2274                 return f
2275             return {}
2276
2277         def _media_formats(src, cur_media_type, type_info={}):
2278             full_url = absolute_url(src)
2279             ext = type_info.get('ext') or determine_ext(full_url)
2280             if ext == 'm3u8':
2281                 is_plain_url = False
2282                 formats = self._extract_m3u8_formats(
2283                     full_url, video_id, ext='mp4',
2284                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2285                     preference=preference, fatal=False)
2286             elif ext == 'mpd':
2287                 is_plain_url = False
2288                 formats = self._extract_mpd_formats(
2289                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2290             else:
2291                 is_plain_url = True
2292                 formats = [{
2293                     'url': full_url,
2294                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2295                 }]
2296             return is_plain_url, formats
2297
2298         entries = []
2299         # amp-video and amp-audio are very similar to their HTML5 counterparts
2300         # so we wll include them right here (see
2301         # https://www.ampproject.org/docs/reference/components/amp-video)
2302         media_tags = [(media_tag, media_type, '')
2303                       for media_tag, media_type
2304                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2305         media_tags.extend(re.findall(
2306             # We only allow video|audio followed by a whitespace or '>'.
2307             # Allowing more characters may end up in significant slow down (see
2308             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2309             # http://www.porntrex.com/maps/videositemap.xml).
2310             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2311         for media_tag, media_type, media_content in media_tags:
2312             media_info = {
2313                 'formats': [],
2314                 'subtitles': {},
2315             }
2316             media_attributes = extract_attributes(media_tag)
2317             src = media_attributes.get('src')
2318             if src:
2319                 _, formats = _media_formats(src, media_type)
2320                 media_info['formats'].extend(formats)
2321             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2322             if media_content:
2323                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2324                     source_attributes = extract_attributes(source_tag)
2325                     src = source_attributes.get('src')
2326                     if not src:
2327                         continue
2328                     f = parse_content_type(source_attributes.get('type'))
2329                     is_plain_url, formats = _media_formats(src, media_type, f)
2330                     if is_plain_url:
2331                         # res attribute is not standard but seen several times
2332                         # in the wild
2333                         f.update({
2334                             'height': int_or_none(source_attributes.get('res')),
2335                             'format_id': source_attributes.get('label'),
2336                         })
2337                         f.update(formats[0])
2338                         media_info['formats'].append(f)
2339                     else:
2340                         media_info['formats'].extend(formats)
2341                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2342                     track_attributes = extract_attributes(track_tag)
2343                     kind = track_attributes.get('kind')
2344                     if not kind or kind in ('subtitles', 'captions'):
2345                         src = track_attributes.get('src')
2346                         if not src:
2347                             continue
2348                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2349                         media_info['subtitles'].setdefault(lang, []).append({
2350                             'url': absolute_url(src),
2351                         })
2352             if media_info['formats'] or media_info['subtitles']:
2353                 entries.append(media_info)
2354         return entries
2355
2356     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2357         formats = []
2358         hdcore_sign = 'hdcore=3.7.0'
2359         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2360         hds_host = hosts.get('hds')
2361         if hds_host:
2362             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2363         if 'hdcore=' not in f4m_url:
2364             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2365         f4m_formats = self._extract_f4m_formats(
2366             f4m_url, video_id, f4m_id='hds', fatal=False)
2367         for entry in f4m_formats:
2368             entry.update({'extra_param_to_segment_url': hdcore_sign})
2369         formats.extend(f4m_formats)
2370         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2371         hls_host = hosts.get('hls')
2372         if hls_host:
2373             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2374         formats.extend(self._extract_m3u8_formats(
2375             m3u8_url, video_id, 'mp4', 'm3u8_native',
2376             m3u8_id='hls', fatal=False))
2377         return formats
2378
2379     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2380         query = compat_urlparse.urlparse(url).query
2381         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2382         mobj = re.search(
2383             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2384         url_base = mobj.group('url')
2385         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2386         formats = []
2387
2388         def manifest_url(manifest):
2389             m_url = '%s/%s' % (http_base_url, manifest)
2390             if query:
2391                 m_url += '?%s' % query
2392             return m_url
2393
2394         if 'm3u8' not in skip_protocols:
2395             formats.extend(self._extract_m3u8_formats(
2396                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2397                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2398         if 'f4m' not in skip_protocols:
2399             formats.extend(self._extract_f4m_formats(
2400                 manifest_url('manifest.f4m'),
2401                 video_id, f4m_id='hds', fatal=False))
2402         if 'dash' not in skip_protocols:
2403             formats.extend(self._extract_mpd_formats(
2404                 manifest_url('manifest.mpd'),
2405                 video_id, mpd_id='dash', fatal=False))
2406         if re.search(r'(?:/smil:|\.smil)', url_base):
2407             if 'smil' not in skip_protocols:
2408                 rtmp_formats = self._extract_smil_formats(
2409                     manifest_url('jwplayer.smil'),
2410                     video_id, fatal=False)
2411                 for rtmp_format in rtmp_formats:
2412                     rtsp_format = rtmp_format.copy()
2413                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2414                     del rtsp_format['play_path']
2415                     del rtsp_format['ext']
2416                     rtsp_format.update({
2417                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2418                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2419                         'protocol': 'rtsp',
2420                     })
2421                     formats.extend([rtmp_format, rtsp_format])
2422         else:
2423             for protocol in ('rtmp', 'rtsp'):
2424                 if protocol not in skip_protocols:
2425                     formats.append({
2426                         'url': '%s:%s' % (protocol, url_base),
2427                         'format_id': protocol,
2428                         'protocol': protocol,
2429                     })
2430         return formats
2431
2432     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2433         mobj = re.search(
2434             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2435             webpage)
2436         if mobj:
2437             try:
2438                 jwplayer_data = self._parse_json(mobj.group('options'),
2439                                                  video_id=video_id,
2440                                                  transform_source=transform_source)
2441             except ExtractorError:
2442                 pass
2443             else:
2444                 if isinstance(jwplayer_data, dict):
2445                     return jwplayer_data
2446
2447     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2448         jwplayer_data = self._find_jwplayer_data(
2449             webpage, video_id, transform_source=js_to_json)
2450         return self._parse_jwplayer_data(
2451             jwplayer_data, video_id, *args, **kwargs)
2452
2453     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2454                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2455         # JWPlayer backward compatibility: flattened playlists
2456         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2457         if 'playlist' not in jwplayer_data:
2458             jwplayer_data = {'playlist': [jwplayer_data]}
2459
2460         entries = []
2461
2462         # JWPlayer backward compatibility: single playlist item
2463         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2464         if not isinstance(jwplayer_data['playlist'], list):
2465             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2466
2467         for video_data in jwplayer_data['playlist']:
2468             # JWPlayer backward compatibility: flattened sources
2469             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2470             if 'sources' not in video_data:
2471                 video_data['sources'] = [video_data]
2472
2473             this_video_id = video_id or video_data['mediaid']
2474
2475             formats = self._parse_jwplayer_formats(
2476                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2477                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2478
2479             subtitles = {}
2480             tracks = video_data.get('tracks')
2481             if tracks and isinstance(tracks, list):
2482                 for track in tracks:
2483                     if not isinstance(track, dict):
2484                         continue
2485                     track_kind = track.get('kind')
2486                     if not track_kind or not isinstance(track_kind, compat_str):
2487                         continue
2488                     if track_kind.lower() not in ('captions', 'subtitles'):
2489                         continue
2490                     track_url = urljoin(base_url, track.get('file'))
2491                     if not track_url:
2492                         continue
2493                     subtitles.setdefault(track.get('label') or 'en', []).append({
2494                         'url': self._proto_relative_url(track_url)
2495                     })
2496
2497             entry = {
2498                 'id': this_video_id,
2499                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2500                 'description': video_data.get('description'),
2501                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2502                 'timestamp': int_or_none(video_data.get('pubdate')),
2503                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2504                 'subtitles': subtitles,
2505             }
2506             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2507             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2508                 entry.update({
2509                     '_type': 'url_transparent',
2510                     'url': formats[0]['url'],
2511                 })
2512             else:
2513                 self._sort_formats(formats)
2514                 entry['formats'] = formats
2515             entries.append(entry)
2516         if len(entries) == 1:
2517             return entries[0]
2518         else:
2519             return self.playlist_result(entries)
2520
2521     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2522                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2523         urls = []
2524         formats = []
2525         for source in jwplayer_sources_data:
2526             if not isinstance(source, dict):
2527                 continue
2528             source_url = self._proto_relative_url(source.get('file'))
2529             if not source_url:
2530                 continue
2531             if base_url:
2532                 source_url = compat_urlparse.urljoin(base_url, source_url)
2533             if source_url in urls:
2534                 continue
2535             urls.append(source_url)
2536             source_type = source.get('type') or ''
2537             ext = mimetype2ext(source_type) or determine_ext(source_url)
2538             if source_type == 'hls' or ext == 'm3u8':
2539                 formats.extend(self._extract_m3u8_formats(
2540                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2541                     m3u8_id=m3u8_id, fatal=False))
2542             elif source_type == 'dash' or ext == 'mpd':
2543                 formats.extend(self._extract_mpd_formats(
2544                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2545             elif ext == 'smil':
2546                 formats.extend(self._extract_smil_formats(
2547                     source_url, video_id, fatal=False))
2548             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2549             elif source_type.startswith('audio') or ext in (
2550                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2551                 formats.append({
2552                     'url': source_url,
2553                     'vcodec': 'none',
2554                     'ext': ext,
2555                 })
2556             else:
2557                 height = int_or_none(source.get('height'))
2558                 if height is None:
2559                     # Often no height is provided but there is a label in
2560                     # format like "1080p", "720p SD", or 1080.
2561                     height = int_or_none(self._search_regex(
2562                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2563                         'height', default=None))
2564                 a_format = {
2565                     'url': source_url,
2566                     'width': int_or_none(source.get('width')),
2567                     'height': height,
2568                     'tbr': int_or_none(source.get('bitrate')),
2569                     'ext': ext,
2570                 }
2571                 if source_url.startswith('rtmp'):
2572                     a_format['ext'] = 'flv'
2573                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2574                     # of jwplayer.flash.swf
2575                     rtmp_url_parts = re.split(
2576                         r'((?:mp4|mp3|flv):)', source_url, 1)
2577                     if len(rtmp_url_parts) == 3:
2578                         rtmp_url, prefix, play_path = rtmp_url_parts
2579                         a_format.update({
2580                             'url': rtmp_url,
2581                             'play_path': prefix + play_path,
2582                         })
2583                     if rtmp_params:
2584                         a_format.update(rtmp_params)
2585                 formats.append(a_format)
2586         return formats
2587
2588     def _live_title(self, name):
2589         """ Generate the title for a live video """
2590         now = datetime.datetime.now()
2591         now_str = now.strftime('%Y-%m-%d %H:%M')
2592         return name + ' ' + now_str
2593
2594     def _int(self, v, name, fatal=False, **kwargs):
2595         res = int_or_none(v, **kwargs)
2596         if 'get_attr' in kwargs:
2597             print(getattr(v, kwargs['get_attr']))
2598         if res is None:
2599             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2600             if fatal:
2601                 raise ExtractorError(msg)
2602             else:
2603                 self._downloader.report_warning(msg)
2604         return res
2605
2606     def _float(self, v, name, fatal=False, **kwargs):
2607         res = float_or_none(v, **kwargs)
2608         if res is None:
2609             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2610             if fatal:
2611                 raise ExtractorError(msg)
2612             else:
2613                 self._downloader.report_warning(msg)
2614         return res
2615
2616     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2617                     path='/', secure=False, discard=False, rest={}, **kwargs):
2618         cookie = compat_cookiejar.Cookie(
2619             0, name, value, port, port is not None, domain, True,
2620             domain.startswith('.'), path, True, secure, expire_time,
2621             discard, None, None, rest)
2622         self._downloader.cookiejar.set_cookie(cookie)
2623
2624     def _get_cookies(self, url):
2625         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2626         req = sanitized_Request(url)
2627         self._downloader.cookiejar.add_cookie_header(req)
2628         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2629
2630     def get_testcases(self, include_onlymatching=False):
2631         t = getattr(self, '_TEST', None)
2632         if t:
2633             assert not hasattr(self, '_TESTS'), \
2634                 '%s has _TEST and _TESTS' % type(self).__name__
2635             tests = [t]
2636         else:
2637             tests = getattr(self, '_TESTS', [])
2638         for t in tests:
2639             if not include_onlymatching and t.get('only_matching', False):
2640                 continue
2641             t['name'] = type(self).__name__[:-len('IE')]
2642             yield t
2643
2644     def is_suitable(self, age_limit):
2645         """ Test whether the extractor is generally suitable for the given
2646         age limit (i.e. pornographic sites are not, all others usually are) """
2647
2648         any_restricted = False
2649         for tc in self.get_testcases(include_onlymatching=False):
2650             if tc.get('playlist', []):
2651                 tc = tc['playlist'][0]
2652             is_restricted = age_restricted(
2653                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2654             if not is_restricted:
2655                 return True
2656             any_restricted = any_restricted or is_restricted
2657         return not any_restricted
2658
2659     def extract_subtitles(self, *args, **kwargs):
2660         if (self._downloader.params.get('writesubtitles', False) or
2661                 self._downloader.params.get('listsubtitles')):
2662             return self._get_subtitles(*args, **kwargs)
2663         return {}
2664
2665     def _get_subtitles(self, *args, **kwargs):
2666         raise NotImplementedError('This method must be implemented by subclasses')
2667
2668     @staticmethod
2669     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2670         """ Merge subtitle items for one language. Items with duplicated URLs
2671         will be dropped. """
2672         list1_urls = set([item['url'] for item in subtitle_list1])
2673         ret = list(subtitle_list1)
2674         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2675         return ret
2676
2677     @classmethod
2678     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2679         """ Merge two subtitle dictionaries, language by language. """
2680         ret = dict(subtitle_dict1)
2681         for lang in subtitle_dict2:
2682             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2683         return ret
2684
2685     def extract_automatic_captions(self, *args, **kwargs):
2686         if (self._downloader.params.get('writeautomaticsub', False) or
2687                 self._downloader.params.get('listsubtitles')):
2688             return self._get_automatic_captions(*args, **kwargs)
2689         return {}
2690
2691     def _get_automatic_captions(self, *args, **kwargs):
2692         raise NotImplementedError('This method must be implemented by subclasses')
2693
2694     def mark_watched(self, *args, **kwargs):
2695         if (self._downloader.params.get('mark_watched', False) and
2696                 (self._get_login_info()[0] is not None or
2697                     self._downloader.params.get('cookiefile') is not None)):
2698             self._mark_watched(*args, **kwargs)
2699
2700     def _mark_watched(self, *args, **kwargs):
2701         raise NotImplementedError('This method must be implemented by subclasses')
2702
2703     def geo_verification_headers(self):
2704         headers = {}
2705         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2706         if geo_verification_proxy:
2707             headers['Ytdl-request-proxy'] = geo_verification_proxy
2708         return headers
2709
2710     def _generic_id(self, url):
2711         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2712
2713     def _generic_title(self, url):
2714         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2715
2716
2717 class SearchInfoExtractor(InfoExtractor):
2718     """
2719     Base class for paged search queries extractors.
2720     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
2721     Instances should define _SEARCH_KEY and _MAX_RESULTS.
2722     """
2723
2724     @classmethod
2725     def _make_valid_url(cls):
2726         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2727
2728     @classmethod
2729     def suitable(cls, url):
2730         return re.match(cls._make_valid_url(), url) is not None
2731
2732     def _real_extract(self, query):
2733         mobj = re.match(self._make_valid_url(), query)
2734         if mobj is None:
2735             raise ExtractorError('Invalid search query "%s"' % query)
2736
2737         prefix = mobj.group('prefix')
2738         query = mobj.group('query')
2739         if prefix == '':
2740             return self._get_n_results(query, 1)
2741         elif prefix == 'all':
2742             return self._get_n_results(query, self._MAX_RESULTS)
2743         else:
2744             n = int(prefix)
2745             if n <= 0:
2746                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
2747             elif n > self._MAX_RESULTS:
2748                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
2749                 n = self._MAX_RESULTS
2750             return self._get_n_results(query, n)
2751
2752     def _get_n_results(self, query, n):
2753         """Get a specified number of results for a query"""
2754         raise NotImplementedError('This method must be implemented by subclasses')
2755
2756     @property
2757     def SEARCH_KEY(self):
2758         return self._SEARCH_KEY