_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16 import xml
  17
  18 from ..compat import (
  19     compat_cookiejar,
  20     compat_cookies,
  21     compat_etree_fromstring,
  22     compat_getpass,
  23     compat_integer_types,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_parse_unquote,
  29     compat_urllib_parse_urlencode,
  30     compat_urllib_request,
  31     compat_urlparse,
  32     compat_xml_parse_error,
  33 )
  34 from ..downloader.f4m import (
  35     get_base_url,
  36     remove_encrypted_media,
  37 )
  38 from ..utils import (
  39     NO_DEFAULT,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     error_to_compat_str,
  48     ExtractorError,
  49     extract_attributes,
  50     fix_xml_ampersands,
  51     float_or_none,
  52     GeoRestrictedError,
  53     GeoUtils,
  54     int_or_none,
  55     js_to_json,
  56     JSON_LD_RE,
  57     mimetype2ext,
  58     orderedSet,
  59     parse_codecs,
  60     parse_duration,
  61     parse_iso8601,
  62     parse_m3u8_attributes,
  63     RegexNotFoundError,
  64     sanitized_Request,
  65     sanitize_filename,
  66     unescapeHTML,
  67     unified_strdate,
  68     unified_timestamp,
  69     update_Request,
  70     update_url_query,
  71     urljoin,
  72     url_basename,
  73     url_or_none,
  74     xpath_element,
  75     xpath_text,
  76     xpath_with_ns,
  77 )
  78
  79
  80 class InfoExtractor(object):
  81     """Information Extractor class.
  82
  83     Information extractors are the classes that, given a URL, extract
  84     information about the video (or videos) the URL refers to. This
  85     information includes the real video URL, the video title, author and
  86     others. The information is stored in a dictionary which is then
  87     passed to the YoutubeDL. The YoutubeDL processes this
  88     information possibly downloading the video to the file system, among
  89     other possible outcomes.
  90
  91     The type field determines the type of the result.
  92     By far the most common value (and the default if _type is missing) is
  93     "video", which indicates a single video.
  94
  95     For a video, the dictionaries must include the following fields:
  96
  97     id:             Video identifier.
  98     title:          Video title, unescaped.
  99
 100     Additionally, it must contain either a formats entry or a url one:
 101
 102     formats:        A list of dictionaries for each format available, ordered
 103                     from worst to best quality.
 104
 105                     Potential fields:
 106                     * url        The mandatory URL representing the media:
 107                                    for plain file media - HTTP URL of this file,
 108                                    for RTMP - RTMP URL,
 109                                    for HLS - URL of the M3U8 media playlist,
 110                                    for HDS - URL of the F4M manifest,
 111                                    for DASH - URL of the MPD manifest,
 112                                    for MSS - URL of the ISM manifest.
 113                     * manifest_url
 114                                  The URL of the manifest file in case of
 115                                  fragmented media:
 116                                    for HLS - URL of the M3U8 master playlist,
 117                                    for HDS - URL of the F4M manifest,
 118                                    for DASH - URL of the MPD manifest,
 119                                    for MSS - URL of the ISM manifest.
 120                     * ext        Will be calculated from URL if missing
 121                     * format     A human-readable description of the format
 122                                  ("mp4 container with h264/opus").
 123                                  Calculated from the format_id, width, height.
 124                                  and format_note fields if missing.
 125                     * format_id  A short description of the format
 126                                  ("mp4_h264_opus" or "19").
 127                                 Technically optional, but strongly recommended.
 128                     * format_note Additional info about the format
 129                                  ("3D" or "DASH video")
 130                     * width      Width of the video, if known
 131                     * height     Height of the video, if known
 132                     * resolution Textual description of width and height
 133                     * tbr        Average bitrate of audio and video in KBit/s
 134                     * abr        Average audio bitrate in KBit/s
 135                     * acodec     Name of the audio codec in use
 136                     * asr        Audio sampling rate in Hertz
 137                     * vbr        Average video bitrate in KBit/s
 138                     * fps        Frame rate
 139                     * vcodec     Name of the video codec in use
 140                     * container  Name of the container format
 141                     * filesize   The number of bytes, if known in advance
 142                     * filesize_approx  An estimate for the number of bytes
 143                     * player_url SWF Player URL (used for rtmpdump).
 144                     * protocol   The protocol that will be used for the actual
 145                                  download, lower-case.
 146                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 147                                  "m3u8", "m3u8_native" or "http_dash_segments".
 148                     * fragment_base_url
 149                                  Base URL for fragments. Each fragment's path
 150                                  value (if present) will be relative to
 151                                  this URL.
 152                     * fragments  A list of fragments of a fragmented media.
 153                                  Each fragment entry must contain either an url
 154                                  or a path. If an url is present it should be
 155                                  considered by a client. Otherwise both path and
 156                                  fragment_base_url must be present. Here is
 157                                  the list of all potential fields:
 158                                  * "url" - fragment's URL
 159                                  * "path" - fragment's path relative to
 160                                             fragment_base_url
 161                                  * "duration" (optional, int or float)
 162                                  * "filesize" (optional, int)
 163                     * preference Order number of this format. If this field is
 164                                  present and not None, the formats get sorted
 165                                  by this field, regardless of all other values.
 166                                  -1 for default (order by other properties),
 167                                  -2 or smaller for less than default.
 168                                  < -1000 to hide the format (if there is
 169                                     another one which is strictly better)
 170                     * language   Language code, e.g. "de" or "en-US".
 171                     * language_preference  Is this in the language mentioned in
 172                                  the URL?
 173                                  10 if it's what the URL is about,
 174                                  -1 for default (don't know),
 175                                  -10 otherwise, other values reserved for now.
 176                     * quality    Order number of the video quality of this
 177                                  format, irrespective of the file format.
 178                                  -1 for default (order by other properties),
 179                                  -2 or smaller for less than default.
 180                     * source_preference  Order number for this video source
 181                                   (quality takes higher priority)
 182                                  -1 for default (order by other properties),
 183                                  -2 or smaller for less than default.
 184                     * http_headers  A dictionary of additional HTTP headers
 185                                  to add to the request.
 186                     * stretched_ratio  If given and not 1, indicates that the
 187                                  video's pixels are not square.
 188                                  width : height ratio as float.
 189                     * no_resume  The server does not support resuming the
 190                                  (HTTP or RTMP) download. Boolean.
 191                     * downloader_options  A dictionary of downloader options as
 192                                  described in FileDownloader
 193
 194     url:            Final video URL.
 195     ext:            Video filename extension.
 196     format:         The video format, defaults to ext (used for --get-format)
 197     player_url:     SWF Player URL (used for rtmpdump).
 198
 199     The following fields are optional:
 200
 201     alt_title:      A secondary title of the video.
 202     display_id      An alternative identifier for the video, not necessarily
 203                     unique, but available before title. Typically, id is
 204                     something like "4234987", title "Dancing naked mole rats",
 205                     and display_id "dancing-naked-mole-rats"
 206     thumbnails:     A list of dictionaries, with the following entries:
 207                         * "id" (optional, string) - Thumbnail format ID
 208                         * "url"
 209                         * "preference" (optional, int) - quality of the image
 210                         * "width" (optional, int)
 211                         * "height" (optional, int)
  212                         * "resolution" (optional, string "{width}x{height}",
 213                                         deprecated)
 214                         * "filesize" (optional, int)
 215     thumbnail:      Full URL to a video thumbnail image.
 216     description:    Full video description.
 217     uploader:       Full name of the video uploader.
 218     license:        License name the video is licensed under.
 219     creator:        The creator of the video.
 220     release_date:   The date (YYYYMMDD) when the video was released.
 221     timestamp:      UNIX timestamp of the moment the video became available.
 222     upload_date:    Video upload date (YYYYMMDD).
 223                     If not explicitly set, calculated from timestamp.
 224     uploader_id:    Nickname or id of the video uploader.
 225     uploader_url:   Full URL to a personal webpage of the video uploader.
 226     channel:        Full name of the channel the video is uploaded on.
 227                     Note that channel fields may or may not repeat uploader
 228                     fields. This depends on a particular extractor.
 229     channel_id:     Id of the channel.
 230     channel_url:    Full URL to a channel webpage.
 231     location:       Physical location where the video was filmed.
 232     subtitles:      The available subtitles as a dictionary in the format
 233                     {tag: subformats}. "tag" is usually a language code, and
 234                     "subformats" is a list sorted from lower to higher
 235                     preference, each element is a dictionary with the "ext"
 236                     entry and one of:
 237                         * "data": The subtitles file contents
 238                         * "url": A URL pointing to the subtitles file
 239                     "ext" will be calculated from URL if missing
 240     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 241                     automatically generated captions
 242     duration:       Length of the video in seconds, as an integer or float.
 243     view_count:     How many users have watched the video on the platform.
 244     like_count:     Number of positive ratings of the video
 245     dislike_count:  Number of negative ratings of the video
 246     repost_count:   Number of reposts of the video
  247     average_rating: Average rating given by users, the scale used depends on the webpage
 248     comment_count:  Number of comments on the video
 249     comments:       A list of comments, each with one or more of the following
 250                     properties (all but one of text or html optional):
 251                         * "author" - human-readable name of the comment author
 252                         * "author_id" - user ID of the comment author
 253                         * "id" - Comment ID
 254                         * "html" - Comment as HTML
 255                         * "text" - Plain text of the comment
 256                         * "timestamp" - UNIX timestamp of comment
 257                         * "parent" - ID of the comment this one is replying to.
 258                                      Set to "root" to indicate that this is a
 259                                      comment to the original video.
 260     age_limit:      Age restriction for the video, as an integer (years)
 261     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 262                     should allow to get the same result again. (It will be set
 263                     by YoutubeDL if it's missing)
 264     categories:     A list of categories that the video falls in, for example
 265                     ["Sports", "Berlin"]
 266     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 267     is_live:        True, False, or None (=unknown). Whether this video is a
 268                     live stream that goes on instead of a fixed-length video.
 269     start_time:     Time in seconds where the reproduction should start, as
 270                     specified in the URL.
 271     end_time:       Time in seconds where the reproduction should end, as
 272                     specified in the URL.
 273     chapters:       A list of dictionaries, with the following entries:
 274                         * "start_time" - The start time of the chapter in seconds
 275                         * "end_time" - The end time of the chapter in seconds
 276                         * "title" (optional, string)
 277
 278     The following fields should only be used when the video belongs to some logical
 279     chapter or section:
 280
 281     chapter:        Name or title of the chapter the video belongs to.
 282     chapter_number: Number of the chapter the video belongs to, as an integer.
 283     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 284
 285     The following fields should only be used when the video is an episode of some
 286     series, programme or podcast:
 287
 288     series:         Title of the series or programme the video episode belongs to.
 289     season:         Title of the season the video episode belongs to.
 290     season_number:  Number of the season the video episode belongs to, as an integer.
 291     season_id:      Id of the season the video episode belongs to, as a unicode string.
 292     episode:        Title of the video episode. Unlike mandatory video title field,
 293                     this field should denote the exact title of the video episode
 294                     without any kind of decoration.
 295     episode_number: Number of the video episode within a season, as an integer.
 296     episode_id:     Id of the video episode, as a unicode string.
 297
 298     The following fields should only be used when the media is a track or a part of
 299     a music album:
 300
 301     track:          Title of the track.
 302     track_number:   Number of the track within an album or a disc, as an integer.
 303     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 304                     as a unicode string.
 305     artist:         Artist(s) of the track.
 306     genre:          Genre(s) of the track.
 307     album:          Title of the album the track belongs to.
 308     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 309     album_artist:   List of all artists appeared on the album (e.g.
 310                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 311                     and compilations).
 312     disc_number:    Number of the disc or other physical medium the track belongs to,
 313                     as an integer.
 314     release_year:   Year (YYYY) when the album was released.
 315
 316     Unless mentioned otherwise, the fields should be Unicode strings.
 317
 318     Unless mentioned otherwise, None is equivalent to absence of information.
 319
 320
 321     _type "playlist" indicates multiple videos.
 322     There must be a key "entries", which is a list, an iterable, or a PagedList
 323     object, each element of which is a valid dictionary by this specification.
 324
 325     Additionally, playlists can have "id", "title", "description", "uploader",
 326     "uploader_id", "uploader_url" attributes with the same semantics as videos
 327     (see above).
 328
 329
 330     _type "multi_video" indicates that there are multiple videos that
  331     form a single show, for example multiple acts of an opera or TV episode.
 332     It must have an entries key like a playlist and contain all the keys
 333     required for a video at the same time.
 334
 335
 336     _type "url" indicates that the video must be extracted from another
 337     location, possibly by a different extractor. Its only required key is:
 338     "url" - the next URL to extract.
 339     The key "ie_key" can be set to the class name (minus the trailing "IE",
 340     e.g. "Youtube") if the extractor class is known in advance.
 341     Additionally, the dictionary may have any properties of the resolved entity
 342     known in advance, for example "title" if the title of the referred video is
 343     known ahead of time.
 344
 345
 346     _type "url_transparent" entities have the same specification as "url", but
 347     indicate that the given additional information is more precise than the one
 348     associated with the resolved URL.
 349     This is useful when a site employs a video service that hosts the video and
 350     its technical metadata, but that video service does not embed a useful
 351     title, description etc.
 352
 353
 354     Subclasses of this one should re-define the _real_initialize() and
 355     _real_extract() methods and define a _VALID_URL regexp.
 356     Probably, they should also be added to the list of extractors.
 357
 358     _GEO_BYPASS attribute may be set to False in order to disable
 359     geo restriction bypass mechanisms for a particular extractor.
 360     Though it won't disable explicit geo restriction bypass based on
 361     country code provided with geo_bypass_country.
 362
 363     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 364     countries for this extractor. One of these countries will be used by
 365     geo restriction bypass mechanism right away in order to bypass
 366     geo restriction, of course, if the mechanism is not disabled.
 367
 368     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 369     IP blocks in CIDR notation for this extractor. One of these IP blocks
 370     will be used by geo restriction bypass mechanism similarly
 371     to _GEO_COUNTRIES.
 372
 373     Finally, the _WORKING attribute should be set to False for broken IEs
 374     in order to warn the users and skip the tests.
 375     """
 376
 377     _ready = False
 378     _downloader = None
 379     _x_forwarded_for_ip = None
 380     _GEO_BYPASS = True
 381     _GEO_COUNTRIES = None
 382     _GEO_IP_BLOCKS = None
 383     _WORKING = True
 384
 385     def __init__(self, downloader=None):
 386         """Constructor. Receives an optional downloader."""
 387         self._ready = False
 388         self._x_forwarded_for_ip = None
 389         self.set_downloader(downloader)
 390
 391     @classmethod
 392     def suitable(cls, url):
 393         """Receives a URL and returns True if suitable for this IE."""
 394
 395         # This does not use has/getattr intentionally - we want to know whether
 396         # we have cached the regexp for *this* class, whereas getattr would also
 397         # match the superclass
 398         if '_VALID_URL_RE' not in cls.__dict__:
 399             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 400         return cls._VALID_URL_RE.match(url) is not None
 401
 402     @classmethod
 403     def _match_id(cls, url):
 404         if '_VALID_URL_RE' not in cls.__dict__:
 405             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 406         m = cls._VALID_URL_RE.match(url)
 407         assert m
 408         return compat_str(m.group('id'))
 409
 410     @classmethod
 411     def working(cls):
 412         """Getter method for _WORKING."""
 413         return cls._WORKING
 414
 415     def initialize(self):
 416         """Initializes an instance (authentication, etc)."""
 417         self._initialize_geo_bypass({
 418             'countries': self._GEO_COUNTRIES,
 419             'ip_blocks': self._GEO_IP_BLOCKS,
 420         })
 421         if not self._ready:
 422             self._real_initialize()
 423             self._ready = True
 424
 425     def _initialize_geo_bypass(self, geo_bypass_context):
 426         """
 427         Initialize geo restriction bypass mechanism.
 428
 429         This method is used to initialize geo bypass mechanism based on faking
 430         X-Forwarded-For HTTP header. A random country from provided country list
 431         is selected and a random IP belonging to this country is generated. This
 432         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 433         HTTP requests.
 434
 435         This method will be used for initial geo bypass mechanism initialization
 436         during the instance initialization with _GEO_COUNTRIES and
 437         _GEO_IP_BLOCKS.
 438
 439         You may also manually call it from extractor's code if geo bypass
 440         information is not available beforehand (e.g. obtained during
 441         extraction) or due to some other reason. In this case you should pass
 442         this information in geo bypass context passed as first argument. It may
 443         contain following fields:
 444
 445         countries:  List of geo unrestricted countries (similar
 446                     to _GEO_COUNTRIES)
 447         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 448                     (similar to _GEO_IP_BLOCKS)
 449
 450         """
 451         if not self._x_forwarded_for_ip:
 452
 453             # Geo bypass mechanism is explicitly disabled by user
 454             if not self._downloader.params.get('geo_bypass', True):
 455                 return
 456
 457             if not geo_bypass_context:
 458                 geo_bypass_context = {}
 459
 460             # Backward compatibility: previously _initialize_geo_bypass
 461             # expected a list of countries, some 3rd party code may still use
 462             # it this way
 463             if isinstance(geo_bypass_context, (list, tuple)):
 464                 geo_bypass_context = {
 465                     'countries': geo_bypass_context,
 466                 }
 467
 468             # The whole point of geo bypass mechanism is to fake IP
 469             # as X-Forwarded-For HTTP header based on some IP block or
 470             # country code.
 471
 472             # Path 1: bypassing based on IP block in CIDR notation
 473
 474             # Explicit IP block specified by user, use it right away
 475             # regardless of whether extractor is geo bypassable or not
 476             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 477
 478             # Otherwise use random IP block from geo bypass context but only
 479             # if extractor is known as geo bypassable
 480             if not ip_block:
 481                 ip_blocks = geo_bypass_context.get('ip_blocks')
 482                 if self._GEO_BYPASS and ip_blocks:
 483                     ip_block = random.choice(ip_blocks)
 484
 485             if ip_block:
 486                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 487                 if self._downloader.params.get('verbose', False):
 488                     self._downloader.to_screen(
 489                         '[debug] Using fake IP %s as X-Forwarded-For.'
 490                         % self._x_forwarded_for_ip)
 491                 return
 492
 493             # Path 2: bypassing based on country code
 494
 495             # Explicit country code specified by user, use it right away
 496             # regardless of whether extractor is geo bypassable or not
 497             country = self._downloader.params.get('geo_bypass_country', None)
 498
 499             # Otherwise use random country code from geo bypass context but
 500             # only if extractor is known as geo bypassable
 501             if not country:
 502                 countries = geo_bypass_context.get('countries')
 503                 if self._GEO_BYPASS and countries:
 504                     country = random.choice(countries)
 505
 506             if country:
 507                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 508                 if self._downloader.params.get('verbose', False):
 509                     self._downloader.to_screen(
 510                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 511                         % (self._x_forwarded_for_ip, country.upper()))
 512
 513     def extract(self, url):
 514         """Extracts URL information and returns it in list of dicts."""
 515         try:
 516             for _ in range(2):
 517                 try:
 518                     self.initialize()
 519                     ie_result = self._real_extract(url)
 520                     if self._x_forwarded_for_ip:
 521                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 522                     return ie_result
 523                 except GeoRestrictedError as e:
 524                     if self.__maybe_fake_ip_and_retry(e.countries):
 525                         continue
 526                     raise
 527         except ExtractorError:
 528             raise
 529         except compat_http_client.IncompleteRead as e:
 530             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 531         except (KeyError, StopIteration) as e:
 532             raise ExtractorError('An extractor error has occurred.', cause=e)
 533
 534     def __maybe_fake_ip_and_retry(self, countries):
 535         if (not self._downloader.params.get('geo_bypass_country', None) and
 536                 self._GEO_BYPASS and
 537                 self._downloader.params.get('geo_bypass', True) and
 538                 not self._x_forwarded_for_ip and
 539                 countries):
 540             country_code = random.choice(countries)
 541             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 542             if self._x_forwarded_for_ip:
 543                 self.report_warning(
 544                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 545                     % (self._x_forwarded_for_ip, country_code.upper()))
 546                 return True
 547         return False
 548
 549     def set_downloader(self, downloader):
 550         """Sets the downloader for this IE."""
 551         self._downloader = downloader
 552
 553     def _real_initialize(self):
 554         """Real initialization process. Redefine in subclasses."""
 555         pass
 556
 557     def _real_extract(self, url):
 558         """Real extraction process. Redefine in subclasses."""
 559         pass
 560
 561     @classmethod
 562     def ie_key(cls):
 563         """A string for getting the InfoExtractor with get_info_extractor"""
 564         return compat_str(cls.__name__[:-2])
 565
 566     @property
 567     def IE_NAME(self):
 568         return compat_str(type(self).__name__[:-2])
 569
 570     @staticmethod
 571     def __can_accept_status_code(err, expected_status):
 572         assert isinstance(err, compat_urllib_error.HTTPError)
 573         if expected_status is None:
 574             return False
 575         if isinstance(expected_status, compat_integer_types):
 576             return err.code == expected_status
 577         elif isinstance(expected_status, (list, tuple)):
 578             return err.code in expected_status
 579         elif callable(expected_status):
 580             return expected_status(err.code) is True
 581         else:
 582             assert False
 583
 584     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 585         """
 586         Return the response handle.
 587
 588         See _download_webpage docstring for arguments specification.
 589         """
 590         if note is None:
 591             self.report_download_webpage(video_id)
 592         elif note is not False:
 593             if video_id is None:
 594                 self.to_screen('%s' % (note,))
 595             else:
 596                 self.to_screen('%s: %s' % (video_id, note))
 597
 598         # Some sites check X-Forwarded-For HTTP header in order to figure out
 599         # the origin of the client behind proxy. This allows bypassing geo
 600         # restriction by faking this header's value to IP that belongs to some
 601         # geo unrestricted country. We will do so once we encounter any
 602         # geo restriction error.
 603         if self._x_forwarded_for_ip:
 604             if 'X-Forwarded-For' not in headers:
 605                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 606
 607         if isinstance(url_or_request, compat_urllib_request.Request):
 608             url_or_request = update_Request(
 609                 url_or_request, data=data, headers=headers, query=query)
 610         else:
 611             if query:
 612                 url_or_request = update_url_query(url_or_request, query)
 613             if data is not None or headers:
 614                 url_or_request = sanitized_Request(url_or_request, data, headers)
 615         try:
 616             return self._downloader.urlopen(url_or_request)
 617         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 618             if isinstance(err, compat_urllib_error.HTTPError):
 619                 if self.__can_accept_status_code(err, expected_status):
 620                     # Retain reference to error to prevent file object from
 621                     # being closed before it can be read. Works around the
 622                     # effects of <https://bugs.python.org/issue15002>
 623                     # introduced in Python 3.4.1.
 624                     err.fp._error = err
 625                     return err.fp
 626
 627             if errnote is False:
 628                 return False
 629             if errnote is None:
 630                 errnote = 'Unable to download webpage'
 631
 632             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 633             if fatal:
 634                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 635             else:
 636                 self._downloader.report_warning(errmsg)
 637                 return False
 638
 639     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 640         """
 641         Return a tuple (page content as string, URL handle).
 642
 643         See _download_webpage docstring for arguments specification.
 644         """
 645         # Strip hashes from the URL (#1038)
 646         if isinstance(url_or_request, (compat_str, str)):
 647             url_or_request = url_or_request.partition('#')[0]
 648
 649         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 650         if urlh is False:
 651             assert not fatal
 652             return False
 653         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 654         return (content, urlh)
 655
 656     @staticmethod
 657     def _guess_encoding_from_content(content_type, webpage_bytes):
 658         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 659         if m:
 660             encoding = m.group(1)
 661         else:
 662             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 663                           webpage_bytes[:1024])
 664             if m:
 665                 encoding = m.group(1).decode('ascii')
 666             elif webpage_bytes.startswith(b'\xff\xfe'):
 667                 encoding = 'utf-16'
 668             else:
 669                 encoding = 'utf-8'
 670
 671         return encoding
 672
 673     def __check_blocked(self, content):
 674         first_block = content[:512]
 675         if ('<title>Access to this site is blocked</title>' in content and
 676                 'Websense' in first_block):
 677             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 678             blocked_iframe = self._html_search_regex(
 679                 r'<iframe src="([^"]+)"', content,
 680                 'Websense information URL', default=None)
 681             if blocked_iframe:
 682                 msg += ' Visit %s for more details' % blocked_iframe
 683             raise ExtractorError(msg, expected=True)
 684         if '<title>The URL you requested has been blocked</title>' in first_block:
 685             msg = (
 686                 'Access to this webpage has been blocked by Indian censorship. '
 687                 'Use a VPN or proxy server (with --proxy) to route around it.')
 688             block_msg = self._html_search_regex(
 689                 r'</h1><p>(.*?)</p>',
 690                 content, 'block message', default=None)
 691             if block_msg:
 692                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 693             raise ExtractorError(msg, expected=True)
 694         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 695                 'blocklist.rkn.gov.ru' in content):
 696             raise ExtractorError(
 697                 'Access to this webpage has been blocked by decision of the Russian government. '
 698                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 699                 expected=True)
 700
 701     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 702         content_type = urlh.headers.get('Content-Type', '')
 703         webpage_bytes = urlh.read()
 704         if prefix is not None:
 705             webpage_bytes = prefix + webpage_bytes
 706         if not encoding:
 707             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 708         if self._downloader.params.get('dump_intermediate_pages', False):
 709             self.to_screen('Dumping request to ' + urlh.geturl())
 710             dump = base64.b64encode(webpage_bytes).decode('ascii')
 711             self._downloader.to_screen(dump)
 712         if self._downloader.params.get('write_pages', False):
 713             basen = '%s_%s' % (video_id, urlh.geturl())
 714             if len(basen) > 240:
 715                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 716                 basen = basen[:240 - len(h)] + h
 717             raw_filename = basen + '.dump'
 718             filename = sanitize_filename(raw_filename, restricted=True)
 719             self.to_screen('Saving request to ' + filename)
 720             # Working around MAX_PATH limitation on Windows (see
 721             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 722             if compat_os_name == 'nt':
 723                 absfilepath = os.path.abspath(filename)
 724                 if len(absfilepath) > 259:
 725                     filename = '\\\\?\\' + absfilepath
 726             with open(filename, 'wb') as outf:
 727                 outf.write(webpage_bytes)
 728
 729         try:
 730             content = webpage_bytes.decode(encoding, 'replace')
 731         except LookupError:
 732             content = webpage_bytes.decode('utf-8', 'replace')
 733
 734         self.__check_blocked(content)
 735
 736         return content
 737
 738     def _download_webpage(
 739             self, url_or_request, video_id, note=None, errnote=None,
 740             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 741             headers={}, query={}, expected_status=None):
 742         """
 743         Return the data of the page as a string.
 744
 745         Arguments:
 746         url_or_request -- plain text URL as a string or
 747             a compat_urllib_request.Requestobject
 748         video_id -- Video/playlist/item identifier (string)
 749
 750         Keyword arguments:
 751         note -- note printed before downloading (string)
 752         errnote -- note printed in case of an error (string)
 753         fatal -- flag denoting whether error should be considered fatal,
 754             i.e. whether it should cause ExtractionError to be raised,
 755             otherwise a warning will be reported and extraction continued
 756         tries -- number of tries
 757         timeout -- sleep interval between tries
 758         encoding -- encoding for a page content decoding, guessed automatically
 759             when not explicitly specified
 760         data -- POST data (bytes)
 761         headers -- HTTP headers (dict)
 762         query -- URL query (dict)
 763         expected_status -- allows to accept failed HTTP requests (non 2xx
 764             status code) by explicitly specifying a set of accepted status
 765             codes. Can be any of the following entities:
 766                 - an integer type specifying an exact failed status code to
 767                   accept
 768                 - a list or a tuple of integer types specifying a list of
 769                   failed status codes to accept
 770                 - a callable accepting an actual failed status code and
 771                   returning True if it should be accepted
 772             Note that this argument does not affect success status codes (2xx)
 773             which are always accepted.
 774         """
 775
 776         success = False
 777         try_count = 0
 778         while success is False:
 779             try:
 780                 res = self._download_webpage_handle(
 781                     url_or_request, video_id, note, errnote, fatal,
 782                     encoding=encoding, data=data, headers=headers, query=query,
 783                     expected_status=expected_status)
 784                 success = True
 785             except compat_http_client.IncompleteRead as e:
 786                 try_count += 1
 787                 if try_count >= tries:
 788                     raise e
 789                 self._sleep(timeout, video_id)
 790         if res is False:
 791             return res
 792         else:
 793             content, _ = res
 794             return content
 795
 796     def _download_xml_handle(
 797             self, url_or_request, video_id, note='Downloading XML',
 798             errnote='Unable to download XML', transform_source=None,
 799             fatal=True, encoding=None, data=None, headers={}, query={},
 800             expected_status=None):
 801         """
 802         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
 803
 804         See _download_webpage docstring for arguments specification.
 805         """
 806         res = self._download_webpage_handle(
 807             url_or_request, video_id, note, errnote, fatal=fatal,
 808             encoding=encoding, data=data, headers=headers, query=query,
 809             expected_status=expected_status)
 810         if res is False:
 811             return res
 812         xml_string, urlh = res
 813         return self._parse_xml(
 814             xml_string, video_id, transform_source=transform_source,
 815             fatal=fatal), urlh
 816
 817     def _download_xml(
 818             self, url_or_request, video_id,
 819             note='Downloading XML', errnote='Unable to download XML',
 820             transform_source=None, fatal=True, encoding=None,
 821             data=None, headers={}, query={}, expected_status=None):
 822         """
 823         Return the xml as an xml.etree.ElementTree.Element.
 824
 825         See _download_webpage docstring for arguments specification.
 826         """
 827         res = self._download_xml_handle(
 828             url_or_request, video_id, note=note, errnote=errnote,
 829             transform_source=transform_source, fatal=fatal, encoding=encoding,
 830             data=data, headers=headers, query=query,
 831             expected_status=expected_status)
 832         return res if res is False else res[0]
 833
 834     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 835         if transform_source:
 836             xml_string = transform_source(xml_string)
 837         try:
 838             return compat_etree_fromstring(xml_string.encode('utf-8'))
 839         except compat_xml_parse_error as ve:
 840             errmsg = '%s: Failed to parse XML ' % video_id
 841             if fatal:
 842                 raise ExtractorError(errmsg, cause=ve)
 843             else:
 844                 self.report_warning(errmsg + str(ve))
 845
 846     def _download_json_handle(
 847             self, url_or_request, video_id, note='Downloading JSON metadata',
 848             errnote='Unable to download JSON metadata', transform_source=None,
 849             fatal=True, encoding=None, data=None, headers={}, query={},
 850             expected_status=None):
 851         """
 852         Return a tuple (JSON object, URL handle).
 853
 854         See _download_webpage docstring for arguments specification.
 855         """
 856         res = self._download_webpage_handle(
 857             url_or_request, video_id, note, errnote, fatal=fatal,
 858             encoding=encoding, data=data, headers=headers, query=query,
 859             expected_status=expected_status)
 860         if res is False:
 861             return res
 862         json_string, urlh = res
 863         return self._parse_json(
 864             json_string, video_id, transform_source=transform_source,
 865             fatal=fatal), urlh
 866
 867     def _download_json(
 868             self, url_or_request, video_id, note='Downloading JSON metadata',
 869             errnote='Unable to download JSON metadata', transform_source=None,
 870             fatal=True, encoding=None, data=None, headers={}, query={},
 871             expected_status=None):
 872         """
 873         Return the JSON object as a dict.
 874
 875         See _download_webpage docstring for arguments specification.
 876         """
 877         res = self._download_json_handle(
 878             url_or_request, video_id, note=note, errnote=errnote,
 879             transform_source=transform_source, fatal=fatal, encoding=encoding,
 880             data=data, headers=headers, query=query,
 881             expected_status=expected_status)
 882         return res if res is False else res[0]
 883
 884     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 885         if transform_source:
 886             json_string = transform_source(json_string)
 887         try:
 888             return json.loads(json_string)
 889         except ValueError as ve:
 890             errmsg = '%s: Failed to parse JSON ' % video_id
 891             if fatal:
 892                 raise ExtractorError(errmsg, cause=ve)
 893             else:
 894                 self.report_warning(errmsg + str(ve))
 895
 896     def report_warning(self, msg, video_id=None):
 897         idstr = '' if video_id is None else '%s: ' % video_id
 898         self._downloader.report_warning(
 899             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 900
 901     def to_screen(self, msg):
 902         """Print msg to screen, prefixing it with '[ie_name]'"""
 903         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 904
 905     def report_extraction(self, id_or_name):
 906         """Report information extraction."""
 907         self.to_screen('%s: Extracting information' % id_or_name)
 908
 909     def report_download_webpage(self, video_id):
 910         """Report webpage download."""
 911         self.to_screen('%s: Downloading webpage' % video_id)
 912
 913     def report_age_confirmation(self):
 914         """Report attempt to confirm age."""
 915         self.to_screen('Confirming age')
 916
 917     def report_login(self):
 918         """Report attempt to log in."""
 919         self.to_screen('Logging in')
 920
 921     @staticmethod
 922     def raise_login_required(msg='This video is only available for registered users'):
 923         raise ExtractorError(
 924             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 925             expected=True)
 926
 927     @staticmethod
 928     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 929         raise GeoRestrictedError(msg, countries=countries)
 930
 931     # Methods for following #608
 932     @staticmethod
 933     def url_result(url, ie=None, video_id=None, video_title=None):
 934         """Returns a URL that points to a page that should be processed"""
 935         # TODO: ie should be the class used for getting the info
 936         video_info = {'_type': 'url',
 937                       'url': url,
 938                       'ie_key': ie}
 939         if video_id is not None:
 940             video_info['id'] = video_id
 941         if video_title is not None:
 942             video_info['title'] = video_title
 943         return video_info
 944
 945     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 946         urls = orderedSet(
 947             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 948             for m in matches)
 949         return self.playlist_result(
 950             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 951
 952     @staticmethod
 953     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 954         """Returns a playlist"""
 955         video_info = {'_type': 'playlist',
 956                       'entries': entries}
 957         if playlist_id:
 958             video_info['id'] = playlist_id
 959         if playlist_title:
 960             video_info['title'] = playlist_title
 961         if playlist_description:
 962             video_info['description'] = playlist_description
 963         return video_info
 964
 965     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 966         """
 967         Perform a regex search on the given string, using a single or a list of
 968         patterns returning the first matching group.
 969         In case of failure return a default value or raise a WARNING or a
 970         RegexNotFoundError, depending on fatal, specifying the field name.
 971         """
 972         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 973             mobj = re.search(pattern, string, flags)
 974         else:
 975             for p in pattern:
 976                 mobj = re.search(p, string, flags)
 977                 if mobj:
 978                     break
 979
 980         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 981             _name = '\033[0;34m%s\033[0m' % name
 982         else:
 983             _name = name
 984
 985         if mobj:
 986             if group is None:
 987                 # return the first matching group
 988                 return next(g for g in mobj.groups() if g is not None)
 989             else:
 990                 return mobj.group(group)
 991         elif default is not NO_DEFAULT:
 992             return default
 993         elif fatal:
 994             raise RegexNotFoundError('Unable to extract %s' % _name)
 995         else:
 996             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 997             return None
 998
 999     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1000         """
1001         Like _search_regex, but strips HTML tags and unescapes entities.
1002         """
1003         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1004         if res:
1005             return clean_html(res).strip()
1006         else:
1007             return res
1008
1009     def _get_netrc_login_info(self, netrc_machine=None):
1010         username = None
1011         password = None
1012         netrc_machine = netrc_machine or self._NETRC_MACHINE
1013
1014         if self._downloader.params.get('usenetrc', False):
1015             try:
1016                 info = netrc.netrc().authenticators(netrc_machine)
1017                 if info is not None:
1018                     username = info[0]
1019                     password = info[2]
1020                 else:
1021                     raise netrc.NetrcParseError(
1022                         'No authenticators for %s' % netrc_machine)
1023             except (IOError, netrc.NetrcParseError) as err:
1024                 self._downloader.report_warning(
1025                     'parsing .netrc: %s' % error_to_compat_str(err))
1026
1027         return username, password
1028
1029     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1030         """
1031         Get the login info as (username, password)
1032         First look for the manually specified credentials using username_option
1033         and password_option as keys in params dictionary. If no such credentials
1034         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1035         value.
1036         If there's no info available, return (None, None)
1037         """
1038         if self._downloader is None:
1039             return (None, None)
1040
1041         downloader_params = self._downloader.params
1042
1043         # Attempt to use provided username and password or .netrc data
1044         if downloader_params.get(username_option) is not None:
1045             username = downloader_params[username_option]
1046             password = downloader_params[password_option]
1047         else:
1048             username, password = self._get_netrc_login_info(netrc_machine)
1049
1050         return username, password
1051
1052     def _get_tfa_info(self, note='two-factor verification code'):
1053         """
1054         Get the two-factor authentication info
1055         TODO - asking the user will be required for sms/phone verify
1056         currently just uses the command line option
1057         If there's no info available, return None
1058         """
1059         if self._downloader is None:
1060             return None
1061         downloader_params = self._downloader.params
1062
1063         if downloader_params.get('twofactor') is not None:
1064             return downloader_params['twofactor']
1065
1066         return compat_getpass('Type %s and press [Return]: ' % note)
1067
1068     # Helper functions for extracting OpenGraph info
1069     @staticmethod
1070     def _og_regexes(prop):
1071         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1072         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1073                        % {'prop': re.escape(prop)})
1074         template = r'<meta[^>]+?%s[^>]+?%s'
1075         return [
1076             template % (property_re, content_re),
1077             template % (content_re, property_re),
1078         ]
1079
1080     @staticmethod
1081     def _meta_regex(prop):
1082         return r'''(?isx)<meta
1083                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1084                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1085
1086     def _og_search_property(self, prop, html, name=None, **kargs):
1087         if not isinstance(prop, (list, tuple)):
1088             prop = [prop]
1089         if name is None:
1090             name = 'OpenGraph %s' % prop[0]
1091         og_regexes = []
1092         for p in prop:
1093             og_regexes.extend(self._og_regexes(p))
1094         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1095         if escaped is None:
1096             return None
1097         return unescapeHTML(escaped)
1098
1099     def _og_search_thumbnail(self, html, **kargs):
1100         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1101
1102     def _og_search_description(self, html, **kargs):
1103         return self._og_search_property('description', html, fatal=False, **kargs)
1104
1105     def _og_search_title(self, html, **kargs):
1106         return self._og_search_property('title', html, **kargs)
1107
1108     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1109         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1110         if secure:
1111             regexes = self._og_regexes('video:secure_url') + regexes
1112         return self._html_search_regex(regexes, html, name, **kargs)
1113
1114     def _og_search_url(self, html, **kargs):
1115         return self._og_search_property('url', html, **kargs)
1116
1117     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1118         if not isinstance(name, (list, tuple)):
1119             name = [name]
1120         if display_name is None:
1121             display_name = name[0]
1122         return self._html_search_regex(
1123             [self._meta_regex(n) for n in name],
1124             html, display_name, fatal=fatal, group='content', **kwargs)
1125
1126     def _dc_search_uploader(self, html):
1127         return self._html_search_meta('dc.creator', html, 'uploader')
1128
1129     def _rta_search(self, html):
1130         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1131         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1132                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1133                      html):
1134             return 18
1135         return 0
1136
1137     def _media_rating_search(self, html):
1138         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1139         rating = self._html_search_meta('rating', html)
1140
1141         if not rating:
1142             return None
1143
1144         RATING_TABLE = {
1145             'safe for kids': 0,
1146             'general': 8,
1147             '14 years': 14,
1148             'mature': 17,
1149             'restricted': 19,
1150         }
1151         return RATING_TABLE.get(rating.lower())
1152
1153     def _family_friendly_search(self, html):
1154         # See http://schema.org/VideoObject
1155         family_friendly = self._html_search_meta(
1156             'isFamilyFriendly', html, default=None)
1157
1158         if not family_friendly:
1159             return None
1160
1161         RATING_TABLE = {
1162             '1': 0,
1163             'true': 0,
1164             '0': 18,
1165             'false': 18,
1166         }
1167         return RATING_TABLE.get(family_friendly.lower())
1168
1169     def _twitter_search_player(self, html):
1170         return self._html_search_meta('twitter:player', html,
1171                                       'twitter card player')
1172
1173     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1174         json_ld = self._search_regex(
1175             JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
1176         default = kwargs.get('default', NO_DEFAULT)
1177         if not json_ld:
1178             return default if default is not NO_DEFAULT else {}
1179         # JSON-LD may be malformed and thus `fatal` should be respected.
1180         # At the same time `default` may be passed that assumes `fatal=False`
1181         # for _search_regex. Let's simulate the same behavior here as well.
1182         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1183         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1184
1185     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1186         if isinstance(json_ld, compat_str):
1187             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1188         if not json_ld:
1189             return {}
1190         info = {}
1191         if not isinstance(json_ld, (list, tuple, dict)):
1192             return info
1193         if isinstance(json_ld, dict):
1194             json_ld = [json_ld]
1195
1196         INTERACTION_TYPE_MAP = {
1197             'CommentAction': 'comment',
1198             'AgreeAction': 'like',
1199             'DisagreeAction': 'dislike',
1200             'LikeAction': 'like',
1201             'DislikeAction': 'dislike',
1202             'ListenAction': 'view',
1203             'WatchAction': 'view',
1204             'ViewAction': 'view',
1205         }
1206
1207         def extract_interaction_statistic(e):
1208             interaction_statistic = e.get('interactionStatistic')
1209             if not isinstance(interaction_statistic, list):
1210                 return
1211             for is_e in interaction_statistic:
1212                 if not isinstance(is_e, dict):
1213                     continue
1214                 if is_e.get('@type') != 'InteractionCounter':
1215                     continue
1216                 interaction_type = is_e.get('interactionType')
1217                 if not isinstance(interaction_type, compat_str):
1218                     continue
1219                 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1220                 if interaction_count is None:
1221                     continue
1222                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1223                 if not count_kind:
1224                     continue
1225                 count_key = '%s_count' % count_kind
1226                 if info.get(count_key) is not None:
1227                     continue
1228                 info[count_key] = interaction_count
1229
1230         def extract_video_object(e):
1231             assert e['@type'] == 'VideoObject'
1232             info.update({
1233                 'url': url_or_none(e.get('contentUrl')),
1234                 'title': unescapeHTML(e.get('name')),
1235                 'description': unescapeHTML(e.get('description')),
1236                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1237                 'duration': parse_duration(e.get('duration')),
1238                 'timestamp': unified_timestamp(e.get('uploadDate')),
1239                 'filesize': float_or_none(e.get('contentSize')),
1240                 'tbr': int_or_none(e.get('bitrate')),
1241                 'width': int_or_none(e.get('width')),
1242                 'height': int_or_none(e.get('height')),
1243                 'view_count': int_or_none(e.get('interactionCount')),
1244             })
1245             extract_interaction_statistic(e)
1246
1247         for e in json_ld:
1248             if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1249                 item_type = e.get('@type')
1250                 if expected_type is not None and expected_type != item_type:
1251                     return info
1252                 if item_type in ('TVEpisode', 'Episode'):
1253                     episode_name = unescapeHTML(e.get('name'))
1254                     info.update({
1255                         'episode': episode_name,
1256                         'episode_number': int_or_none(e.get('episodeNumber')),
1257                         'description': unescapeHTML(e.get('description')),
1258                     })
1259                     if not info.get('title') and episode_name:
1260                         info['title'] = episode_name
1261                     part_of_season = e.get('partOfSeason')
1262                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1263                         info.update({
1264                             'season': unescapeHTML(part_of_season.get('name')),
1265                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1266                         })
1267                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1268                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1269                         info['series'] = unescapeHTML(part_of_series.get('name'))
1270                 elif item_type == 'Movie':
1271                     info.update({
1272                         'title': unescapeHTML(e.get('name')),
1273                         'description': unescapeHTML(e.get('description')),
1274                         'duration': parse_duration(e.get('duration')),
1275                         'timestamp': unified_timestamp(e.get('dateCreated')),
1276                     })
1277                 elif item_type in ('Article', 'NewsArticle'):
1278                     info.update({
1279                         'timestamp': parse_iso8601(e.get('datePublished')),
1280                         'title': unescapeHTML(e.get('headline')),
1281                         'description': unescapeHTML(e.get('articleBody')),
1282                     })
1283                 elif item_type == 'VideoObject':
1284                     extract_video_object(e)
1285                     continue
1286                 video = e.get('video')
1287                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1288                     extract_video_object(video)
1289                 break
1290         return dict((k, v) for k, v in info.items() if v is not None)
1291
1292     @staticmethod
1293     def _hidden_inputs(html):
1294         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1295         hidden_inputs = {}
1296         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1297             attrs = extract_attributes(input)
1298             if not input:
1299                 continue
1300             if attrs.get('type') not in ('hidden', 'submit'):
1301                 continue
1302             name = attrs.get('name') or attrs.get('id')
1303             value = attrs.get('value')
1304             if name and value is not None:
1305                 hidden_inputs[name] = value
1306         return hidden_inputs
1307
1308     def _form_hidden_inputs(self, form_id, html):
1309         form = self._search_regex(
1310             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1311             html, '%s form' % form_id, group='form')
1312         return self._hidden_inputs(form)
1313
1314     def _sort_formats(self, formats, field_preference=None):
1315         if not formats:
1316             raise ExtractorError('No video formats found')
1317
1318         for f in formats:
1319             # Automatically determine tbr when missing based on abr and vbr (improves
1320             # formats sorting in some cases)
1321             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1322                 f['tbr'] = f['abr'] + f['vbr']
1323
1324         def _formats_key(f):
1325             # TODO remove the following workaround
1326             from ..utils import determine_ext
1327             if not f.get('ext') and 'url' in f:
1328                 f['ext'] = determine_ext(f['url'])
1329
1330             if isinstance(field_preference, (list, tuple)):
1331                 return tuple(
1332                     f.get(field)
1333                     if f.get(field) is not None
1334                     else ('' if field == 'format_id' else -1)
1335                     for field in field_preference)
1336
1337             preference = f.get('preference')
1338             if preference is None:
1339                 preference = 0
1340                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1341                     preference -= 0.5
1342
1343             protocol = f.get('protocol') or determine_protocol(f)
1344             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1345
1346             if f.get('vcodec') == 'none':  # audio only
1347                 preference -= 50
1348                 if self._downloader.params.get('prefer_free_formats'):
1349                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1350                 else:
1351                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1352                 ext_preference = 0
1353                 try:
1354                     audio_ext_preference = ORDER.index(f['ext'])
1355                 except ValueError:
1356                     audio_ext_preference = -1
1357             else:
1358                 if f.get('acodec') == 'none':  # video only
1359                     preference -= 40
1360                 if self._downloader.params.get('prefer_free_formats'):
1361                     ORDER = ['flv', 'mp4', 'webm']
1362                 else:
1363                     ORDER = ['webm', 'flv', 'mp4']
1364                 try:
1365                     ext_preference = ORDER.index(f['ext'])
1366                 except ValueError:
1367                     ext_preference = -1
1368                 audio_ext_preference = 0
1369
1370             return (
1371                 preference,
1372                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1373                 f.get('quality') if f.get('quality') is not None else -1,
1374                 f.get('tbr') if f.get('tbr') is not None else -1,
1375                 f.get('filesize') if f.get('filesize') is not None else -1,
1376                 f.get('vbr') if f.get('vbr') is not None else -1,
1377                 f.get('height') if f.get('height') is not None else -1,
1378                 f.get('width') if f.get('width') is not None else -1,
1379                 proto_preference,
1380                 ext_preference,
1381                 f.get('abr') if f.get('abr') is not None else -1,
1382                 audio_ext_preference,
1383                 f.get('fps') if f.get('fps') is not None else -1,
1384                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1385                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1386                 f.get('format_id') if f.get('format_id') is not None else '',
1387             )
1388         formats.sort(key=_formats_key)
1389
1390     def _check_formats(self, formats, video_id):
1391         if formats:
1392             formats[:] = filter(
1393                 lambda f: self._is_valid_url(
1394                     f['url'], video_id,
1395                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1396                 formats)
1397
1398     @staticmethod
1399     def _remove_duplicate_formats(formats):
1400         format_urls = set()
1401         unique_formats = []
1402         for f in formats:
1403             if f['url'] not in format_urls:
1404                 format_urls.add(f['url'])
1405                 unique_formats.append(f)
1406         formats[:] = unique_formats
1407
1408     def _is_valid_url(self, url, video_id, item='video', headers={}):
1409         url = self._proto_relative_url(url, scheme='http:')
1410         # For now assume non HTTP(S) URLs always valid
1411         if not (url.startswith('http://') or url.startswith('https://')):
1412             return True
1413         try:
1414             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1415             return True
1416         except ExtractorError as e:
1417             if isinstance(e.cause, compat_urllib_error.URLError):
1418                 self.to_screen(
1419                     '%s: %s URL is invalid, skipping' % (video_id, item))
1420                 return False
1421             raise
1422
1423     def http_scheme(self):
1424         """ Either "http:" or "https:", depending on the user's preferences """
1425         return (
1426             'http:'
1427             if self._downloader.params.get('prefer_insecure', False)
1428             else 'https:')
1429
1430     def _proto_relative_url(self, url, scheme=None):
1431         if url is None:
1432             return url
1433         if url.startswith('//'):
1434             if scheme is None:
1435                 scheme = self.http_scheme()
1436             return scheme + url
1437         else:
1438             return url
1439
1440     def _sleep(self, timeout, video_id, msg_template=None):
1441         if msg_template is None:
1442             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1443         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1444         self.to_screen(msg)
1445         time.sleep(timeout)
1446
1447     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1448                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1449                              fatal=True, m3u8_id=None):
1450         manifest = self._download_xml(
1451             manifest_url, video_id, 'Downloading f4m manifest',
1452             'Unable to download f4m manifest',
1453             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1454             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1455             transform_source=transform_source,
1456             fatal=fatal)
1457
1458         if manifest is False:
1459             return []
1460
1461         return self._parse_f4m_formats(
1462             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1463             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1464
1465     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1466                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1467                            fatal=True, m3u8_id=None):
1468         if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1469             return []
1470
1471         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1472         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1473         if akamai_pv is not None and ';' in akamai_pv.text:
1474             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1475             if playerVerificationChallenge.strip() != '':
1476                 return []
1477
1478         formats = []
1479         manifest_version = '1.0'
1480         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1481         if not media_nodes:
1482             manifest_version = '2.0'
1483             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1484         # Remove unsupported DRM protected media from final formats
1485         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1486         media_nodes = remove_encrypted_media(media_nodes)
1487         if not media_nodes:
1488             return formats
1489
1490         manifest_base_url = get_base_url(manifest)
1491
1492         bootstrap_info = xpath_element(
1493             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1494             'bootstrap info', default=None)
1495
1496         vcodec = None
1497         mime_type = xpath_text(
1498             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1499             'base URL', default=None)
1500         if mime_type and mime_type.startswith('audio/'):
1501             vcodec = 'none'
1502
1503         for i, media_el in enumerate(media_nodes):
1504             tbr = int_or_none(media_el.attrib.get('bitrate'))
1505             width = int_or_none(media_el.attrib.get('width'))
1506             height = int_or_none(media_el.attrib.get('height'))
1507             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1508             # If <bootstrapInfo> is present, the specified f4m is a
1509             # stream-level manifest, and only set-level manifests may refer to
1510             # external resources.  See section 11.4 and section 4 of F4M spec
1511             if bootstrap_info is None:
1512                 media_url = None
1513                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1514                 if manifest_version == '2.0':
1515                     media_url = media_el.attrib.get('href')
1516                 if media_url is None:
1517                     media_url = media_el.attrib.get('url')
1518                 if not media_url:
1519                     continue
1520                 manifest_url = (
1521                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1522                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1523                 # If media_url is itself a f4m manifest do the recursive extraction
1524                 # since bitrates in parent manifest (this one) and media_url manifest
1525                 # may differ leading to inability to resolve the format by requested
1526                 # bitrate in f4m downloader
1527                 ext = determine_ext(manifest_url)
1528                 if ext == 'f4m':
1529                     f4m_formats = self._extract_f4m_formats(
1530                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1531                         transform_source=transform_source, fatal=fatal)
1532                     # Sometimes stream-level manifest contains single media entry that
1533                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1534                     # At the same time parent's media entry in set-level manifest may
1535                     # contain it. We will copy it from parent in such cases.
1536                     if len(f4m_formats) == 1:
1537                         f = f4m_formats[0]
1538                         f.update({
1539                             'tbr': f.get('tbr') or tbr,
1540                             'width': f.get('width') or width,
1541                             'height': f.get('height') or height,
1542                             'format_id': f.get('format_id') if not tbr else format_id,
1543                             'vcodec': vcodec,
1544                         })
1545                     formats.extend(f4m_formats)
1546                     continue
1547                 elif ext == 'm3u8':
1548                     formats.extend(self._extract_m3u8_formats(
1549                         manifest_url, video_id, 'mp4', preference=preference,
1550                         m3u8_id=m3u8_id, fatal=fatal))
1551                     continue
1552             formats.append({
1553                 'format_id': format_id,
1554                 'url': manifest_url,
1555                 'manifest_url': manifest_url,
1556                 'ext': 'flv' if bootstrap_info is not None else None,
1557                 'protocol': 'f4m',
1558                 'tbr': tbr,
1559                 'width': width,
1560                 'height': height,
1561                 'vcodec': vcodec,
1562                 'preference': preference,
1563             })
1564         return formats
1565
1566     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1567         return {
1568             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1569             'url': m3u8_url,
1570             'ext': ext,
1571             'protocol': 'm3u8',
1572             'preference': preference - 100 if preference else -100,
1573             'resolution': 'multiple',
1574             'format_note': 'Quality selection URL',
1575         }
1576
1577     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1578                               entry_protocol='m3u8', preference=None,
1579                               m3u8_id=None, note=None, errnote=None,
1580                               fatal=True, live=False):
1581         res = self._download_webpage_handle(
1582             m3u8_url, video_id,
1583             note=note or 'Downloading m3u8 information',
1584             errnote=errnote or 'Failed to download m3u8 information',
1585             fatal=fatal)
1586
1587         if res is False:
1588             return []
1589
1590         m3u8_doc, urlh = res
1591         m3u8_url = urlh.geturl()
1592
1593         return self._parse_m3u8_formats(
1594             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1595             preference=preference, m3u8_id=m3u8_id, live=live)
1596
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Build a list of format dicts from the text of an m3u8 playlist.

        m3u8_doc       -- playlist contents as a string
        m3u8_url       -- URL of the playlist; relative entry URLs are joined
                          against it and it is stored as 'manifest_url'
        ext            -- value stored as 'ext' of each format
        entry_protocol -- value stored as 'protocol' of each format
        preference     -- value stored as 'preference' of each format
        m3u8_id        -- optional prefix for the generated format ids
        live           -- when true, neither stream name nor bitrate is
                          appended to the format id of variant streams

        Returns an empty list for playlists carrying Adobe Flash Access or
        Apple FairPlay markers, a single format for a media playlist, and
        otherwise one format per rendition with a URI plus one per variant
        stream of the master playlist.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Absolute http(s) URLs are kept as is, anything else is resolved
        # relative to the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211
        # 3. https://github.com/rg3/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # Rendition groups: GROUP-ID -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset to {} after
        # the URI line that follows it has been turned into a format
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Register the rendition in its group and, for VIDEO/AUDIO
            # renditions that have a URI, add a format for it
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Name for the variant stream described by last_stream_inf, or
            # None when neither a NAME nor a VIDEO group is available
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        # Second pass: every non-comment, non-blank line is treated as the URI
        # of a variant stream described by the preceding EXT-X-STREAM-INF tag
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                # (audio and optional video bitrates are read from the URL itself)
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1761
1762     @staticmethod
1763     def _xpath_ns(path, namespace=None):
1764         if not namespace:
1765             return path
1766         out = []
1767         for c in path.split('/'):
1768             if not c or c == '.':
1769                 out.append(c)
1770             else:
1771                 out.append('{%s}%s' % (namespace, c))
1772         return '/'.join(out)
1773
1774     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1775         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1776
1777         if smil is False:
1778             assert not fatal
1779             return []
1780
1781         namespace = self._parse_smil_namespace(smil)
1782
1783         return self._parse_smil_formats(
1784             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1785
1786     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1787         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1788         if smil is False:
1789             return {}
1790         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1791
1792     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1793         return self._download_xml(
1794             smil_url, video_id, 'Downloading SMIL file',
1795             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1796
1797     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1798         namespace = self._parse_smil_namespace(smil)
1799
1800         formats = self._parse_smil_formats(
1801             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1802         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1803
1804         video_id = os.path.splitext(url_basename(smil_url))[0]
1805         title = None
1806         description = None
1807         upload_date = None
1808         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1809             name = meta.attrib.get('name')
1810             content = meta.attrib.get('content')
1811             if not name or not content:
1812                 continue
1813             if not title and name == 'title':
1814                 title = content
1815             elif not description and name in ('description', 'abstract'):
1816                 description = content
1817             elif not upload_date and name == 'date':
1818                 upload_date = unified_strdate(content)
1819
1820         thumbnails = [{
1821             'id': image.get('type'),
1822             'url': image.get('src'),
1823             'width': int_or_none(image.get('width')),
1824             'height': int_or_none(image.get('height')),
1825         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1826
1827         return {
1828             'id': video_id,
1829             'title': title or video_id,
1830             'description': description,
1831             'upload_date': upload_date,
1832             'thumbnails': thumbnails,
1833             'formats': formats,
1834             'subtitles': subtitles,
1835         }
1836
1837     def _parse_smil_namespace(self, smil):
1838         return self._search_regex(
1839             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1840
1841     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1842         base = smil_url
1843         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1844             b = meta.get('base') or meta.get('httpBase')
1845             if b:
1846                 base = b
1847                 break
1848
1849         formats = []
1850         rtmp_count = 0
1851         http_count = 0
1852         m3u8_count = 0
1853
1854         srcs = []
1855         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1856         for medium in media:
1857             src = medium.get('src')
1858             if not src or src in srcs:
1859                 continue
1860             srcs.append(src)
1861
1862             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1863             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1864             width = int_or_none(medium.get('width'))
1865             height = int_or_none(medium.get('height'))
1866             proto = medium.get('proto')
1867             ext = medium.get('ext')
1868             src_ext = determine_ext(src)
1869             streamer = medium.get('streamer') or base
1870
1871             if proto == 'rtmp' or streamer.startswith('rtmp'):
1872                 rtmp_count += 1
1873                 formats.append({
1874                     'url': streamer,
1875                     'play_path': src,
1876                     'ext': 'flv',
1877                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1878                     'tbr': bitrate,
1879                     'filesize': filesize,
1880                     'width': width,
1881                     'height': height,
1882                 })
1883                 if transform_rtmp_url:
1884                     streamer, src = transform_rtmp_url(streamer, src)
1885                     formats[-1].update({
1886                         'url': streamer,
1887                         'play_path': src,
1888                     })
1889                 continue
1890
1891             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1892             src_url = src_url.strip()
1893
1894             if proto == 'm3u8' or src_ext == 'm3u8':
1895                 m3u8_formats = self._extract_m3u8_formats(
1896                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1897                 if len(m3u8_formats) == 1:
1898                     m3u8_count += 1
1899                     m3u8_formats[0].update({
1900                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1901                         'tbr': bitrate,
1902                         'width': width,
1903                         'height': height,
1904                     })
1905                 formats.extend(m3u8_formats)
1906             elif src_ext == 'f4m':
1907                 f4m_url = src_url
1908                 if not f4m_params:
1909                     f4m_params = {
1910                         'hdcore': '3.2.0',
1911                         'plugin': 'flowplayer-3.2.0.1',
1912                     }
1913                 f4m_url += '&' if '?' in f4m_url else '?'
1914                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1915                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1916             elif src_ext == 'mpd':
1917                 formats.extend(self._extract_mpd_formats(
1918                     src_url, video_id, mpd_id='dash', fatal=False))
1919             elif re.search(r'\.ism/[Mm]anifest', src_url):
1920                 formats.extend(self._extract_ism_formats(
1921                     src_url, video_id, ism_id='mss', fatal=False))
1922             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1923                 http_count += 1
1924                 formats.append({
1925                     'url': src_url,
1926                     'ext': ext or src_ext or 'flv',
1927                     'format_id': 'http-%d' % (bitrate or http_count),
1928                     'tbr': bitrate,
1929                     'filesize': filesize,
1930                     'width': width,
1931                     'height': height,
1932                 })
1933
1934         return formats
1935
1936     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1937         urls = []
1938         subtitles = {}
1939         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1940             src = textstream.get('src')
1941             if not src or src in urls:
1942                 continue
1943             urls.append(src)
1944             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1945             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1946             subtitles.setdefault(lang, []).append({
1947                 'url': src,
1948                 'ext': ext,
1949             })
1950         return subtitles
1951
1952     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1953         xspf = self._download_xml(
1954             xspf_url, playlist_id, 'Downloading xpsf playlist',
1955             'Unable to download xspf manifest', fatal=fatal)
1956         if xspf is False:
1957             return []
1958         return self._parse_xspf(
1959             xspf, playlist_id, xspf_url=xspf_url,
1960             xspf_base_url=base_url(xspf_url))
1961
1962     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1963         NS_MAP = {
1964             'xspf': 'http://xspf.org/ns/0/',
1965             's1': 'http://static.streamone.nl/player/ns/0',
1966         }
1967
1968         entries = []
1969         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1970             title = xpath_text(
1971                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1972             description = xpath_text(
1973                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1974             thumbnail = xpath_text(
1975                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1976             duration = float_or_none(
1977                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1978
1979             formats = []
1980             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1981                 format_url = urljoin(xspf_base_url, location.text)
1982                 if not format_url:
1983                     continue
1984                 formats.append({
1985                     'url': format_url,
1986                     'manifest_url': xspf_url,
1987                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1988                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1989                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1990                 })
1991             self._sort_formats(formats)
1992
1993             entries.append({
1994                 'id': playlist_id,
1995                 'title': title,
1996                 'description': description,
1997                 'thumbnail': thumbnail,
1998                 'duration': duration,
1999                 'formats': formats,
2000             })
2001         return entries
2002
2003     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
2004         res = self._download_xml_handle(
2005             mpd_url, video_id,
2006             note=note or 'Downloading MPD manifest',
2007             errnote=errnote or 'Failed to download MPD manifest',
2008             fatal=fatal)
2009         if res is False:
2010             return []
2011         mpd_doc, urlh = res
2012         mpd_base_url = base_url(urlh.geturl())
2013
2014         return self._parse_mpd_formats(
2015             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
2016             formats_dict=formats_dict, mpd_url=mpd_url)
2017
2018     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
2019         """
2020         Parse formats from MPD manifest.
2021         References:
2022          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2023             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2024          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2025         """
2026         if mpd_doc.get('type') == 'dynamic':
2027             return []
2028
2029         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2030
2031         def _add_ns(path):
2032             return self._xpath_ns(path, namespace)
2033
2034         def is_drm_protected(element):
2035             return element.find(_add_ns('ContentProtection')) is not None
2036
2037         def extract_multisegment_info(element, ms_parent_info):
2038             ms_info = ms_parent_info.copy()
2039
2040             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2041             # common attributes and elements.  We will only extract relevant
2042             # for us.
2043             def extract_common(source):
2044                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2045                 if segment_timeline is not None:
2046                     s_e = segment_timeline.findall(_add_ns('S'))
2047                     if s_e:
2048                         ms_info['total_number'] = 0
2049                         ms_info['s'] = []
2050                         for s in s_e:
2051                             r = int(s.get('r', 0))
2052                             ms_info['total_number'] += 1 + r
2053                             ms_info['s'].append({
2054                                 't': int(s.get('t', 0)),
2055                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2056                                 'd': int(s.attrib['d']),
2057                                 'r': r,
2058                             })
2059                 start_number = source.get('startNumber')
2060                 if start_number:
2061                     ms_info['start_number'] = int(start_number)
2062                 timescale = source.get('timescale')
2063                 if timescale:
2064                     ms_info['timescale'] = int(timescale)
2065                 segment_duration = source.get('duration')
2066                 if segment_duration:
2067                     ms_info['segment_duration'] = float(segment_duration)
2068
2069             def extract_Initialization(source):
2070                 initialization = source.find(_add_ns('Initialization'))
2071                 if initialization is not None:
2072                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2073
2074             segment_list = element.find(_add_ns('SegmentList'))
2075             if segment_list is not None:
2076                 extract_common(segment_list)
2077                 extract_Initialization(segment_list)
2078                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2079                 if segment_urls_e:
2080                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2081             else:
2082                 segment_template = element.find(_add_ns('SegmentTemplate'))
2083                 if segment_template is not None:
2084                     extract_common(segment_template)
2085                     media = segment_template.get('media')
2086                     if media:
2087                         ms_info['media'] = media
2088                     initialization = segment_template.get('initialization')
2089                     if initialization:
2090                         ms_info['initialization'] = initialization
2091                     else:
2092                         extract_Initialization(segment_template)
2093             return ms_info
2094
2095         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2096         formats = []
2097         for period in mpd_doc.findall(_add_ns('Period')):
2098             period_duration = parse_duration(period.get('duration')) or mpd_duration
2099             period_ms_info = extract_multisegment_info(period, {
2100                 'start_number': 1,
2101                 'timescale': 1,
2102             })
2103             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2104                 if is_drm_protected(adaptation_set):
2105                     continue
2106                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2107                 for representation in adaptation_set.findall(_add_ns('Representation')):
2108                     if is_drm_protected(representation):
2109                         continue
2110                     representation_attrib = adaptation_set.attrib.copy()
2111                     representation_attrib.update(representation.attrib)
2112                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2113                     mime_type = representation_attrib['mimeType']
2114                     content_type = mime_type.split('/')[0]
2115                     if content_type == 'text':
2116                         # TODO implement WebVTT downloading
2117                         pass
2118                     elif content_type in ('video', 'audio'):
2119                         base_url = ''
2120                         for element in (representation, adaptation_set, period, mpd_doc):
2121                             base_url_e = element.find(_add_ns('BaseURL'))
2122                             if base_url_e is not None:
2123                                 base_url = base_url_e.text + base_url
2124                                 if re.match(r'^https?://', base_url):
2125                                     break
2126                         if mpd_base_url and not re.match(r'^https?://', base_url):
2127                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2128                                 mpd_base_url += '/'
2129                             base_url = mpd_base_url + base_url
2130                         representation_id = representation_attrib.get('id')
2131                         lang = representation_attrib.get('lang')
2132                         url_el = representation.find(_add_ns('BaseURL'))
2133                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2134                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2135                         f = {
2136                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2137                             'url': mpd_url,
2138                             'manifest_url': mpd_url,
2139                             'ext': mimetype2ext(mime_type),
2140                             'width': int_or_none(representation_attrib.get('width')),
2141                             'height': int_or_none(representation_attrib.get('height')),
2142                             'tbr': float_or_none(bandwidth, 1000),
2143                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2144                             'fps': int_or_none(representation_attrib.get('frameRate')),
2145                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2146                             'format_note': 'DASH %s' % content_type,
2147                             'filesize': filesize,
2148                             'container': mimetype2ext(mime_type) + '_dash',
2149                         }
2150                         f.update(parse_codecs(representation_attrib.get('codecs')))
2151                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2152
2153                         def prepare_template(template_name, identifiers):
2154                             tmpl = representation_ms_info[template_name]
2155                             # First of, % characters outside $...$ templates
2156                             # must be escaped by doubling for proper processing
2157                             # by % operator string formatting used further (see
2158                             # https://github.com/rg3/youtube-dl/issues/16867).
2159                             t = ''
2160                             in_template = False
2161                             for c in tmpl:
2162                                 t += c
2163                                 if c == '$':
2164                                     in_template = not in_template
2165                                 elif c == '%' and not in_template:
2166                                     t += c
2167                             # Next, $...$ templates are translated to their
2168                             # %(...) counterparts to be used with % operator
2169                             t = t.replace('$RepresentationID$', representation_id)
2170                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2171                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2172                             t.replace('$$', '$')
2173                             return t
2174
2175                         # @initialization is a regular template like @media one
2176                         # so it should be handled just the same way (see
2177                         # https://github.com/rg3/youtube-dl/issues/11605)
2178                         if 'initialization' in representation_ms_info:
2179                             initialization_template = prepare_template(
2180                                 'initialization',
2181                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2182                                 # $Time$ shall not be included for @initialization thus
2183                                 # only $Bandwidth$ remains
2184                                 ('Bandwidth', ))
2185                             representation_ms_info['initialization_url'] = initialization_template % {
2186                                 'Bandwidth': bandwidth,
2187                             }
2188
2189                         def location_key(location):
2190                             return 'url' if re.match(r'^https?://', location) else 'path'
2191
2192                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2193
2194                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2195                             media_location_key = location_key(media_template)
2196
2197                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2198                             # can't be used at the same time
2199                             if '%(Number' in media_template and 's' not in representation_ms_info:
2200                                 segment_duration = None
2201                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2202                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2203                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2204                                 representation_ms_info['fragments'] = [{
2205                                     media_location_key: media_template % {
2206                                         'Number': segment_number,
2207                                         'Bandwidth': bandwidth,
2208                                     },
2209                                     'duration': segment_duration,
2210                                 } for segment_number in range(
2211                                     representation_ms_info['start_number'],
2212                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2213                             else:
2214                                 # $Number*$ or $Time$ in media template with S list available
2215                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2216                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2217                                 representation_ms_info['fragments'] = []
2218                                 segment_time = 0
2219                                 segment_d = None
2220                                 segment_number = representation_ms_info['start_number']
2221
2222                                 def add_segment_url():
2223                                     segment_url = media_template % {
2224                                         'Time': segment_time,
2225                                         'Bandwidth': bandwidth,
2226                                         'Number': segment_number,
2227                                     }
2228                                     representation_ms_info['fragments'].append({
2229                                         media_location_key: segment_url,
2230                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2231                                     })
2232
2233                                 for num, s in enumerate(representation_ms_info['s']):
2234                                     segment_time = s.get('t') or segment_time
2235                                     segment_d = s['d']
2236                                     add_segment_url()
2237                                     segment_number += 1
2238                                     for r in range(s.get('r', 0)):
2239                                         segment_time += segment_d
2240                                         add_segment_url()
2241                                         segment_number += 1
2242                                     segment_time += segment_d
2243                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2244                             # No media template
2245                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2246                             # or any YouTube dashsegments video
2247                             fragments = []
2248                             segment_index = 0
2249                             timescale = representation_ms_info['timescale']
2250                             for s in representation_ms_info['s']:
2251                                 duration = float_or_none(s['d'], timescale)
2252                                 for r in range(s.get('r', 0) + 1):
2253                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2254                                     fragments.append({
2255                                         location_key(segment_uri): segment_uri,
2256                                         'duration': duration,
2257                                     })
2258                                     segment_index += 1
2259                             representation_ms_info['fragments'] = fragments
2260                         elif 'segment_urls' in representation_ms_info:
2261                             # Segment URLs with no SegmentTimeline
2262                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2263                             # https://github.com/rg3/youtube-dl/pull/14844
2264                             fragments = []
2265                             segment_duration = float_or_none(
2266                                 representation_ms_info['segment_duration'],
2267                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2268                             for segment_url in representation_ms_info['segment_urls']:
2269                                 fragment = {
2270                                     location_key(segment_url): segment_url,
2271                                 }
2272                                 if segment_duration:
2273                                     fragment['duration'] = segment_duration
2274                                 fragments.append(fragment)
2275                             representation_ms_info['fragments'] = fragments
2276                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2277                         # No fragments key is present in this case.
2278                         if 'fragments' in representation_ms_info:
2279                             f.update({
2280                                 'fragment_base_url': base_url,
2281                                 'fragments': [],
2282                                 'protocol': 'http_dash_segments',
2283                             })
2284                             if 'initialization_url' in representation_ms_info:
2285                                 initialization_url = representation_ms_info['initialization_url']
2286                                 if not f.get('url'):
2287                                     f['url'] = initialization_url
2288                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2289                             f['fragments'].extend(representation_ms_info['fragments'])
2290                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2291                         # is not necessarily unique within a Period thus formats with
2292                         # the same `format_id` are quite possible. There are numerous examples
2293                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2294                         # https://github.com/rg3/youtube-dl/issues/13919)
2295                         full_info = formats_dict.get(representation_id, {}).copy()
2296                         full_info.update(f)
2297                         formats.append(full_info)
2298                     else:
2299                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2300         return formats
2301
2302     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2303         res = self._download_xml_handle(
2304             ism_url, video_id,
2305             note=note or 'Downloading ISM manifest',
2306             errnote=errnote or 'Failed to download ISM manifest',
2307             fatal=fatal)
2308         if res is False:
2309             return []
2310         ism_doc, urlh = res
2311
2312         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2313
2314     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2315         """
2316         Parse formats from ISM manifest.
2317         References:
2318          1. [MS-SSTR]: Smooth Streaming Protocol,
2319             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2320         """
2321         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2322             return []
2323
2324         duration = int(ism_doc.attrib['Duration'])
2325         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2326
2327         formats = []
2328         for stream in ism_doc.findall('StreamIndex'):
2329             stream_type = stream.get('Type')
2330             if stream_type not in ('video', 'audio'):
2331                 continue
2332             url_pattern = stream.attrib['Url']
2333             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2334             stream_name = stream.get('Name')
2335             for track in stream.findall('QualityLevel'):
2336                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2337                 # TODO: add support for WVC1 and WMAP
2338                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2339                     self.report_warning('%s is not a supported codec' % fourcc)
2340                     continue
2341                 tbr = int(track.attrib['Bitrate']) // 1000
2342                 # [1] does not mention Width and Height attributes. However,
2343                 # they're often present while MaxWidth and MaxHeight are
2344                 # missing, so should be used as fallbacks
2345                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2346                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2347                 sampling_rate = int_or_none(track.get('SamplingRate'))
2348
2349                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2350                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2351
2352                 fragments = []
2353                 fragment_ctx = {
2354                     'time': 0,
2355                 }
2356                 stream_fragments = stream.findall('c')
2357                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2358                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2359                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2360                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2361                     if not fragment_ctx['duration']:
2362                         try:
2363                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2364                         except IndexError:
2365                             next_fragment_time = duration
2366                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2367                     for _ in range(fragment_repeat):
2368                         fragments.append({
2369                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2370                             'duration': fragment_ctx['duration'] / stream_timescale,
2371                         })
2372                         fragment_ctx['time'] += fragment_ctx['duration']
2373
2374                 format_id = []
2375                 if ism_id:
2376                     format_id.append(ism_id)
2377                 if stream_name:
2378                     format_id.append(stream_name)
2379                 format_id.append(compat_str(tbr))
2380
2381                 formats.append({
2382                     'format_id': '-'.join(format_id),
2383                     'url': ism_url,
2384                     'manifest_url': ism_url,
2385                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2386                     'width': width,
2387                     'height': height,
2388                     'tbr': tbr,
2389                     'asr': sampling_rate,
2390                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2391                     'acodec': 'none' if stream_type == 'video' else fourcc,
2392                     'protocol': 'ism',
2393                     'fragments': fragments,
2394                     '_download_params': {
2395                         'duration': duration,
2396                         'timescale': stream_timescale,
2397                         'width': width or 0,
2398                         'height': height or 0,
2399                         'fourcc': fourcc,
2400                         'codec_private_data': track.get('CodecPrivateData'),
2401                         'sampling_rate': sampling_rate,
2402                         'channels': int_or_none(track.get('Channels', 2)),
2403                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2404                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2405                     },
2406                 })
2407         return formats
2408
2409     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2410         def absolute_url(item_url):
2411             return urljoin(base_url, item_url)
2412
2413         def parse_content_type(content_type):
2414             if not content_type:
2415                 return {}
2416             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2417             if ctr:
2418                 mimetype, codecs = ctr.groups()
2419                 f = parse_codecs(codecs)
2420                 f['ext'] = mimetype2ext(mimetype)
2421                 return f
2422             return {}
2423
2424         def _media_formats(src, cur_media_type, type_info={}):
2425             full_url = absolute_url(src)
2426             ext = type_info.get('ext') or determine_ext(full_url)
2427             if ext == 'm3u8':
2428                 is_plain_url = False
2429                 formats = self._extract_m3u8_formats(
2430                     full_url, video_id, ext='mp4',
2431                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2432                     preference=preference, fatal=False)
2433             elif ext == 'mpd':
2434                 is_plain_url = False
2435                 formats = self._extract_mpd_formats(
2436                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2437             else:
2438                 is_plain_url = True
2439                 formats = [{
2440                     'url': full_url,
2441                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2442                 }]
2443             return is_plain_url, formats
2444
2445         entries = []
2446         # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
2448         # https://www.ampproject.org/docs/reference/components/amp-video)
2449         media_tags = [(media_tag, media_type, '')
2450                       for media_tag, media_type
2451                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2452         media_tags.extend(re.findall(
2453             # We only allow video|audio followed by a whitespace or '>'.
2454             # Allowing more characters may end up in significant slow down (see
2455             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2456             # http://www.porntrex.com/maps/videositemap.xml).
2457             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2458         for media_tag, media_type, media_content in media_tags:
2459             media_info = {
2460                 'formats': [],
2461                 'subtitles': {},
2462             }
2463             media_attributes = extract_attributes(media_tag)
2464             src = media_attributes.get('src')
2465             if src:
2466                 _, formats = _media_formats(src, media_type)
2467                 media_info['formats'].extend(formats)
2468             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2469             if media_content:
2470                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2471                     source_attributes = extract_attributes(source_tag)
2472                     src = source_attributes.get('src')
2473                     if not src:
2474                         continue
2475                     f = parse_content_type(source_attributes.get('type'))
2476                     is_plain_url, formats = _media_formats(src, media_type, f)
2477                     if is_plain_url:
2478                         # res attribute is not standard but seen several times
2479                         # in the wild
2480                         f.update({
2481                             'height': int_or_none(source_attributes.get('res')),
2482                             'format_id': source_attributes.get('label'),
2483                         })
2484                         f.update(formats[0])
2485                         media_info['formats'].append(f)
2486                     else:
2487                         media_info['formats'].extend(formats)
2488                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2489                     track_attributes = extract_attributes(track_tag)
2490                     kind = track_attributes.get('kind')
2491                     if not kind or kind in ('subtitles', 'captions'):
2492                         src = track_attributes.get('src')
2493                         if not src:
2494                             continue
2495                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2496                         media_info['subtitles'].setdefault(lang, []).append({
2497                             'url': absolute_url(src),
2498                         })
2499             for f in media_info['formats']:
2500                 f.setdefault('http_headers', {})['Referer'] = base_url
2501             if media_info['formats'] or media_info['subtitles']:
2502                 entries.append(media_info)
2503         return entries
2504
2505     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2506         formats = []
2507         hdcore_sign = 'hdcore=3.7.0'
2508         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2509         hds_host = hosts.get('hds')
2510         if hds_host:
2511             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2512         if 'hdcore=' not in f4m_url:
2513             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2514         f4m_formats = self._extract_f4m_formats(
2515             f4m_url, video_id, f4m_id='hds', fatal=False)
2516         for entry in f4m_formats:
2517             entry.update({'extra_param_to_segment_url': hdcore_sign})
2518         formats.extend(f4m_formats)
2519         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2520         hls_host = hosts.get('hls')
2521         if hls_host:
2522             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2523         formats.extend(self._extract_m3u8_formats(
2524             m3u8_url, video_id, 'mp4', 'm3u8_native',
2525             m3u8_id='hls', fatal=False))
2526         return formats
2527
2528     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2529         query = compat_urlparse.urlparse(url).query
2530         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2531         mobj = re.search(
2532             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2533         url_base = mobj.group('url')
2534         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2535         formats = []
2536
2537         def manifest_url(manifest):
2538             m_url = '%s/%s' % (http_base_url, manifest)
2539             if query:
2540                 m_url += '?%s' % query
2541             return m_url
2542
2543         if 'm3u8' not in skip_protocols:
2544             formats.extend(self._extract_m3u8_formats(
2545                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2546                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2547         if 'f4m' not in skip_protocols:
2548             formats.extend(self._extract_f4m_formats(
2549                 manifest_url('manifest.f4m'),
2550                 video_id, f4m_id='hds', fatal=False))
2551         if 'dash' not in skip_protocols:
2552             formats.extend(self._extract_mpd_formats(
2553                 manifest_url('manifest.mpd'),
2554                 video_id, mpd_id='dash', fatal=False))
2555         if re.search(r'(?:/smil:|\.smil)', url_base):
2556             if 'smil' not in skip_protocols:
2557                 rtmp_formats = self._extract_smil_formats(
2558                     manifest_url('jwplayer.smil'),
2559                     video_id, fatal=False)
2560                 for rtmp_format in rtmp_formats:
2561                     rtsp_format = rtmp_format.copy()
2562                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2563                     del rtsp_format['play_path']
2564                     del rtsp_format['ext']
2565                     rtsp_format.update({
2566                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2567                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2568                         'protocol': 'rtsp',
2569                     })
2570                     formats.extend([rtmp_format, rtsp_format])
2571         else:
2572             for protocol in ('rtmp', 'rtsp'):
2573                 if protocol not in skip_protocols:
2574                     formats.append({
2575                         'url': '%s:%s' % (protocol, url_base),
2576                         'format_id': protocol,
2577                         'protocol': protocol,
2578                     })
2579         return formats
2580
2581     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2582         mobj = re.search(
2583             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2584             webpage)
2585         if mobj:
2586             try:
2587                 jwplayer_data = self._parse_json(mobj.group('options'),
2588                                                  video_id=video_id,
2589                                                  transform_source=transform_source)
2590             except ExtractorError:
2591                 pass
2592             else:
2593                 if isinstance(jwplayer_data, dict):
2594                     return jwplayer_data
2595
2596     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2597         jwplayer_data = self._find_jwplayer_data(
2598             webpage, video_id, transform_source=js_to_json)
2599         return self._parse_jwplayer_data(
2600             jwplayer_data, video_id, *args, **kwargs)
2601
2602     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2603                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2604         # JWPlayer backward compatibility: flattened playlists
2605         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2606         if 'playlist' not in jwplayer_data:
2607             jwplayer_data = {'playlist': [jwplayer_data]}
2608
2609         entries = []
2610
2611         # JWPlayer backward compatibility: single playlist item
2612         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2613         if not isinstance(jwplayer_data['playlist'], list):
2614             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2615
2616         for video_data in jwplayer_data['playlist']:
2617             # JWPlayer backward compatibility: flattened sources
2618             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2619             if 'sources' not in video_data:
2620                 video_data['sources'] = [video_data]
2621
2622             this_video_id = video_id or video_data['mediaid']
2623
2624             formats = self._parse_jwplayer_formats(
2625                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2626                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2627
2628             subtitles = {}
2629             tracks = video_data.get('tracks')
2630             if tracks and isinstance(tracks, list):
2631                 for track in tracks:
2632                     if not isinstance(track, dict):
2633                         continue
2634                     track_kind = track.get('kind')
2635                     if not track_kind or not isinstance(track_kind, compat_str):
2636                         continue
2637                     if track_kind.lower() not in ('captions', 'subtitles'):
2638                         continue
2639                     track_url = urljoin(base_url, track.get('file'))
2640                     if not track_url:
2641                         continue
2642                     subtitles.setdefault(track.get('label') or 'en', []).append({
2643                         'url': self._proto_relative_url(track_url)
2644                     })
2645
2646             entry = {
2647                 'id': this_video_id,
2648                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2649                 'description': video_data.get('description'),
2650                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
2651                 'timestamp': int_or_none(video_data.get('pubdate')),
2652                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2653                 'subtitles': subtitles,
2654             }
2655             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2656             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2657                 entry.update({
2658                     '_type': 'url_transparent',
2659                     'url': formats[0]['url'],
2660                 })
2661             else:
2662                 self._sort_formats(formats)
2663                 entry['formats'] = formats
2664             entries.append(entry)
2665         if len(entries) == 1:
2666             return entries[0]
2667         else:
2668             return self.playlist_result(entries)
2669
2670     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2671                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2672         urls = []
2673         formats = []
2674         for source in jwplayer_sources_data:
2675             if not isinstance(source, dict):
2676                 continue
2677             source_url = urljoin(
2678                 base_url, self._proto_relative_url(source.get('file')))
2679             if not source_url or source_url in urls:
2680                 continue
2681             urls.append(source_url)
2682             source_type = source.get('type') or ''
2683             ext = mimetype2ext(source_type) or determine_ext(source_url)
2684             if source_type == 'hls' or ext == 'm3u8':
2685                 formats.extend(self._extract_m3u8_formats(
2686                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2687                     m3u8_id=m3u8_id, fatal=False))
2688             elif source_type == 'dash' or ext == 'mpd':
2689                 formats.extend(self._extract_mpd_formats(
2690                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2691             elif ext == 'smil':
2692                 formats.extend(self._extract_smil_formats(
2693                     source_url, video_id, fatal=False))
2694             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2695             elif source_type.startswith('audio') or ext in (
2696                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2697                 formats.append({
2698                     'url': source_url,
2699                     'vcodec': 'none',
2700                     'ext': ext,
2701                 })
2702             else:
2703                 height = int_or_none(source.get('height'))
2704                 if height is None:
2705                     # Often no height is provided but there is a label in
2706                     # format like "1080p", "720p SD", or 1080.
2707                     height = int_or_none(self._search_regex(
2708                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2709                         'height', default=None))
2710                 a_format = {
2711                     'url': source_url,
2712                     'width': int_or_none(source.get('width')),
2713                     'height': height,
2714                     'tbr': int_or_none(source.get('bitrate')),
2715                     'ext': ext,
2716                 }
2717                 if source_url.startswith('rtmp'):
2718                     a_format['ext'] = 'flv'
2719                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2720                     # of jwplayer.flash.swf
2721                     rtmp_url_parts = re.split(
2722                         r'((?:mp4|mp3|flv):)', source_url, 1)
2723                     if len(rtmp_url_parts) == 3:
2724                         rtmp_url, prefix, play_path = rtmp_url_parts
2725                         a_format.update({
2726                             'url': rtmp_url,
2727                             'play_path': prefix + play_path,
2728                         })
2729                     if rtmp_params:
2730                         a_format.update(rtmp_params)
2731                 formats.append(a_format)
2732         return formats
2733
2734     def _live_title(self, name):
2735         """ Generate the title for a live video """
2736         now = datetime.datetime.now()
2737         now_str = now.strftime('%Y-%m-%d %H:%M')
2738         return name + ' ' + now_str
2739
2740     def _int(self, v, name, fatal=False, **kwargs):
2741         res = int_or_none(v, **kwargs)
2742         if 'get_attr' in kwargs:
2743             print(getattr(v, kwargs['get_attr']))
2744         if res is None:
2745             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2746             if fatal:
2747                 raise ExtractorError(msg)
2748             else:
2749                 self._downloader.report_warning(msg)
2750         return res
2751
2752     def _float(self, v, name, fatal=False, **kwargs):
2753         res = float_or_none(v, **kwargs)
2754         if res is None:
2755             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2756             if fatal:
2757                 raise ExtractorError(msg)
2758             else:
2759                 self._downloader.report_warning(msg)
2760         return res
2761
2762     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2763                     path='/', secure=False, discard=False, rest={}, **kwargs):
2764         cookie = compat_cookiejar.Cookie(
2765             0, name, value, port, port is not None, domain, True,
2766             domain.startswith('.'), path, True, secure, expire_time,
2767             discard, None, None, rest)
2768         self._downloader.cookiejar.set_cookie(cookie)
2769
2770     def _get_cookies(self, url):
2771         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2772         req = sanitized_Request(url)
2773         self._downloader.cookiejar.add_cookie_header(req)
2774         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2775
2776     def get_testcases(self, include_onlymatching=False):
2777         t = getattr(self, '_TEST', None)
2778         if t:
2779             assert not hasattr(self, '_TESTS'), \
2780                 '%s has _TEST and _TESTS' % type(self).__name__
2781             tests = [t]
2782         else:
2783             tests = getattr(self, '_TESTS', [])
2784         for t in tests:
2785             if not include_onlymatching and t.get('only_matching', False):
2786                 continue
2787             t['name'] = type(self).__name__[:-len('IE')]
2788             yield t
2789
2790     def is_suitable(self, age_limit):
2791         """ Test whether the extractor is generally suitable for the given
2792         age limit (i.e. pornographic sites are not, all others usually are) """
2793
2794         any_restricted = False
2795         for tc in self.get_testcases(include_onlymatching=False):
2796             if tc.get('playlist', []):
2797                 tc = tc['playlist'][0]
2798             is_restricted = age_restricted(
2799                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2800             if not is_restricted:
2801                 return True
2802             any_restricted = any_restricted or is_restricted
2803         return not any_restricted
2804
2805     def extract_subtitles(self, *args, **kwargs):
2806         if (self._downloader.params.get('writesubtitles', False) or
2807                 self._downloader.params.get('listsubtitles')):
2808             return self._get_subtitles(*args, **kwargs)
2809         return {}
2810
2811     def _get_subtitles(self, *args, **kwargs):
2812         raise NotImplementedError('This method must be implemented by subclasses')
2813
2814     @staticmethod
2815     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2816         """ Merge subtitle items for one language. Items with duplicated URLs
2817         will be dropped. """
2818         list1_urls = set([item['url'] for item in subtitle_list1])
2819         ret = list(subtitle_list1)
2820         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2821         return ret
2822
2823     @classmethod
2824     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2825         """ Merge two subtitle dictionaries, language by language. """
2826         ret = dict(subtitle_dict1)
2827         for lang in subtitle_dict2:
2828             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2829         return ret
2830
2831     def extract_automatic_captions(self, *args, **kwargs):
2832         if (self._downloader.params.get('writeautomaticsub', False) or
2833                 self._downloader.params.get('listsubtitles')):
2834             return self._get_automatic_captions(*args, **kwargs)
2835         return {}
2836
2837     def _get_automatic_captions(self, *args, **kwargs):
2838         raise NotImplementedError('This method must be implemented by subclasses')
2839
2840     def mark_watched(self, *args, **kwargs):
2841         if (self._downloader.params.get('mark_watched', False) and
2842                 (self._get_login_info()[0] is not None or
2843                     self._downloader.params.get('cookiefile') is not None)):
2844             self._mark_watched(*args, **kwargs)
2845
2846     def _mark_watched(self, *args, **kwargs):
2847         raise NotImplementedError('This method must be implemented by subclasses')
2848
2849     def geo_verification_headers(self):
2850         headers = {}
2851         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2852         if geo_verification_proxy:
2853             headers['Ytdl-request-proxy'] = geo_verification_proxy
2854         return headers
2855
2856     def _generic_id(self, url):
2857         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2858
2859     def _generic_title(self, url):
2860         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2861
2862
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.

    Handled URLs look like _SEARCH_KEY<prefix>:<query>, where the prefix is
    empty (one result), a positive number (that many results) or "all"
    (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this search key's URLs"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search URL and fetch the requested number of results"""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Never ask for more than the extractor can deliver
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor"""
        return self._SEARCH_KEY