_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_integer_types,
  23     compat_http_client,
  24     compat_os_name,
  25     compat_str,
  26     compat_urllib_error,
  27     compat_urllib_parse_unquote,
  28     compat_urllib_parse_urlencode,
  29     compat_urllib_request,
  30     compat_urlparse,
  31     compat_xml_parse_error,
  32 )
  33 from ..downloader.f4m import (
  34     get_base_url,
  35     remove_encrypted_media,
  36 )
  37 from ..utils import (
  38     NO_DEFAULT,
  39     age_restricted,
  40     base_url,
  41     bug_reports_message,
  42     clean_html,
  43     compiled_regex_type,
  44     determine_ext,
  45     determine_protocol,
  46     error_to_compat_str,
  47     ExtractorError,
  48     extract_attributes,
  49     fix_xml_ampersands,
  50     float_or_none,
  51     GeoRestrictedError,
  52     GeoUtils,
  53     int_or_none,
  54     js_to_json,
  55     JSON_LD_RE,
  56     mimetype2ext,
  57     orderedSet,
  58     parse_codecs,
  59     parse_duration,
  60     parse_iso8601,
  61     parse_m3u8_attributes,
  62     RegexNotFoundError,
  63     sanitized_Request,
  64     sanitize_filename,
  65     unescapeHTML,
  66     unified_strdate,
  67     unified_timestamp,
  68     update_Request,
  69     update_url_query,
  70     urljoin,
  71     url_basename,
  72     url_or_none,
  73     xpath_element,
  74     xpath_text,
  75     xpath_with_ns,
  76 )
  77
  78
  79 class InfoExtractor(object):
  80     """Information Extractor class.
  81
  82     Information extractors are the classes that, given a URL, extract
  83     information about the video (or videos) the URL refers to. This
  84     information includes the real video URL, the video title, author and
  85     others. The information is stored in a dictionary which is then
  86     passed to the YoutubeDL. The YoutubeDL processes this
  87     information possibly downloading the video to the file system, among
  88     other possible outcomes.
  89
  90     The type field determines the type of the result.
  91     By far the most common value (and the default if _type is missing) is
  92     "video", which indicates a single video.
  93
  94     For a video, the dictionaries must include the following fields:
  95
  96     id:             Video identifier.
  97     title:          Video title, unescaped.
  98
  99     Additionally, it must contain either a formats entry or a url one:
 100
 101     formats:        A list of dictionaries for each format available, ordered
 102                     from worst to best quality.
 103
 104                     Potential fields:
 105                     * url        Mandatory. The URL of the video file
 106                     * manifest_url
 107                                  The URL of the manifest file in case of
 108                                  fragmented media (DASH, hls, hds)
 109                     * ext        Will be calculated from URL if missing
 110                     * format     A human-readable description of the format
 111                                  ("mp4 container with h264/opus").
  112                                  Calculated from the format_id, width, height
 113                                  and format_note fields if missing.
 114                     * format_id  A short description of the format
 115                                  ("mp4_h264_opus" or "19").
 116                                 Technically optional, but strongly recommended.
 117                     * format_note Additional info about the format
 118                                  ("3D" or "DASH video")
 119                     * width      Width of the video, if known
 120                     * height     Height of the video, if known
 121                     * resolution Textual description of width and height
 122                     * tbr        Average bitrate of audio and video in KBit/s
 123                     * abr        Average audio bitrate in KBit/s
 124                     * acodec     Name of the audio codec in use
 125                     * asr        Audio sampling rate in Hertz
 126                     * vbr        Average video bitrate in KBit/s
 127                     * fps        Frame rate
 128                     * vcodec     Name of the video codec in use
 129                     * container  Name of the container format
 130                     * filesize   The number of bytes, if known in advance
 131                     * filesize_approx  An estimate for the number of bytes
 132                     * player_url SWF Player URL (used for rtmpdump).
 133                     * protocol   The protocol that will be used for the actual
 134                                  download, lower-case.
 135                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 136                                  "m3u8", "m3u8_native" or "http_dash_segments".
 137                     * fragment_base_url
 138                                  Base URL for fragments. Each fragment's path
 139                                  value (if present) will be relative to
 140                                  this URL.
 141                     * fragments  A list of fragments of a fragmented media.
 142                                  Each fragment entry must contain either an url
 143                                  or a path. If an url is present it should be
 144                                  considered by a client. Otherwise both path and
 145                                  fragment_base_url must be present. Here is
 146                                  the list of all potential fields:
 147                                  * "url" - fragment's URL
 148                                  * "path" - fragment's path relative to
 149                                             fragment_base_url
 150                                  * "duration" (optional, int or float)
 151                                  * "filesize" (optional, int)
 152                     * preference Order number of this format. If this field is
 153                                  present and not None, the formats get sorted
 154                                  by this field, regardless of all other values.
 155                                  -1 for default (order by other properties),
 156                                  -2 or smaller for less than default.
 157                                  < -1000 to hide the format (if there is
 158                                     another one which is strictly better)
 159                     * language   Language code, e.g. "de" or "en-US".
 160                     * language_preference  Is this in the language mentioned in
 161                                  the URL?
 162                                  10 if it's what the URL is about,
 163                                  -1 for default (don't know),
 164                                  -10 otherwise, other values reserved for now.
 165                     * quality    Order number of the video quality of this
 166                                  format, irrespective of the file format.
 167                                  -1 for default (order by other properties),
 168                                  -2 or smaller for less than default.
 169                     * source_preference  Order number for this video source
 170                                   (quality takes higher priority)
 171                                  -1 for default (order by other properties),
 172                                  -2 or smaller for less than default.
 173                     * http_headers  A dictionary of additional HTTP headers
 174                                  to add to the request.
 175                     * stretched_ratio  If given and not 1, indicates that the
 176                                  video's pixels are not square.
 177                                  width : height ratio as float.
 178                     * no_resume  The server does not support resuming the
 179                                  (HTTP or RTMP) download. Boolean.
 180                     * downloader_options  A dictionary of downloader options as
 181                                  described in FileDownloader
 182
 183     url:            Final video URL.
 184     ext:            Video filename extension.
 185     format:         The video format, defaults to ext (used for --get-format)
 186     player_url:     SWF Player URL (used for rtmpdump).
 187
 188     The following fields are optional:
 189
 190     alt_title:      A secondary title of the video.
 191     display_id      An alternative identifier for the video, not necessarily
 192                     unique, but available before title. Typically, id is
 193                     something like "4234987", title "Dancing naked mole rats",
 194                     and display_id "dancing-naked-mole-rats"
 195     thumbnails:     A list of dictionaries, with the following entries:
 196                         * "id" (optional, string) - Thumbnail format ID
 197                         * "url"
 198                         * "preference" (optional, int) - quality of the image
 199                         * "width" (optional, int)
 200                         * "height" (optional, int)
  201                         * "resolution" (optional, string "{width}x{height}",
 202                                         deprecated)
 203                         * "filesize" (optional, int)
 204     thumbnail:      Full URL to a video thumbnail image.
 205     description:    Full video description.
 206     uploader:       Full name of the video uploader.
 207     license:        License name the video is licensed under.
 208     creator:        The creator of the video.
 209     release_date:   The date (YYYYMMDD) when the video was released.
 210     timestamp:      UNIX timestamp of the moment the video became available.
 211     upload_date:    Video upload date (YYYYMMDD).
 212                     If not explicitly set, calculated from timestamp.
 213     uploader_id:    Nickname or id of the video uploader.
 214     uploader_url:   Full URL to a personal webpage of the video uploader.
 215     channel:        Full name of the channel the video is uploaded on.
 216                     Note that channel fields may or may not repeat uploader
 217                     fields. This depends on a particular extractor.
 218     channel_id:     Id of the channel.
 219     channel_url:    Full URL to a channel webpage.
 220     location:       Physical location where the video was filmed.
 221     subtitles:      The available subtitles as a dictionary in the format
 222                     {tag: subformats}. "tag" is usually a language code, and
 223                     "subformats" is a list sorted from lower to higher
 224                     preference, each element is a dictionary with the "ext"
 225                     entry and one of:
 226                         * "data": The subtitles file contents
 227                         * "url": A URL pointing to the subtitles file
 228                     "ext" will be calculated from URL if missing
 229     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 230                     automatically generated captions
 231     duration:       Length of the video in seconds, as an integer or float.
 232     view_count:     How many users have watched the video on the platform.
 233     like_count:     Number of positive ratings of the video
 234     dislike_count:  Number of negative ratings of the video
 235     repost_count:   Number of reposts of the video
  236     average_rating: Average rating given by users, the scale used depends on the webpage
 237     comment_count:  Number of comments on the video
 238     comments:       A list of comments, each with one or more of the following
 239                     properties (all but one of text or html optional):
 240                         * "author" - human-readable name of the comment author
 241                         * "author_id" - user ID of the comment author
 242                         * "id" - Comment ID
 243                         * "html" - Comment as HTML
 244                         * "text" - Plain text of the comment
 245                         * "timestamp" - UNIX timestamp of comment
 246                         * "parent" - ID of the comment this one is replying to.
 247                                      Set to "root" to indicate that this is a
 248                                      comment to the original video.
 249     age_limit:      Age restriction for the video, as an integer (years)
 250     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 251                     should allow to get the same result again. (It will be set
 252                     by YoutubeDL if it's missing)
 253     categories:     A list of categories that the video falls in, for example
 254                     ["Sports", "Berlin"]
 255     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 256     is_live:        True, False, or None (=unknown). Whether this video is a
 257                     live stream that goes on instead of a fixed-length video.
 258     start_time:     Time in seconds where the reproduction should start, as
 259                     specified in the URL.
 260     end_time:       Time in seconds where the reproduction should end, as
 261                     specified in the URL.
 262     chapters:       A list of dictionaries, with the following entries:
 263                         * "start_time" - The start time of the chapter in seconds
 264                         * "end_time" - The end time of the chapter in seconds
 265                         * "title" (optional, string)
 266
 267     The following fields should only be used when the video belongs to some logical
 268     chapter or section:
 269
 270     chapter:        Name or title of the chapter the video belongs to.
 271     chapter_number: Number of the chapter the video belongs to, as an integer.
 272     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 273
 274     The following fields should only be used when the video is an episode of some
 275     series, programme or podcast:
 276
 277     series:         Title of the series or programme the video episode belongs to.
 278     season:         Title of the season the video episode belongs to.
 279     season_number:  Number of the season the video episode belongs to, as an integer.
 280     season_id:      Id of the season the video episode belongs to, as a unicode string.
 281     episode:        Title of the video episode. Unlike mandatory video title field,
 282                     this field should denote the exact title of the video episode
 283                     without any kind of decoration.
 284     episode_number: Number of the video episode within a season, as an integer.
 285     episode_id:     Id of the video episode, as a unicode string.
 286
 287     The following fields should only be used when the media is a track or a part of
 288     a music album:
 289
 290     track:          Title of the track.
 291     track_number:   Number of the track within an album or a disc, as an integer.
 292     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 293                     as a unicode string.
 294     artist:         Artist(s) of the track.
 295     genre:          Genre(s) of the track.
 296     album:          Title of the album the track belongs to.
 297     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 298     album_artist:   List of all artists appeared on the album (e.g.
 299                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 300                     and compilations).
 301     disc_number:    Number of the disc or other physical medium the track belongs to,
 302                     as an integer.
 303     release_year:   Year (YYYY) when the album was released.
 304
 305     Unless mentioned otherwise, the fields should be Unicode strings.
 306
 307     Unless mentioned otherwise, None is equivalent to absence of information.
 308
 309
 310     _type "playlist" indicates multiple videos.
 311     There must be a key "entries", which is a list, an iterable, or a PagedList
 312     object, each element of which is a valid dictionary by this specification.
 313
 314     Additionally, playlists can have "id", "title", "description", "uploader",
 315     "uploader_id", "uploader_url" attributes with the same semantics as videos
 316     (see above).
 317
 318
 319     _type "multi_video" indicates that there are multiple videos that
  320     form a single show, for example multiple acts of an opera or TV episode.
 321     It must have an entries key like a playlist and contain all the keys
 322     required for a video at the same time.
 323
 324
 325     _type "url" indicates that the video must be extracted from another
 326     location, possibly by a different extractor. Its only required key is:
 327     "url" - the next URL to extract.
 328     The key "ie_key" can be set to the class name (minus the trailing "IE",
 329     e.g. "Youtube") if the extractor class is known in advance.
 330     Additionally, the dictionary may have any properties of the resolved entity
 331     known in advance, for example "title" if the title of the referred video is
 332     known ahead of time.
 333
 334
 335     _type "url_transparent" entities have the same specification as "url", but
 336     indicate that the given additional information is more precise than the one
 337     associated with the resolved URL.
 338     This is useful when a site employs a video service that hosts the video and
 339     its technical metadata, but that video service does not embed a useful
 340     title, description etc.
 341
 342
 343     Subclasses of this one should re-define the _real_initialize() and
 344     _real_extract() methods and define a _VALID_URL regexp.
 345     Probably, they should also be added to the list of extractors.
 346
 347     _GEO_BYPASS attribute may be set to False in order to disable
 348     geo restriction bypass mechanisms for a particular extractor.
 349     Though it won't disable explicit geo restriction bypass based on
 350     country code provided with geo_bypass_country.
 351
 352     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 353     countries for this extractor. One of these countries will be used by
 354     geo restriction bypass mechanism right away in order to bypass
 355     geo restriction, of course, if the mechanism is not disabled.
 356
 357     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 358     IP blocks in CIDR notation for this extractor. One of these IP blocks
 359     will be used by geo restriction bypass mechanism similarly
 360     to _GEO_COUNTRIES.
 361
 362     Finally, the _WORKING attribute should be set to False for broken IEs
 363     in order to warn the users and skip the tests.
 364     """
 365
 366     _ready = False
 367     _downloader = None
 368     _x_forwarded_for_ip = None
 369     _GEO_BYPASS = True
 370     _GEO_COUNTRIES = None
 371     _GEO_IP_BLOCKS = None
 372     _WORKING = True
 373
 374     def __init__(self, downloader=None):
 375         """Constructor. Receives an optional downloader."""
 376         self._ready = False
 377         self._x_forwarded_for_ip = None
 378         self.set_downloader(downloader)
 379
 380     @classmethod
 381     def suitable(cls, url):
 382         """Receives a URL and returns True if suitable for this IE."""
 383
 384         # This does not use has/getattr intentionally - we want to know whether
 385         # we have cached the regexp for *this* class, whereas getattr would also
 386         # match the superclass
 387         if '_VALID_URL_RE' not in cls.__dict__:
 388             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 389         return cls._VALID_URL_RE.match(url) is not None
 390
 391     @classmethod
 392     def _match_id(cls, url):
 393         if '_VALID_URL_RE' not in cls.__dict__:
 394             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 395         m = cls._VALID_URL_RE.match(url)
 396         assert m
 397         return compat_str(m.group('id'))
 398
 399     @classmethod
 400     def working(cls):
 401         """Getter method for _WORKING."""
 402         return cls._WORKING
 403
 404     def initialize(self):
 405         """Initializes an instance (authentication, etc)."""
 406         self._initialize_geo_bypass({
 407             'countries': self._GEO_COUNTRIES,
 408             'ip_blocks': self._GEO_IP_BLOCKS,
 409         })
 410         if not self._ready:
 411             self._real_initialize()
 412             self._ready = True
 413
 414     def _initialize_geo_bypass(self, geo_bypass_context):
 415         """
 416         Initialize geo restriction bypass mechanism.
 417
 418         This method is used to initialize geo bypass mechanism based on faking
 419         X-Forwarded-For HTTP header. A random country from provided country list
 420         is selected and a random IP belonging to this country is generated. This
 421         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 422         HTTP requests.
 423
 424         This method will be used for initial geo bypass mechanism initialization
 425         during the instance initialization with _GEO_COUNTRIES and
 426         _GEO_IP_BLOCKS.
 427
 428         You may also manually call it from extractor's code if geo bypass
 429         information is not available beforehand (e.g. obtained during
 430         extraction) or due to some other reason. In this case you should pass
 431         this information in geo bypass context passed as first argument. It may
 432         contain following fields:
 433
 434         countries:  List of geo unrestricted countries (similar
 435                     to _GEO_COUNTRIES)
 436         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 437                     (similar to _GEO_IP_BLOCKS)
 438
 439         """
 440         if not self._x_forwarded_for_ip:
 441
 442             # Geo bypass mechanism is explicitly disabled by user
 443             if not self._downloader.params.get('geo_bypass', True):
 444                 return
 445
 446             if not geo_bypass_context:
 447                 geo_bypass_context = {}
 448
 449             # Backward compatibility: previously _initialize_geo_bypass
 450             # expected a list of countries, some 3rd party code may still use
 451             # it this way
 452             if isinstance(geo_bypass_context, (list, tuple)):
 453                 geo_bypass_context = {
 454                     'countries': geo_bypass_context,
 455                 }
 456
 457             # The whole point of geo bypass mechanism is to fake IP
 458             # as X-Forwarded-For HTTP header based on some IP block or
 459             # country code.
 460
 461             # Path 1: bypassing based on IP block in CIDR notation
 462
 463             # Explicit IP block specified by user, use it right away
 464             # regardless of whether extractor is geo bypassable or not
 465             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 466
 467             # Otherwise use random IP block from geo bypass context but only
 468             # if extractor is known as geo bypassable
 469             if not ip_block:
 470                 ip_blocks = geo_bypass_context.get('ip_blocks')
 471                 if self._GEO_BYPASS and ip_blocks:
 472                     ip_block = random.choice(ip_blocks)
 473
 474             if ip_block:
 475                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 476                 if self._downloader.params.get('verbose', False):
 477                     self._downloader.to_screen(
 478                         '[debug] Using fake IP %s as X-Forwarded-For.'
 479                         % self._x_forwarded_for_ip)
 480                 return
 481
 482             # Path 2: bypassing based on country code
 483
 484             # Explicit country code specified by user, use it right away
 485             # regardless of whether extractor is geo bypassable or not
 486             country = self._downloader.params.get('geo_bypass_country', None)
 487
 488             # Otherwise use random country code from geo bypass context but
 489             # only if extractor is known as geo bypassable
 490             if not country:
 491                 countries = geo_bypass_context.get('countries')
 492                 if self._GEO_BYPASS and countries:
 493                     country = random.choice(countries)
 494
 495             if country:
 496                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 497                 if self._downloader.params.get('verbose', False):
 498                     self._downloader.to_screen(
 499                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 500                         % (self._x_forwarded_for_ip, country.upper()))
 501
 502     def extract(self, url):
 503         """Extracts URL information and returns it in list of dicts."""
 504         try:
 505             for _ in range(2):
 506                 try:
 507                     self.initialize()
 508                     ie_result = self._real_extract(url)
 509                     if self._x_forwarded_for_ip:
 510                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 511                     return ie_result
 512                 except GeoRestrictedError as e:
 513                     if self.__maybe_fake_ip_and_retry(e.countries):
 514                         continue
 515                     raise
 516         except ExtractorError:
 517             raise
 518         except compat_http_client.IncompleteRead as e:
 519             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 520         except (KeyError, StopIteration) as e:
 521             raise ExtractorError('An extractor error has occurred.', cause=e)
 522
 523     def __maybe_fake_ip_and_retry(self, countries):
 524         if (not self._downloader.params.get('geo_bypass_country', None) and
 525                 self._GEO_BYPASS and
 526                 self._downloader.params.get('geo_bypass', True) and
 527                 not self._x_forwarded_for_ip and
 528                 countries):
 529             country_code = random.choice(countries)
 530             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 531             if self._x_forwarded_for_ip:
 532                 self.report_warning(
 533                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 534                     % (self._x_forwarded_for_ip, country_code.upper()))
 535                 return True
 536         return False
 537
 538     def set_downloader(self, downloader):
 539         """Sets the downloader for this IE."""
 540         self._downloader = downloader
 541
 542     def _real_initialize(self):
 543         """Real initialization process. Redefine in subclasses."""
 544         pass
 545
 546     def _real_extract(self, url):
 547         """Real extraction process. Redefine in subclasses."""
 548         pass
 549
 550     @classmethod
 551     def ie_key(cls):
 552         """A string for getting the InfoExtractor with get_info_extractor"""
 553         return compat_str(cls.__name__[:-2])
 554
 555     @property
 556     def IE_NAME(self):
 557         return compat_str(type(self).__name__[:-2])
 558
 559     @staticmethod
 560     def __can_accept_status_code(err, expected_status):
 561         assert isinstance(err, compat_urllib_error.HTTPError)
 562         if expected_status is None:
 563             return False
 564         if isinstance(expected_status, compat_integer_types):
 565             return err.code == expected_status
 566         elif isinstance(expected_status, (list, tuple)):
 567             return err.code in expected_status
 568         elif callable(expected_status):
 569             return expected_status(err.code) is True
 570         else:
 571             assert False
 572
 573     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 574         """
 575         Return the response handle.
 576
 577         See _download_webpage docstring for arguments specification.
 578         """
 579         if note is None:
 580             self.report_download_webpage(video_id)
 581         elif note is not False:
 582             if video_id is None:
 583                 self.to_screen('%s' % (note,))
 584             else:
 585                 self.to_screen('%s: %s' % (video_id, note))
 586
 587         # Some sites check X-Forwarded-For HTTP header in order to figure out
 588         # the origin of the client behind proxy. This allows bypassing geo
 589         # restriction by faking this header's value to IP that belongs to some
 590         # geo unrestricted country. We will do so once we encounter any
 591         # geo restriction error.
 592         if self._x_forwarded_for_ip:
 593             if 'X-Forwarded-For' not in headers:
 594                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 595
 596         if isinstance(url_or_request, compat_urllib_request.Request):
 597             url_or_request = update_Request(
 598                 url_or_request, data=data, headers=headers, query=query)
 599         else:
 600             if query:
 601                 url_or_request = update_url_query(url_or_request, query)
 602             if data is not None or headers:
 603                 url_or_request = sanitized_Request(url_or_request, data, headers)
 604         try:
 605             return self._downloader.urlopen(url_or_request)
 606         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 607             if isinstance(err, compat_urllib_error.HTTPError):
 608                 if self.__can_accept_status_code(err, expected_status):
 609                     # Retain reference to error to prevent file object from
 610                     # being closed before it can be read. Works around the
 611                     # effects of <https://bugs.python.org/issue15002>
 612                     # introduced in Python 3.4.1.
 613                     err.fp._error = err
 614                     return err.fp
 615
 616             if errnote is False:
 617                 return False
 618             if errnote is None:
 619                 errnote = 'Unable to download webpage'
 620
 621             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 622             if fatal:
 623                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 624             else:
 625                 self._downloader.report_warning(errmsg)
 626                 return False
 627
 628     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 629         """
 630         Return a tuple (page content as string, URL handle).
 631
 632         See _download_webpage docstring for arguments specification.
 633         """
 634         # Strip hashes from the URL (#1038)
 635         if isinstance(url_or_request, (compat_str, str)):
 636             url_or_request = url_or_request.partition('#')[0]
 637
 638         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 639         if urlh is False:
 640             assert not fatal
 641             return False
 642         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 643         return (content, urlh)
 644
 645     @staticmethod
 646     def _guess_encoding_from_content(content_type, webpage_bytes):
 647         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 648         if m:
 649             encoding = m.group(1)
 650         else:
 651             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 652                           webpage_bytes[:1024])
 653             if m:
 654                 encoding = m.group(1).decode('ascii')
 655             elif webpage_bytes.startswith(b'\xff\xfe'):
 656                 encoding = 'utf-16'
 657             else:
 658                 encoding = 'utf-8'
 659
 660         return encoding
 661
 662     def __check_blocked(self, content):
 663         first_block = content[:512]
 664         if ('<title>Access to this site is blocked</title>' in content and
 665                 'Websense' in first_block):
 666             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 667             blocked_iframe = self._html_search_regex(
 668                 r'<iframe src="([^"]+)"', content,
 669                 'Websense information URL', default=None)
 670             if blocked_iframe:
 671                 msg += ' Visit %s for more details' % blocked_iframe
 672             raise ExtractorError(msg, expected=True)
 673         if '<title>The URL you requested has been blocked</title>' in first_block:
 674             msg = (
 675                 'Access to this webpage has been blocked by Indian censorship. '
 676                 'Use a VPN or proxy server (with --proxy) to route around it.')
 677             block_msg = self._html_search_regex(
 678                 r'</h1><p>(.*?)</p>',
 679                 content, 'block message', default=None)
 680             if block_msg:
 681                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 682             raise ExtractorError(msg, expected=True)
 683         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 684                 'blocklist.rkn.gov.ru' in content):
 685             raise ExtractorError(
 686                 'Access to this webpage has been blocked by decision of the Russian government. '
 687                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 688                 expected=True)
 689
 690     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 691         content_type = urlh.headers.get('Content-Type', '')
 692         webpage_bytes = urlh.read()
 693         if prefix is not None:
 694             webpage_bytes = prefix + webpage_bytes
 695         if not encoding:
 696             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 697         if self._downloader.params.get('dump_intermediate_pages', False):
 698             self.to_screen('Dumping request to ' + urlh.geturl())
 699             dump = base64.b64encode(webpage_bytes).decode('ascii')
 700             self._downloader.to_screen(dump)
 701         if self._downloader.params.get('write_pages', False):
 702             basen = '%s_%s' % (video_id, urlh.geturl())
 703             if len(basen) > 240:
 704                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 705                 basen = basen[:240 - len(h)] + h
 706             raw_filename = basen + '.dump'
 707             filename = sanitize_filename(raw_filename, restricted=True)
 708             self.to_screen('Saving request to ' + filename)
 709             # Working around MAX_PATH limitation on Windows (see
 710             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 711             if compat_os_name == 'nt':
 712                 absfilepath = os.path.abspath(filename)
 713                 if len(absfilepath) > 259:
 714                     filename = '\\\\?\\' + absfilepath
 715             with open(filename, 'wb') as outf:
 716                 outf.write(webpage_bytes)
 717
 718         try:
 719             content = webpage_bytes.decode(encoding, 'replace')
 720         except LookupError:
 721             content = webpage_bytes.decode('utf-8', 'replace')
 722
 723         self.__check_blocked(content)
 724
 725         return content
 726
 727     def _download_webpage(
 728             self, url_or_request, video_id, note=None, errnote=None,
 729             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 730             headers={}, query={}, expected_status=None):
 731         """
 732         Return the data of the page as a string.
 733
 734         Arguments:
 735         url_or_request -- plain text URL as a string or
 736             a compat_urllib_request.Requestobject
 737         video_id -- Video/playlist/item identifier (string)
 738
 739         Keyword arguments:
 740         note -- note printed before downloading (string)
 741         errnote -- note printed in case of an error (string)
 742         fatal -- flag denoting whether error should be considered fatal,
 743             i.e. whether it should cause ExtractionError to be raised,
 744             otherwise a warning will be reported and extraction continued
 745         tries -- number of tries
 746         timeout -- sleep interval between tries
 747         encoding -- encoding for a page content decoding, guessed automatically
 748             when not explicitly specified
 749         data -- POST data (bytes)
 750         headers -- HTTP headers (dict)
 751         query -- URL query (dict)
 752         expected_status -- allows to accept failed HTTP requests (non 2xx
 753             status code) by explicitly specifying a set of accepted status
 754             codes. Can be any of the following entities:
 755                 - an integer type specifying an exact failed status code to
 756                   accept
 757                 - a list or a tuple of integer types specifying a list of
 758                   failed status codes to accept
 759                 - a callable accepting an actual failed status code and
 760                   returning True if it should be accepted
 761             Note that this argument does not affect success status codes (2xx)
 762             which are always accepted.
 763         """
 764
 765         success = False
 766         try_count = 0
 767         while success is False:
 768             try:
 769                 res = self._download_webpage_handle(
 770                     url_or_request, video_id, note, errnote, fatal,
 771                     encoding=encoding, data=data, headers=headers, query=query,
 772                     expected_status=expected_status)
 773                 success = True
 774             except compat_http_client.IncompleteRead as e:
 775                 try_count += 1
 776                 if try_count >= tries:
 777                     raise e
 778                 self._sleep(timeout, video_id)
 779         if res is False:
 780             return res
 781         else:
 782             content, _ = res
 783             return content
 784
 785     def _download_xml_handle(
 786             self, url_or_request, video_id, note='Downloading XML',
 787             errnote='Unable to download XML', transform_source=None,
 788             fatal=True, encoding=None, data=None, headers={}, query={},
 789             expected_status=None):
 790         """
 791         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
 792
 793         See _download_webpage docstring for arguments specification.
 794         """
 795         res = self._download_webpage_handle(
 796             url_or_request, video_id, note, errnote, fatal=fatal,
 797             encoding=encoding, data=data, headers=headers, query=query,
 798             expected_status=expected_status)
 799         if res is False:
 800             return res
 801         xml_string, urlh = res
 802         return self._parse_xml(
 803             xml_string, video_id, transform_source=transform_source,
 804             fatal=fatal), urlh
 805
 806     def _download_xml(
 807             self, url_or_request, video_id,
 808             note='Downloading XML', errnote='Unable to download XML',
 809             transform_source=None, fatal=True, encoding=None,
 810             data=None, headers={}, query={}, expected_status=None):
 811         """
 812         Return the xml as an xml.etree.ElementTree.Element.
 813
 814         See _download_webpage docstring for arguments specification.
 815         """
 816         res = self._download_xml_handle(
 817             url_or_request, video_id, note=note, errnote=errnote,
 818             transform_source=transform_source, fatal=fatal, encoding=encoding,
 819             data=data, headers=headers, query=query,
 820             expected_status=expected_status)
 821         return res if res is False else res[0]
 822
 823     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 824         if transform_source:
 825             xml_string = transform_source(xml_string)
 826         try:
 827             return compat_etree_fromstring(xml_string.encode('utf-8'))
 828         except compat_xml_parse_error as ve:
 829             errmsg = '%s: Failed to parse XML ' % video_id
 830             if fatal:
 831                 raise ExtractorError(errmsg, cause=ve)
 832             else:
 833                 self.report_warning(errmsg + str(ve))
 834
 835     def _download_json_handle(
 836             self, url_or_request, video_id, note='Downloading JSON metadata',
 837             errnote='Unable to download JSON metadata', transform_source=None,
 838             fatal=True, encoding=None, data=None, headers={}, query={},
 839             expected_status=None):
 840         """
 841         Return a tuple (JSON object, URL handle).
 842
 843         See _download_webpage docstring for arguments specification.
 844         """
 845         res = self._download_webpage_handle(
 846             url_or_request, video_id, note, errnote, fatal=fatal,
 847             encoding=encoding, data=data, headers=headers, query=query,
 848             expected_status=expected_status)
 849         if res is False:
 850             return res
 851         json_string, urlh = res
 852         return self._parse_json(
 853             json_string, video_id, transform_source=transform_source,
 854             fatal=fatal), urlh
 855
 856     def _download_json(
 857             self, url_or_request, video_id, note='Downloading JSON metadata',
 858             errnote='Unable to download JSON metadata', transform_source=None,
 859             fatal=True, encoding=None, data=None, headers={}, query={},
 860             expected_status=None):
 861         """
 862         Return the JSON object as a dict.
 863
 864         See _download_webpage docstring for arguments specification.
 865         """
 866         res = self._download_json_handle(
 867             url_or_request, video_id, note=note, errnote=errnote,
 868             transform_source=transform_source, fatal=fatal, encoding=encoding,
 869             data=data, headers=headers, query=query,
 870             expected_status=expected_status)
 871         return res if res is False else res[0]
 872
 873     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 874         if transform_source:
 875             json_string = transform_source(json_string)
 876         try:
 877             return json.loads(json_string)
 878         except ValueError as ve:
 879             errmsg = '%s: Failed to parse JSON ' % video_id
 880             if fatal:
 881                 raise ExtractorError(errmsg, cause=ve)
 882             else:
 883                 self.report_warning(errmsg + str(ve))
 884
 885     def report_warning(self, msg, video_id=None):
 886         idstr = '' if video_id is None else '%s: ' % video_id
 887         self._downloader.report_warning(
 888             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 889
 890     def to_screen(self, msg):
 891         """Print msg to screen, prefixing it with '[ie_name]'"""
 892         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 893
 894     def report_extraction(self, id_or_name):
 895         """Report information extraction."""
 896         self.to_screen('%s: Extracting information' % id_or_name)
 897
 898     def report_download_webpage(self, video_id):
 899         """Report webpage download."""
 900         self.to_screen('%s: Downloading webpage' % video_id)
 901
 902     def report_age_confirmation(self):
 903         """Report attempt to confirm age."""
 904         self.to_screen('Confirming age')
 905
 906     def report_login(self):
 907         """Report attempt to log in."""
 908         self.to_screen('Logging in')
 909
 910     @staticmethod
 911     def raise_login_required(msg='This video is only available for registered users'):
 912         raise ExtractorError(
 913             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 914             expected=True)
 915
 916     @staticmethod
 917     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 918         raise GeoRestrictedError(msg, countries=countries)
 919
 920     # Methods for following #608
 921     @staticmethod
 922     def url_result(url, ie=None, video_id=None, video_title=None):
 923         """Returns a URL that points to a page that should be processed"""
 924         # TODO: ie should be the class used for getting the info
 925         video_info = {'_type': 'url',
 926                       'url': url,
 927                       'ie_key': ie}
 928         if video_id is not None:
 929             video_info['id'] = video_id
 930         if video_title is not None:
 931             video_info['title'] = video_title
 932         return video_info
 933
 934     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 935         urls = orderedSet(
 936             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 937             for m in matches)
 938         return self.playlist_result(
 939             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 940
 941     @staticmethod
 942     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 943         """Returns a playlist"""
 944         video_info = {'_type': 'playlist',
 945                       'entries': entries}
 946         if playlist_id:
 947             video_info['id'] = playlist_id
 948         if playlist_title:
 949             video_info['title'] = playlist_title
 950         if playlist_description:
 951             video_info['description'] = playlist_description
 952         return video_info
 953
 954     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 955         """
 956         Perform a regex search on the given string, using a single or a list of
 957         patterns returning the first matching group.
 958         In case of failure return a default value or raise a WARNING or a
 959         RegexNotFoundError, depending on fatal, specifying the field name.
 960         """
 961         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 962             mobj = re.search(pattern, string, flags)
 963         else:
 964             for p in pattern:
 965                 mobj = re.search(p, string, flags)
 966                 if mobj:
 967                     break
 968
 969         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 970             _name = '\033[0;34m%s\033[0m' % name
 971         else:
 972             _name = name
 973
 974         if mobj:
 975             if group is None:
 976                 # return the first matching group
 977                 return next(g for g in mobj.groups() if g is not None)
 978             else:
 979                 return mobj.group(group)
 980         elif default is not NO_DEFAULT:
 981             return default
 982         elif fatal:
 983             raise RegexNotFoundError('Unable to extract %s' % _name)
 984         else:
 985             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 986             return None
 987
 988     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 989         """
 990         Like _search_regex, but strips HTML tags and unescapes entities.
 991         """
 992         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 993         if res:
 994             return clean_html(res).strip()
 995         else:
 996             return res
 997
 998     def _get_netrc_login_info(self, netrc_machine=None):
 999         username = None
1000         password = None
1001         netrc_machine = netrc_machine or self._NETRC_MACHINE
1002
1003         if self._downloader.params.get('usenetrc', False):
1004             try:
1005                 info = netrc.netrc().authenticators(netrc_machine)
1006                 if info is not None:
1007                     username = info[0]
1008                     password = info[2]
1009                 else:
1010                     raise netrc.NetrcParseError(
1011                         'No authenticators for %s' % netrc_machine)
1012             except (IOError, netrc.NetrcParseError) as err:
1013                 self._downloader.report_warning(
1014                     'parsing .netrc: %s' % error_to_compat_str(err))
1015
1016         return username, password
1017
1018     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1019         """
1020         Get the login info as (username, password)
1021         First look for the manually specified credentials using username_option
1022         and password_option as keys in params dictionary. If no such credentials
1023         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1024         value.
1025         If there's no info available, return (None, None)
1026         """
1027         if self._downloader is None:
1028             return (None, None)
1029
1030         downloader_params = self._downloader.params
1031
1032         # Attempt to use provided username and password or .netrc data
1033         if downloader_params.get(username_option) is not None:
1034             username = downloader_params[username_option]
1035             password = downloader_params[password_option]
1036         else:
1037             username, password = self._get_netrc_login_info(netrc_machine)
1038
1039         return username, password
1040
1041     def _get_tfa_info(self, note='two-factor verification code'):
1042         """
1043         Get the two-factor authentication info
1044         TODO - asking the user will be required for sms/phone verify
1045         currently just uses the command line option
1046         If there's no info available, return None
1047         """
1048         if self._downloader is None:
1049             return None
1050         downloader_params = self._downloader.params
1051
1052         if downloader_params.get('twofactor') is not None:
1053             return downloader_params['twofactor']
1054
1055         return compat_getpass('Type %s and press [Return]: ' % note)
1056
1057     # Helper functions for extracting OpenGraph info
1058     @staticmethod
1059     def _og_regexes(prop):
1060         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1061         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
1062                        % {'prop': re.escape(prop)})
1063         template = r'<meta[^>]+?%s[^>]+?%s'
1064         return [
1065             template % (property_re, content_re),
1066             template % (content_re, property_re),
1067         ]
1068
1069     @staticmethod
1070     def _meta_regex(prop):
1071         return r'''(?isx)<meta
1072                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1073                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1074
1075     def _og_search_property(self, prop, html, name=None, **kargs):
1076         if not isinstance(prop, (list, tuple)):
1077             prop = [prop]
1078         if name is None:
1079             name = 'OpenGraph %s' % prop[0]
1080         og_regexes = []
1081         for p in prop:
1082             og_regexes.extend(self._og_regexes(p))
1083         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1084         if escaped is None:
1085             return None
1086         return unescapeHTML(escaped)
1087
1088     def _og_search_thumbnail(self, html, **kargs):
1089         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1090
1091     def _og_search_description(self, html, **kargs):
1092         return self._og_search_property('description', html, fatal=False, **kargs)
1093
1094     def _og_search_title(self, html, **kargs):
1095         return self._og_search_property('title', html, **kargs)
1096
1097     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1098         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1099         if secure:
1100             regexes = self._og_regexes('video:secure_url') + regexes
1101         return self._html_search_regex(regexes, html, name, **kargs)
1102
1103     def _og_search_url(self, html, **kargs):
1104         return self._og_search_property('url', html, **kargs)
1105
1106     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1107         if not isinstance(name, (list, tuple)):
1108             name = [name]
1109         if display_name is None:
1110             display_name = name[0]
1111         return self._html_search_regex(
1112             [self._meta_regex(n) for n in name],
1113             html, display_name, fatal=fatal, group='content', **kwargs)
1114
1115     def _dc_search_uploader(self, html):
1116         return self._html_search_meta('dc.creator', html, 'uploader')
1117
1118     def _rta_search(self, html):
1119         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1120         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1121                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1122                      html):
1123             return 18
1124         return 0
1125
1126     def _media_rating_search(self, html):
1127         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1128         rating = self._html_search_meta('rating', html)
1129
1130         if not rating:
1131             return None
1132
1133         RATING_TABLE = {
1134             'safe for kids': 0,
1135             'general': 8,
1136             '14 years': 14,
1137             'mature': 17,
1138             'restricted': 19,
1139         }
1140         return RATING_TABLE.get(rating.lower())
1141
1142     def _family_friendly_search(self, html):
1143         # See http://schema.org/VideoObject
1144         family_friendly = self._html_search_meta(
1145             'isFamilyFriendly', html, default=None)
1146
1147         if not family_friendly:
1148             return None
1149
1150         RATING_TABLE = {
1151             '1': 0,
1152             'true': 0,
1153             '0': 18,
1154             'false': 18,
1155         }
1156         return RATING_TABLE.get(family_friendly.lower())
1157
1158     def _twitter_search_player(self, html):
1159         return self._html_search_meta('twitter:player', html,
1160                                       'twitter card player')
1161
1162     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1163         json_ld = self._search_regex(
1164             JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
1165         default = kwargs.get('default', NO_DEFAULT)
1166         if not json_ld:
1167             return default if default is not NO_DEFAULT else {}
1168         # JSON-LD may be malformed and thus `fatal` should be respected.
1169         # At the same time `default` may be passed that assumes `fatal=False`
1170         # for _search_regex. Let's simulate the same behavior here as well.
1171         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1172         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1173
1174     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1175         if isinstance(json_ld, compat_str):
1176             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1177         if not json_ld:
1178             return {}
1179         info = {}
1180         if not isinstance(json_ld, (list, tuple, dict)):
1181             return info
1182         if isinstance(json_ld, dict):
1183             json_ld = [json_ld]
1184
1185         INTERACTION_TYPE_MAP = {
1186             'CommentAction': 'comment',
1187             'AgreeAction': 'like',
1188             'DisagreeAction': 'dislike',
1189             'LikeAction': 'like',
1190             'DislikeAction': 'dislike',
1191             'ListenAction': 'view',
1192             'WatchAction': 'view',
1193             'ViewAction': 'view',
1194         }
1195
1196         def extract_interaction_statistic(e):
1197             interaction_statistic = e.get('interactionStatistic')
1198             if not isinstance(interaction_statistic, list):
1199                 return
1200             for is_e in interaction_statistic:
1201                 if not isinstance(is_e, dict):
1202                     continue
1203                 if is_e.get('@type') != 'InteractionCounter':
1204                     continue
1205                 interaction_type = is_e.get('interactionType')
1206                 if not isinstance(interaction_type, compat_str):
1207                     continue
1208                 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1209                 if interaction_count is None:
1210                     continue
1211                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1212                 if not count_kind:
1213                     continue
1214                 count_key = '%s_count' % count_kind
1215                 if info.get(count_key) is not None:
1216                     continue
1217                 info[count_key] = interaction_count
1218
1219         def extract_video_object(e):
1220             assert e['@type'] == 'VideoObject'
1221             info.update({
1222                 'url': url_or_none(e.get('contentUrl')),
1223                 'title': unescapeHTML(e.get('name')),
1224                 'description': unescapeHTML(e.get('description')),
1225                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1226                 'duration': parse_duration(e.get('duration')),
1227                 'timestamp': unified_timestamp(e.get('uploadDate')),
1228                 'filesize': float_or_none(e.get('contentSize')),
1229                 'tbr': int_or_none(e.get('bitrate')),
1230                 'width': int_or_none(e.get('width')),
1231                 'height': int_or_none(e.get('height')),
1232                 'view_count': int_or_none(e.get('interactionCount')),
1233             })
1234             extract_interaction_statistic(e)
1235
1236         for e in json_ld:
1237             if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1238                 item_type = e.get('@type')
1239                 if expected_type is not None and expected_type != item_type:
1240                     return info
1241                 if item_type in ('TVEpisode', 'Episode'):
1242                     episode_name = unescapeHTML(e.get('name'))
1243                     info.update({
1244                         'episode': episode_name,
1245                         'episode_number': int_or_none(e.get('episodeNumber')),
1246                         'description': unescapeHTML(e.get('description')),
1247                     })
1248                     if not info.get('title') and episode_name:
1249                         info['title'] = episode_name
1250                     part_of_season = e.get('partOfSeason')
1251                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1252                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1253                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1254                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1255                         info['series'] = unescapeHTML(part_of_series.get('name'))
1256                 elif item_type == 'Movie':
1257                     info.update({
1258                         'title': unescapeHTML(e.get('name')),
1259                         'description': unescapeHTML(e.get('description')),
1260                         'duration': parse_duration(e.get('duration')),
1261                         'timestamp': unified_timestamp(e.get('dateCreated')),
1262                     })
1263                 elif item_type in ('Article', 'NewsArticle'):
1264                     info.update({
1265                         'timestamp': parse_iso8601(e.get('datePublished')),
1266                         'title': unescapeHTML(e.get('headline')),
1267                         'description': unescapeHTML(e.get('articleBody')),
1268                     })
1269                 elif item_type == 'VideoObject':
1270                     extract_video_object(e)
1271                     continue
1272                 video = e.get('video')
1273                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1274                     extract_video_object(video)
1275                 break
1276         return dict((k, v) for k, v in info.items() if v is not None)
1277
1278     @staticmethod
1279     def _hidden_inputs(html):
1280         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1281         hidden_inputs = {}
1282         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1283             attrs = extract_attributes(input)
1284             if not input:
1285                 continue
1286             if attrs.get('type') not in ('hidden', 'submit'):
1287                 continue
1288             name = attrs.get('name') or attrs.get('id')
1289             value = attrs.get('value')
1290             if name and value is not None:
1291                 hidden_inputs[name] = value
1292         return hidden_inputs
1293
1294     def _form_hidden_inputs(self, form_id, html):
1295         form = self._search_regex(
1296             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1297             html, '%s form' % form_id, group='form')
1298         return self._hidden_inputs(form)
1299
1300     def _sort_formats(self, formats, field_preference=None):
1301         if not formats:
1302             raise ExtractorError('No video formats found')
1303
1304         for f in formats:
1305             # Automatically determine tbr when missing based on abr and vbr (improves
1306             # formats sorting in some cases)
1307             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1308                 f['tbr'] = f['abr'] + f['vbr']
1309
1310         def _formats_key(f):
1311             # TODO remove the following workaround
1312             from ..utils import determine_ext
1313             if not f.get('ext') and 'url' in f:
1314                 f['ext'] = determine_ext(f['url'])
1315
1316             if isinstance(field_preference, (list, tuple)):
1317                 return tuple(
1318                     f.get(field)
1319                     if f.get(field) is not None
1320                     else ('' if field == 'format_id' else -1)
1321                     for field in field_preference)
1322
1323             preference = f.get('preference')
1324             if preference is None:
1325                 preference = 0
1326                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1327                     preference -= 0.5
1328
1329             protocol = f.get('protocol') or determine_protocol(f)
1330             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1331
1332             if f.get('vcodec') == 'none':  # audio only
1333                 preference -= 50
1334                 if self._downloader.params.get('prefer_free_formats'):
1335                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1336                 else:
1337                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1338                 ext_preference = 0
1339                 try:
1340                     audio_ext_preference = ORDER.index(f['ext'])
1341                 except ValueError:
1342                     audio_ext_preference = -1
1343             else:
1344                 if f.get('acodec') == 'none':  # video only
1345                     preference -= 40
1346                 if self._downloader.params.get('prefer_free_formats'):
1347                     ORDER = ['flv', 'mp4', 'webm']
1348                 else:
1349                     ORDER = ['webm', 'flv', 'mp4']
1350                 try:
1351                     ext_preference = ORDER.index(f['ext'])
1352                 except ValueError:
1353                     ext_preference = -1
1354                 audio_ext_preference = 0
1355
1356             return (
1357                 preference,
1358                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1359                 f.get('quality') if f.get('quality') is not None else -1,
1360                 f.get('tbr') if f.get('tbr') is not None else -1,
1361                 f.get('filesize') if f.get('filesize') is not None else -1,
1362                 f.get('vbr') if f.get('vbr') is not None else -1,
1363                 f.get('height') if f.get('height') is not None else -1,
1364                 f.get('width') if f.get('width') is not None else -1,
1365                 proto_preference,
1366                 ext_preference,
1367                 f.get('abr') if f.get('abr') is not None else -1,
1368                 audio_ext_preference,
1369                 f.get('fps') if f.get('fps') is not None else -1,
1370                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1371                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1372                 f.get('format_id') if f.get('format_id') is not None else '',
1373             )
1374         formats.sort(key=_formats_key)
1375
1376     def _check_formats(self, formats, video_id):
1377         if formats:
1378             formats[:] = filter(
1379                 lambda f: self._is_valid_url(
1380                     f['url'], video_id,
1381                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1382                 formats)
1383
1384     @staticmethod
1385     def _remove_duplicate_formats(formats):
1386         format_urls = set()
1387         unique_formats = []
1388         for f in formats:
1389             if f['url'] not in format_urls:
1390                 format_urls.add(f['url'])
1391                 unique_formats.append(f)
1392         formats[:] = unique_formats
1393
1394     def _is_valid_url(self, url, video_id, item='video', headers={}):
1395         url = self._proto_relative_url(url, scheme='http:')
1396         # For now assume non HTTP(S) URLs always valid
1397         if not (url.startswith('http://') or url.startswith('https://')):
1398             return True
1399         try:
1400             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1401             return True
1402         except ExtractorError as e:
1403             if isinstance(e.cause, compat_urllib_error.URLError):
1404                 self.to_screen(
1405                     '%s: %s URL is invalid, skipping' % (video_id, item))
1406                 return False
1407             raise
1408
1409     def http_scheme(self):
1410         """ Either "http:" or "https:", depending on the user's preferences """
1411         return (
1412             'http:'
1413             if self._downloader.params.get('prefer_insecure', False)
1414             else 'https:')
1415
1416     def _proto_relative_url(self, url, scheme=None):
1417         if url is None:
1418             return url
1419         if url.startswith('//'):
1420             if scheme is None:
1421                 scheme = self.http_scheme()
1422             return scheme + url
1423         else:
1424             return url
1425
1426     def _sleep(self, timeout, video_id, msg_template=None):
1427         if msg_template is None:
1428             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1429         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1430         self.to_screen(msg)
1431         time.sleep(timeout)
1432
1433     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1434                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1435                              fatal=True, m3u8_id=None):
1436         manifest = self._download_xml(
1437             manifest_url, video_id, 'Downloading f4m manifest',
1438             'Unable to download f4m manifest',
1439             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1440             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1441             transform_source=transform_source,
1442             fatal=fatal)
1443
1444         if manifest is False:
1445             return []
1446
1447         return self._parse_f4m_formats(
1448             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1449             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1450
1451     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1452                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1453                            fatal=True, m3u8_id=None):
1454         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1455         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1456         if akamai_pv is not None and ';' in akamai_pv.text:
1457             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1458             if playerVerificationChallenge.strip() != '':
1459                 return []
1460
1461         formats = []
1462         manifest_version = '1.0'
1463         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1464         if not media_nodes:
1465             manifest_version = '2.0'
1466             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1467         # Remove unsupported DRM protected media from final formats
1468         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1469         media_nodes = remove_encrypted_media(media_nodes)
1470         if not media_nodes:
1471             return formats
1472
1473         manifest_base_url = get_base_url(manifest)
1474
1475         bootstrap_info = xpath_element(
1476             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1477             'bootstrap info', default=None)
1478
1479         vcodec = None
1480         mime_type = xpath_text(
1481             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1482             'base URL', default=None)
1483         if mime_type and mime_type.startswith('audio/'):
1484             vcodec = 'none'
1485
1486         for i, media_el in enumerate(media_nodes):
1487             tbr = int_or_none(media_el.attrib.get('bitrate'))
1488             width = int_or_none(media_el.attrib.get('width'))
1489             height = int_or_none(media_el.attrib.get('height'))
1490             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1491             # If <bootstrapInfo> is present, the specified f4m is a
1492             # stream-level manifest, and only set-level manifests may refer to
1493             # external resources.  See section 11.4 and section 4 of F4M spec
1494             if bootstrap_info is None:
1495                 media_url = None
1496                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1497                 if manifest_version == '2.0':
1498                     media_url = media_el.attrib.get('href')
1499                 if media_url is None:
1500                     media_url = media_el.attrib.get('url')
1501                 if not media_url:
1502                     continue
1503                 manifest_url = (
1504                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1505                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1506                 # If media_url is itself a f4m manifest do the recursive extraction
1507                 # since bitrates in parent manifest (this one) and media_url manifest
1508                 # may differ leading to inability to resolve the format by requested
1509                 # bitrate in f4m downloader
1510                 ext = determine_ext(manifest_url)
1511                 if ext == 'f4m':
1512                     f4m_formats = self._extract_f4m_formats(
1513                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1514                         transform_source=transform_source, fatal=fatal)
1515                     # Sometimes stream-level manifest contains single media entry that
1516                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1517                     # At the same time parent's media entry in set-level manifest may
1518                     # contain it. We will copy it from parent in such cases.
1519                     if len(f4m_formats) == 1:
1520                         f = f4m_formats[0]
1521                         f.update({
1522                             'tbr': f.get('tbr') or tbr,
1523                             'width': f.get('width') or width,
1524                             'height': f.get('height') or height,
1525                             'format_id': f.get('format_id') if not tbr else format_id,
1526                             'vcodec': vcodec,
1527                         })
1528                     formats.extend(f4m_formats)
1529                     continue
1530                 elif ext == 'm3u8':
1531                     formats.extend(self._extract_m3u8_formats(
1532                         manifest_url, video_id, 'mp4', preference=preference,
1533                         m3u8_id=m3u8_id, fatal=fatal))
1534                     continue
1535             formats.append({
1536                 'format_id': format_id,
1537                 'url': manifest_url,
1538                 'manifest_url': manifest_url,
1539                 'ext': 'flv' if bootstrap_info is not None else None,
1540                 'protocol': 'f4m',
1541                 'tbr': tbr,
1542                 'width': width,
1543                 'height': height,
1544                 'vcodec': vcodec,
1545                 'preference': preference,
1546             })
1547         return formats
1548
1549     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1550         return {
1551             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1552             'url': m3u8_url,
1553             'ext': ext,
1554             'protocol': 'm3u8',
1555             'preference': preference - 100 if preference else -100,
1556             'resolution': 'multiple',
1557             'format_note': 'Quality selection URL',
1558         }
1559
1560     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1561                               entry_protocol='m3u8', preference=None,
1562                               m3u8_id=None, note=None, errnote=None,
1563                               fatal=True, live=False):
1564         res = self._download_webpage_handle(
1565             m3u8_url, video_id,
1566             note=note or 'Downloading m3u8 information',
1567             errnote=errnote or 'Failed to download m3u8 information',
1568             fatal=fatal)
1569
1570         if res is False:
1571             return []
1572
1573         m3u8_doc, urlh = res
1574         m3u8_url = urlh.geturl()
1575
1576         return self._parse_m3u8_formats(
1577             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1578             preference=preference, m3u8_id=m3u8_id, live=live)
1579
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Build format dicts from the text of an m3u8 playlist.

        m3u8_doc is the playlist text and m3u8_url the URL used to resolve
        relative URIs found in it.

        Returns an empty list for playlists carrying an Adobe Flash Access
        tag or an Apple FairPlay (skd://) session key.  A media playlist is
        returned as a single format pointing at m3u8_url.  For a master
        playlist one format is produced per VIDEO/AUDIO EXT-X-MEDIA rendition
        that has a URI and one per URI line (described by the preceding
        EXT-X-STREAM-INF tag, if any).

        When live is true the stream name / bitrate is not appended to
        format_id.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Resolve a possibly relative URI against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset after each
        # URI line has been turned into a format
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Record the rendition in its group and, for VIDEO/AUDIO
            # renditions that have a URI, add a format for it
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is treated as a stream URI
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1738
1739     @staticmethod
1740     def _xpath_ns(path, namespace=None):
1741         if not namespace:
1742             return path
1743         out = []
1744         for c in path.split('/'):
1745             if not c or c == '.':
1746                 out.append(c)
1747             else:
1748                 out.append('{%s}%s' % (namespace, c))
1749         return '/'.join(out)
1750
1751     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1752         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1753
1754         if smil is False:
1755             assert not fatal
1756             return []
1757
1758         namespace = self._parse_smil_namespace(smil)
1759
1760         return self._parse_smil_formats(
1761             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1762
1763     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1764         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1765         if smil is False:
1766             return {}
1767         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1768
1769     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1770         return self._download_xml(
1771             smil_url, video_id, 'Downloading SMIL file',
1772             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1773
1774     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1775         namespace = self._parse_smil_namespace(smil)
1776
1777         formats = self._parse_smil_formats(
1778             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1779         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1780
1781         video_id = os.path.splitext(url_basename(smil_url))[0]
1782         title = None
1783         description = None
1784         upload_date = None
1785         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1786             name = meta.attrib.get('name')
1787             content = meta.attrib.get('content')
1788             if not name or not content:
1789                 continue
1790             if not title and name == 'title':
1791                 title = content
1792             elif not description and name in ('description', 'abstract'):
1793                 description = content
1794             elif not upload_date and name == 'date':
1795                 upload_date = unified_strdate(content)
1796
1797         thumbnails = [{
1798             'id': image.get('type'),
1799             'url': image.get('src'),
1800             'width': int_or_none(image.get('width')),
1801             'height': int_or_none(image.get('height')),
1802         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1803
1804         return {
1805             'id': video_id,
1806             'title': title or video_id,
1807             'description': description,
1808             'upload_date': upload_date,
1809             'thumbnails': thumbnails,
1810             'formats': formats,
1811             'subtitles': subtitles,
1812         }
1813
1814     def _parse_smil_namespace(self, smil):
1815         return self._search_regex(
1816             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1817
1818     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1819         base = smil_url
1820         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1821             b = meta.get('base') or meta.get('httpBase')
1822             if b:
1823                 base = b
1824                 break
1825
1826         formats = []
1827         rtmp_count = 0
1828         http_count = 0
1829         m3u8_count = 0
1830
1831         srcs = []
1832         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1833         for medium in media:
1834             src = medium.get('src')
1835             if not src or src in srcs:
1836                 continue
1837             srcs.append(src)
1838
1839             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1840             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1841             width = int_or_none(medium.get('width'))
1842             height = int_or_none(medium.get('height'))
1843             proto = medium.get('proto')
1844             ext = medium.get('ext')
1845             src_ext = determine_ext(src)
1846             streamer = medium.get('streamer') or base
1847
1848             if proto == 'rtmp' or streamer.startswith('rtmp'):
1849                 rtmp_count += 1
1850                 formats.append({
1851                     'url': streamer,
1852                     'play_path': src,
1853                     'ext': 'flv',
1854                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1855                     'tbr': bitrate,
1856                     'filesize': filesize,
1857                     'width': width,
1858                     'height': height,
1859                 })
1860                 if transform_rtmp_url:
1861                     streamer, src = transform_rtmp_url(streamer, src)
1862                     formats[-1].update({
1863                         'url': streamer,
1864                         'play_path': src,
1865                     })
1866                 continue
1867
1868             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1869             src_url = src_url.strip()
1870
1871             if proto == 'm3u8' or src_ext == 'm3u8':
1872                 m3u8_formats = self._extract_m3u8_formats(
1873                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1874                 if len(m3u8_formats) == 1:
1875                     m3u8_count += 1
1876                     m3u8_formats[0].update({
1877                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1878                         'tbr': bitrate,
1879                         'width': width,
1880                         'height': height,
1881                     })
1882                 formats.extend(m3u8_formats)
1883             elif src_ext == 'f4m':
1884                 f4m_url = src_url
1885                 if not f4m_params:
1886                     f4m_params = {
1887                         'hdcore': '3.2.0',
1888                         'plugin': 'flowplayer-3.2.0.1',
1889                     }
1890                 f4m_url += '&' if '?' in f4m_url else '?'
1891                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1892                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1893             elif src_ext == 'mpd':
1894                 formats.extend(self._extract_mpd_formats(
1895                     src_url, video_id, mpd_id='dash', fatal=False))
1896             elif re.search(r'\.ism/[Mm]anifest', src_url):
1897                 formats.extend(self._extract_ism_formats(
1898                     src_url, video_id, ism_id='mss', fatal=False))
1899             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1900                 http_count += 1
1901                 formats.append({
1902                     'url': src_url,
1903                     'ext': ext or src_ext or 'flv',
1904                     'format_id': 'http-%d' % (bitrate or http_count),
1905                     'tbr': bitrate,
1906                     'filesize': filesize,
1907                     'width': width,
1908                     'height': height,
1909                 })
1910
1911         return formats
1912
1913     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1914         urls = []
1915         subtitles = {}
1916         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1917             src = textstream.get('src')
1918             if not src or src in urls:
1919                 continue
1920             urls.append(src)
1921             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1922             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1923             subtitles.setdefault(lang, []).append({
1924                 'url': src,
1925                 'ext': ext,
1926             })
1927         return subtitles
1928
1929     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1930         xspf = self._download_xml(
1931             xspf_url, playlist_id, 'Downloading xpsf playlist',
1932             'Unable to download xspf manifest', fatal=fatal)
1933         if xspf is False:
1934             return []
1935         return self._parse_xspf(
1936             xspf, playlist_id, xspf_url=xspf_url,
1937             xspf_base_url=base_url(xspf_url))
1938
1939     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1940         NS_MAP = {
1941             'xspf': 'http://xspf.org/ns/0/',
1942             's1': 'http://static.streamone.nl/player/ns/0',
1943         }
1944
1945         entries = []
1946         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1947             title = xpath_text(
1948                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1949             description = xpath_text(
1950                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1951             thumbnail = xpath_text(
1952                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1953             duration = float_or_none(
1954                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1955
1956             formats = []
1957             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1958                 format_url = urljoin(xspf_base_url, location.text)
1959                 if not format_url:
1960                     continue
1961                 formats.append({
1962                     'url': format_url,
1963                     'manifest_url': xspf_url,
1964                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1965                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1966                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1967                 })
1968             self._sort_formats(formats)
1969
1970             entries.append({
1971                 'id': playlist_id,
1972                 'title': title,
1973                 'description': description,
1974                 'thumbnail': thumbnail,
1975                 'duration': duration,
1976                 'formats': formats,
1977             })
1978         return entries
1979
1980     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1981         res = self._download_xml_handle(
1982             mpd_url, video_id,
1983             note=note or 'Downloading MPD manifest',
1984             errnote=errnote or 'Failed to download MPD manifest',
1985             fatal=fatal)
1986         if res is False:
1987             return []
1988         mpd_doc, urlh = res
1989         mpd_base_url = base_url(urlh.geturl())
1990
1991         return self._parse_mpd_formats(
1992             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1993             formats_dict=formats_dict, mpd_url=mpd_url)
1994
1995     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1996         """
1997         Parse formats from MPD manifest.
1998         References:
1999          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2000             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2001          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2002         """
2003         if mpd_doc.get('type') == 'dynamic':
2004             return []
2005
2006         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2007
2008         def _add_ns(path):
2009             return self._xpath_ns(path, namespace)
2010
2011         def is_drm_protected(element):
2012             return element.find(_add_ns('ContentProtection')) is not None
2013
2014         def extract_multisegment_info(element, ms_parent_info):
2015             ms_info = ms_parent_info.copy()
2016
2017             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2018             # common attributes and elements.  We will only extract relevant
2019             # for us.
2020             def extract_common(source):
2021                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2022                 if segment_timeline is not None:
2023                     s_e = segment_timeline.findall(_add_ns('S'))
2024                     if s_e:
2025                         ms_info['total_number'] = 0
2026                         ms_info['s'] = []
2027                         for s in s_e:
2028                             r = int(s.get('r', 0))
2029                             ms_info['total_number'] += 1 + r
2030                             ms_info['s'].append({
2031                                 't': int(s.get('t', 0)),
2032                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2033                                 'd': int(s.attrib['d']),
2034                                 'r': r,
2035                             })
2036                 start_number = source.get('startNumber')
2037                 if start_number:
2038                     ms_info['start_number'] = int(start_number)
2039                 timescale = source.get('timescale')
2040                 if timescale:
2041                     ms_info['timescale'] = int(timescale)
2042                 segment_duration = source.get('duration')
2043                 if segment_duration:
2044                     ms_info['segment_duration'] = float(segment_duration)
2045
2046             def extract_Initialization(source):
2047                 initialization = source.find(_add_ns('Initialization'))
2048                 if initialization is not None:
2049                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2050
2051             segment_list = element.find(_add_ns('SegmentList'))
2052             if segment_list is not None:
2053                 extract_common(segment_list)
2054                 extract_Initialization(segment_list)
2055                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2056                 if segment_urls_e:
2057                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2058             else:
2059                 segment_template = element.find(_add_ns('SegmentTemplate'))
2060                 if segment_template is not None:
2061                     extract_common(segment_template)
2062                     media = segment_template.get('media')
2063                     if media:
2064                         ms_info['media'] = media
2065                     initialization = segment_template.get('initialization')
2066                     if initialization:
2067                         ms_info['initialization'] = initialization
2068                     else:
2069                         extract_Initialization(segment_template)
2070             return ms_info
2071
2072         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2073         formats = []
2074         for period in mpd_doc.findall(_add_ns('Period')):
2075             period_duration = parse_duration(period.get('duration')) or mpd_duration
2076             period_ms_info = extract_multisegment_info(period, {
2077                 'start_number': 1,
2078                 'timescale': 1,
2079             })
2080             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2081                 if is_drm_protected(adaptation_set):
2082                     continue
2083                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2084                 for representation in adaptation_set.findall(_add_ns('Representation')):
2085                     if is_drm_protected(representation):
2086                         continue
2087                     representation_attrib = adaptation_set.attrib.copy()
2088                     representation_attrib.update(representation.attrib)
2089                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2090                     mime_type = representation_attrib['mimeType']
2091                     content_type = mime_type.split('/')[0]
2092                     if content_type == 'text':
2093                         # TODO implement WebVTT downloading
2094                         pass
2095                     elif content_type in ('video', 'audio'):
2096                         base_url = ''
2097                         for element in (representation, adaptation_set, period, mpd_doc):
2098                             base_url_e = element.find(_add_ns('BaseURL'))
2099                             if base_url_e is not None:
2100                                 base_url = base_url_e.text + base_url
2101                                 if re.match(r'^https?://', base_url):
2102                                     break
2103                         if mpd_base_url and not re.match(r'^https?://', base_url):
2104                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2105                                 mpd_base_url += '/'
2106                             base_url = mpd_base_url + base_url
2107                         representation_id = representation_attrib.get('id')
2108                         lang = representation_attrib.get('lang')
2109                         url_el = representation.find(_add_ns('BaseURL'))
2110                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2111                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2112                         f = {
2113                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2114                             'url': base_url,
2115                             'manifest_url': mpd_url,
2116                             'ext': mimetype2ext(mime_type),
2117                             'width': int_or_none(representation_attrib.get('width')),
2118                             'height': int_or_none(representation_attrib.get('height')),
2119                             'tbr': float_or_none(bandwidth, 1000),
2120                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2121                             'fps': int_or_none(representation_attrib.get('frameRate')),
2122                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2123                             'format_note': 'DASH %s' % content_type,
2124                             'filesize': filesize,
2125                             'container': mimetype2ext(mime_type) + '_dash',
2126                         }
2127                         f.update(parse_codecs(representation_attrib.get('codecs')))
2128                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2129
2130                         def prepare_template(template_name, identifiers):
2131                             tmpl = representation_ms_info[template_name]
2132                             # First of, % characters outside $...$ templates
2133                             # must be escaped by doubling for proper processing
2134                             # by % operator string formatting used further (see
2135                             # https://github.com/rg3/youtube-dl/issues/16867).
2136                             t = ''
2137                             in_template = False
2138                             for c in tmpl:
2139                                 t += c
2140                                 if c == '$':
2141                                     in_template = not in_template
2142                                 elif c == '%' and not in_template:
2143                                     t += c
2144                             # Next, $...$ templates are translated to their
2145                             # %(...) counterparts to be used with % operator
2146                             t = t.replace('$RepresentationID$', representation_id)
2147                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2148                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2149                             t.replace('$$', '$')
2150                             return t
2151
2152                         # @initialization is a regular template like @media one
2153                         # so it should be handled just the same way (see
2154                         # https://github.com/rg3/youtube-dl/issues/11605)
2155                         if 'initialization' in representation_ms_info:
2156                             initialization_template = prepare_template(
2157                                 'initialization',
2158                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2159                                 # $Time$ shall not be included for @initialization thus
2160                                 # only $Bandwidth$ remains
2161                                 ('Bandwidth', ))
2162                             representation_ms_info['initialization_url'] = initialization_template % {
2163                                 'Bandwidth': bandwidth,
2164                             }
2165
2166                         def location_key(location):
2167                             return 'url' if re.match(r'^https?://', location) else 'path'
2168
2169                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2170
2171                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2172                             media_location_key = location_key(media_template)
2173
2174                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2175                             # can't be used at the same time
2176                             if '%(Number' in media_template and 's' not in representation_ms_info:
2177                                 segment_duration = None
2178                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2179                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2180                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2181                                 representation_ms_info['fragments'] = [{
2182                                     media_location_key: media_template % {
2183                                         'Number': segment_number,
2184                                         'Bandwidth': bandwidth,
2185                                     },
2186                                     'duration': segment_duration,
2187                                 } for segment_number in range(
2188                                     representation_ms_info['start_number'],
2189                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2190                             else:
2191                                 # $Number*$ or $Time$ in media template with S list available
2192                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2193                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2194                                 representation_ms_info['fragments'] = []
2195                                 segment_time = 0
2196                                 segment_d = None
2197                                 segment_number = representation_ms_info['start_number']
2198
2199                                 def add_segment_url():
2200                                     segment_url = media_template % {
2201                                         'Time': segment_time,
2202                                         'Bandwidth': bandwidth,
2203                                         'Number': segment_number,
2204                                     }
2205                                     representation_ms_info['fragments'].append({
2206                                         media_location_key: segment_url,
2207                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2208                                     })
2209
2210                                 for num, s in enumerate(representation_ms_info['s']):
2211                                     segment_time = s.get('t') or segment_time
2212                                     segment_d = s['d']
2213                                     add_segment_url()
2214                                     segment_number += 1
2215                                     for r in range(s.get('r', 0)):
2216                                         segment_time += segment_d
2217                                         add_segment_url()
2218                                         segment_number += 1
2219                                     segment_time += segment_d
2220                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2221                             # No media template
2222                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2223                             # or any YouTube dashsegments video
2224                             fragments = []
2225                             segment_index = 0
2226                             timescale = representation_ms_info['timescale']
2227                             for s in representation_ms_info['s']:
2228                                 duration = float_or_none(s['d'], timescale)
2229                                 for r in range(s.get('r', 0) + 1):
2230                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2231                                     fragments.append({
2232                                         location_key(segment_uri): segment_uri,
2233                                         'duration': duration,
2234                                     })
2235                                     segment_index += 1
2236                             representation_ms_info['fragments'] = fragments
2237                         elif 'segment_urls' in representation_ms_info:
2238                             # Segment URLs with no SegmentTimeline
2239                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2240                             # https://github.com/rg3/youtube-dl/pull/14844
2241                             fragments = []
2242                             segment_duration = float_or_none(
2243                                 representation_ms_info['segment_duration'],
2244                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2245                             for segment_url in representation_ms_info['segment_urls']:
2246                                 fragment = {
2247                                     location_key(segment_url): segment_url,
2248                                 }
2249                                 if segment_duration:
2250                                     fragment['duration'] = segment_duration
2251                                 fragments.append(fragment)
2252                             representation_ms_info['fragments'] = fragments
2253                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2254                         # No fragments key is present in this case.
2255                         if 'fragments' in representation_ms_info:
2256                             f.update({
2257                                 'fragment_base_url': base_url,
2258                                 'fragments': [],
2259                                 'protocol': 'http_dash_segments',
2260                             })
2261                             if 'initialization_url' in representation_ms_info:
2262                                 initialization_url = representation_ms_info['initialization_url']
2263                                 if not f.get('url'):
2264                                     f['url'] = initialization_url
2265                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2266                             f['fragments'].extend(representation_ms_info['fragments'])
2267                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2268                         # is not necessarily unique within a Period thus formats with
2269                         # the same `format_id` are quite possible. There are numerous examples
2270                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2271                         # https://github.com/rg3/youtube-dl/issues/13919)
2272                         full_info = formats_dict.get(representation_id, {}).copy()
2273                         full_info.update(f)
2274                         formats.append(full_info)
2275                     else:
2276                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2277         return formats
2278
2279     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2280         res = self._download_xml_handle(
2281             ism_url, video_id,
2282             note=note or 'Downloading ISM manifest',
2283             errnote=errnote or 'Failed to download ISM manifest',
2284             fatal=fatal)
2285         if res is False:
2286             return []
2287         ism_doc, urlh = res
2288
2289         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2290
2291     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2292         """
2293         Parse formats from ISM manifest.
2294         References:
2295          1. [MS-SSTR]: Smooth Streaming Protocol,
2296             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2297         """
2298         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2299             return []
2300
2301         duration = int(ism_doc.attrib['Duration'])
2302         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2303
2304         formats = []
2305         for stream in ism_doc.findall('StreamIndex'):
2306             stream_type = stream.get('Type')
2307             if stream_type not in ('video', 'audio'):
2308                 continue
2309             url_pattern = stream.attrib['Url']
2310             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2311             stream_name = stream.get('Name')
2312             for track in stream.findall('QualityLevel'):
2313                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2314                 # TODO: add support for WVC1 and WMAP
2315                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2316                     self.report_warning('%s is not a supported codec' % fourcc)
2317                     continue
2318                 tbr = int(track.attrib['Bitrate']) // 1000
2319                 # [1] does not mention Width and Height attributes. However,
2320                 # they're often present while MaxWidth and MaxHeight are
2321                 # missing, so should be used as fallbacks
2322                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2323                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2324                 sampling_rate = int_or_none(track.get('SamplingRate'))
2325
2326                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2327                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2328
2329                 fragments = []
2330                 fragment_ctx = {
2331                     'time': 0,
2332                 }
2333                 stream_fragments = stream.findall('c')
2334                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2335                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2336                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2337                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2338                     if not fragment_ctx['duration']:
2339                         try:
2340                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2341                         except IndexError:
2342                             next_fragment_time = duration
2343                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2344                     for _ in range(fragment_repeat):
2345                         fragments.append({
2346                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2347                             'duration': fragment_ctx['duration'] / stream_timescale,
2348                         })
2349                         fragment_ctx['time'] += fragment_ctx['duration']
2350
2351                 format_id = []
2352                 if ism_id:
2353                     format_id.append(ism_id)
2354                 if stream_name:
2355                     format_id.append(stream_name)
2356                 format_id.append(compat_str(tbr))
2357
2358                 formats.append({
2359                     'format_id': '-'.join(format_id),
2360                     'url': ism_url,
2361                     'manifest_url': ism_url,
2362                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2363                     'width': width,
2364                     'height': height,
2365                     'tbr': tbr,
2366                     'asr': sampling_rate,
2367                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2368                     'acodec': 'none' if stream_type == 'video' else fourcc,
2369                     'protocol': 'ism',
2370                     'fragments': fragments,
2371                     '_download_params': {
2372                         'duration': duration,
2373                         'timescale': stream_timescale,
2374                         'width': width or 0,
2375                         'height': height or 0,
2376                         'fourcc': fourcc,
2377                         'codec_private_data': track.get('CodecPrivateData'),
2378                         'sampling_rate': sampling_rate,
2379                         'channels': int_or_none(track.get('Channels', 2)),
2380                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2381                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2382                     },
2383                 })
2384         return formats
2385
2386     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2387         def absolute_url(item_url):
2388             return urljoin(base_url, item_url)
2389
2390         def parse_content_type(content_type):
2391             if not content_type:
2392                 return {}
2393             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2394             if ctr:
2395                 mimetype, codecs = ctr.groups()
2396                 f = parse_codecs(codecs)
2397                 f['ext'] = mimetype2ext(mimetype)
2398                 return f
2399             return {}
2400
2401         def _media_formats(src, cur_media_type, type_info={}):
2402             full_url = absolute_url(src)
2403             ext = type_info.get('ext') or determine_ext(full_url)
2404             if ext == 'm3u8':
2405                 is_plain_url = False
2406                 formats = self._extract_m3u8_formats(
2407                     full_url, video_id, ext='mp4',
2408                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2409                     preference=preference, fatal=False)
2410             elif ext == 'mpd':
2411                 is_plain_url = False
2412                 formats = self._extract_mpd_formats(
2413                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2414             else:
2415                 is_plain_url = True
2416                 formats = [{
2417                     'url': full_url,
2418                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2419                 }]
2420             return is_plain_url, formats
2421
2422         entries = []
2423         # amp-video and amp-audio are very similar to their HTML5 counterparts
2424         # so we wll include them right here (see
2425         # https://www.ampproject.org/docs/reference/components/amp-video)
2426         media_tags = [(media_tag, media_type, '')
2427                       for media_tag, media_type
2428                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2429         media_tags.extend(re.findall(
2430             # We only allow video|audio followed by a whitespace or '>'.
2431             # Allowing more characters may end up in significant slow down (see
2432             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2433             # http://www.porntrex.com/maps/videositemap.xml).
2434             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2435         for media_tag, media_type, media_content in media_tags:
2436             media_info = {
2437                 'formats': [],
2438                 'subtitles': {},
2439             }
2440             media_attributes = extract_attributes(media_tag)
2441             src = media_attributes.get('src')
2442             if src:
2443                 _, formats = _media_formats(src, media_type)
2444                 media_info['formats'].extend(formats)
2445             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2446             if media_content:
2447                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2448                     source_attributes = extract_attributes(source_tag)
2449                     src = source_attributes.get('src')
2450                     if not src:
2451                         continue
2452                     f = parse_content_type(source_attributes.get('type'))
2453                     is_plain_url, formats = _media_formats(src, media_type, f)
2454                     if is_plain_url:
2455                         # res attribute is not standard but seen several times
2456                         # in the wild
2457                         f.update({
2458                             'height': int_or_none(source_attributes.get('res')),
2459                             'format_id': source_attributes.get('label'),
2460                         })
2461                         f.update(formats[0])
2462                         media_info['formats'].append(f)
2463                     else:
2464                         media_info['formats'].extend(formats)
2465                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2466                     track_attributes = extract_attributes(track_tag)
2467                     kind = track_attributes.get('kind')
2468                     if not kind or kind in ('subtitles', 'captions'):
2469                         src = track_attributes.get('src')
2470                         if not src:
2471                             continue
2472                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2473                         media_info['subtitles'].setdefault(lang, []).append({
2474                             'url': absolute_url(src),
2475                         })
2476             for f in media_info['formats']:
2477                 f.setdefault('http_headers', {})['Referer'] = base_url
2478             if media_info['formats'] or media_info['subtitles']:
2479                 entries.append(media_info)
2480         return entries
2481
2482     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2483         formats = []
2484         hdcore_sign = 'hdcore=3.7.0'
2485         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2486         hds_host = hosts.get('hds')
2487         if hds_host:
2488             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2489         if 'hdcore=' not in f4m_url:
2490             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2491         f4m_formats = self._extract_f4m_formats(
2492             f4m_url, video_id, f4m_id='hds', fatal=False)
2493         for entry in f4m_formats:
2494             entry.update({'extra_param_to_segment_url': hdcore_sign})
2495         formats.extend(f4m_formats)
2496         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2497         hls_host = hosts.get('hls')
2498         if hls_host:
2499             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2500         formats.extend(self._extract_m3u8_formats(
2501             m3u8_url, video_id, 'mp4', 'm3u8_native',
2502             m3u8_id='hls', fatal=False))
2503         return formats
2504
2505     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2506         query = compat_urlparse.urlparse(url).query
2507         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2508         mobj = re.search(
2509             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2510         url_base = mobj.group('url')
2511         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2512         formats = []
2513
2514         def manifest_url(manifest):
2515             m_url = '%s/%s' % (http_base_url, manifest)
2516             if query:
2517                 m_url += '?%s' % query
2518             return m_url
2519
2520         if 'm3u8' not in skip_protocols:
2521             formats.extend(self._extract_m3u8_formats(
2522                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2523                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2524         if 'f4m' not in skip_protocols:
2525             formats.extend(self._extract_f4m_formats(
2526                 manifest_url('manifest.f4m'),
2527                 video_id, f4m_id='hds', fatal=False))
2528         if 'dash' not in skip_protocols:
2529             formats.extend(self._extract_mpd_formats(
2530                 manifest_url('manifest.mpd'),
2531                 video_id, mpd_id='dash', fatal=False))
2532         if re.search(r'(?:/smil:|\.smil)', url_base):
2533             if 'smil' not in skip_protocols:
2534                 rtmp_formats = self._extract_smil_formats(
2535                     manifest_url('jwplayer.smil'),
2536                     video_id, fatal=False)
2537                 for rtmp_format in rtmp_formats:
2538                     rtsp_format = rtmp_format.copy()
2539                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2540                     del rtsp_format['play_path']
2541                     del rtsp_format['ext']
2542                     rtsp_format.update({
2543                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2544                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2545                         'protocol': 'rtsp',
2546                     })
2547                     formats.extend([rtmp_format, rtsp_format])
2548         else:
2549             for protocol in ('rtmp', 'rtsp'):
2550                 if protocol not in skip_protocols:
2551                     formats.append({
2552                         'url': '%s:%s' % (protocol, url_base),
2553                         'format_id': protocol,
2554                         'protocol': protocol,
2555                     })
2556         return formats
2557
2558     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2559         mobj = re.search(
2560             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2561             webpage)
2562         if mobj:
2563             try:
2564                 jwplayer_data = self._parse_json(mobj.group('options'),
2565                                                  video_id=video_id,
2566                                                  transform_source=transform_source)
2567             except ExtractorError:
2568                 pass
2569             else:
2570                 if isinstance(jwplayer_data, dict):
2571                     return jwplayer_data
2572
2573     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2574         jwplayer_data = self._find_jwplayer_data(
2575             webpage, video_id, transform_source=js_to_json)
2576         return self._parse_jwplayer_data(
2577             jwplayer_data, video_id, *args, **kwargs)
2578
2579     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2580                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2581         # JWPlayer backward compatibility: flattened playlists
2582         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2583         if 'playlist' not in jwplayer_data:
2584             jwplayer_data = {'playlist': [jwplayer_data]}
2585
2586         entries = []
2587
2588         # JWPlayer backward compatibility: single playlist item
2589         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2590         if not isinstance(jwplayer_data['playlist'], list):
2591             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2592
2593         for video_data in jwplayer_data['playlist']:
2594             # JWPlayer backward compatibility: flattened sources
2595             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2596             if 'sources' not in video_data:
2597                 video_data['sources'] = [video_data]
2598
2599             this_video_id = video_id or video_data['mediaid']
2600
2601             formats = self._parse_jwplayer_formats(
2602                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2603                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2604
2605             subtitles = {}
2606             tracks = video_data.get('tracks')
2607             if tracks and isinstance(tracks, list):
2608                 for track in tracks:
2609                     if not isinstance(track, dict):
2610                         continue
2611                     track_kind = track.get('kind')
2612                     if not track_kind or not isinstance(track_kind, compat_str):
2613                         continue
2614                     if track_kind.lower() not in ('captions', 'subtitles'):
2615                         continue
2616                     track_url = urljoin(base_url, track.get('file'))
2617                     if not track_url:
2618                         continue
2619                     subtitles.setdefault(track.get('label') or 'en', []).append({
2620                         'url': self._proto_relative_url(track_url)
2621                     })
2622
2623             entry = {
2624                 'id': this_video_id,
2625                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2626                 'description': video_data.get('description'),
2627                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2628                 'timestamp': int_or_none(video_data.get('pubdate')),
2629                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2630                 'subtitles': subtitles,
2631             }
2632             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2633             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2634                 entry.update({
2635                     '_type': 'url_transparent',
2636                     'url': formats[0]['url'],
2637                 })
2638             else:
2639                 self._sort_formats(formats)
2640                 entry['formats'] = formats
2641             entries.append(entry)
2642         if len(entries) == 1:
2643             return entries[0]
2644         else:
2645             return self.playlist_result(entries)
2646
2647     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2648                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2649         urls = []
2650         formats = []
2651         for source in jwplayer_sources_data:
2652             if not isinstance(source, dict):
2653                 continue
2654             source_url = self._proto_relative_url(source.get('file'))
2655             if not source_url:
2656                 continue
2657             if base_url:
2658                 source_url = compat_urlparse.urljoin(base_url, source_url)
2659             if source_url in urls:
2660                 continue
2661             urls.append(source_url)
2662             source_type = source.get('type') or ''
2663             ext = mimetype2ext(source_type) or determine_ext(source_url)
2664             if source_type == 'hls' or ext == 'm3u8':
2665                 formats.extend(self._extract_m3u8_formats(
2666                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2667                     m3u8_id=m3u8_id, fatal=False))
2668             elif source_type == 'dash' or ext == 'mpd':
2669                 formats.extend(self._extract_mpd_formats(
2670                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2671             elif ext == 'smil':
2672                 formats.extend(self._extract_smil_formats(
2673                     source_url, video_id, fatal=False))
2674             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2675             elif source_type.startswith('audio') or ext in (
2676                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2677                 formats.append({
2678                     'url': source_url,
2679                     'vcodec': 'none',
2680                     'ext': ext,
2681                 })
2682             else:
2683                 height = int_or_none(source.get('height'))
2684                 if height is None:
2685                     # Often no height is provided but there is a label in
2686                     # format like "1080p", "720p SD", or 1080.
2687                     height = int_or_none(self._search_regex(
2688                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2689                         'height', default=None))
2690                 a_format = {
2691                     'url': source_url,
2692                     'width': int_or_none(source.get('width')),
2693                     'height': height,
2694                     'tbr': int_or_none(source.get('bitrate')),
2695                     'ext': ext,
2696                 }
2697                 if source_url.startswith('rtmp'):
2698                     a_format['ext'] = 'flv'
2699                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2700                     # of jwplayer.flash.swf
2701                     rtmp_url_parts = re.split(
2702                         r'((?:mp4|mp3|flv):)', source_url, 1)
2703                     if len(rtmp_url_parts) == 3:
2704                         rtmp_url, prefix, play_path = rtmp_url_parts
2705                         a_format.update({
2706                             'url': rtmp_url,
2707                             'play_path': prefix + play_path,
2708                         })
2709                     if rtmp_params:
2710                         a_format.update(rtmp_params)
2711                 formats.append(a_format)
2712         return formats
2713
2714     def _live_title(self, name):
2715         """ Generate the title for a live video """
2716         now = datetime.datetime.now()
2717         now_str = now.strftime('%Y-%m-%d %H:%M')
2718         return name + ' ' + now_str
2719
2720     def _int(self, v, name, fatal=False, **kwargs):
2721         res = int_or_none(v, **kwargs)
2722         if 'get_attr' in kwargs:
2723             print(getattr(v, kwargs['get_attr']))
2724         if res is None:
2725             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2726             if fatal:
2727                 raise ExtractorError(msg)
2728             else:
2729                 self._downloader.report_warning(msg)
2730         return res
2731
2732     def _float(self, v, name, fatal=False, **kwargs):
2733         res = float_or_none(v, **kwargs)
2734         if res is None:
2735             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2736             if fatal:
2737                 raise ExtractorError(msg)
2738             else:
2739                 self._downloader.report_warning(msg)
2740         return res
2741
2742     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2743                     path='/', secure=False, discard=False, rest={}, **kwargs):
2744         cookie = compat_cookiejar.Cookie(
2745             0, name, value, port, port is not None, domain, True,
2746             domain.startswith('.'), path, True, secure, expire_time,
2747             discard, None, None, rest)
2748         self._downloader.cookiejar.set_cookie(cookie)
2749
2750     def _get_cookies(self, url):
2751         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2752         req = sanitized_Request(url)
2753         self._downloader.cookiejar.add_cookie_header(req)
2754         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2755
2756     def get_testcases(self, include_onlymatching=False):
2757         t = getattr(self, '_TEST', None)
2758         if t:
2759             assert not hasattr(self, '_TESTS'), \
2760                 '%s has _TEST and _TESTS' % type(self).__name__
2761             tests = [t]
2762         else:
2763             tests = getattr(self, '_TESTS', [])
2764         for t in tests:
2765             if not include_onlymatching and t.get('only_matching', False):
2766                 continue
2767             t['name'] = type(self).__name__[:-len('IE')]
2768             yield t
2769
2770     def is_suitable(self, age_limit):
2771         """ Test whether the extractor is generally suitable for the given
2772         age limit (i.e. pornographic sites are not, all others usually are) """
2773
2774         any_restricted = False
2775         for tc in self.get_testcases(include_onlymatching=False):
2776             if tc.get('playlist', []):
2777                 tc = tc['playlist'][0]
2778             is_restricted = age_restricted(
2779                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2780             if not is_restricted:
2781                 return True
2782             any_restricted = any_restricted or is_restricted
2783         return not any_restricted
2784
2785     def extract_subtitles(self, *args, **kwargs):
2786         if (self._downloader.params.get('writesubtitles', False) or
2787                 self._downloader.params.get('listsubtitles')):
2788             return self._get_subtitles(*args, **kwargs)
2789         return {}
2790
2791     def _get_subtitles(self, *args, **kwargs):
2792         raise NotImplementedError('This method must be implemented by subclasses')
2793
2794     @staticmethod
2795     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2796         """ Merge subtitle items for one language. Items with duplicated URLs
2797         will be dropped. """
2798         list1_urls = set([item['url'] for item in subtitle_list1])
2799         ret = list(subtitle_list1)
2800         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2801         return ret
2802
2803     @classmethod
2804     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2805         """ Merge two subtitle dictionaries, language by language. """
2806         ret = dict(subtitle_dict1)
2807         for lang in subtitle_dict2:
2808             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2809         return ret
2810
2811     def extract_automatic_captions(self, *args, **kwargs):
2812         if (self._downloader.params.get('writeautomaticsub', False) or
2813                 self._downloader.params.get('listsubtitles')):
2814             return self._get_automatic_captions(*args, **kwargs)
2815         return {}
2816
2817     def _get_automatic_captions(self, *args, **kwargs):
2818         raise NotImplementedError('This method must be implemented by subclasses')
2819
2820     def mark_watched(self, *args, **kwargs):
2821         if (self._downloader.params.get('mark_watched', False) and
2822                 (self._get_login_info()[0] is not None or
2823                     self._downloader.params.get('cookiefile') is not None)):
2824             self._mark_watched(*args, **kwargs)
2825
2826     def _mark_watched(self, *args, **kwargs):
2827         raise NotImplementedError('This method must be implemented by subclasses')
2828
2829     def geo_verification_headers(self):
2830         headers = {}
2831         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2832         if geo_verification_proxy:
2833             headers['Ytdl-request-proxy'] = geo_verification_proxy
2834         return headers
2835
2836     def _generic_id(self, url):
2837         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2838
2839     def _generic_title(self, url):
2840         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2841
2842
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|<positive number>):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's search queries."""
        pattern_template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern_template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query this extractor handles."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        An empty prefix requests one result, 'all' requests _MAX_RESULTS
        and a number requests that many, capped at _MAX_RESULTS with a
        warning.  Raises ExtractorError for a query that does not match.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')

        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key prefix (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY