[extractor/common] Add _download_json_handle
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_http_client,
23     compat_os_name,
24     compat_str,
25     compat_urllib_error,
26     compat_urllib_parse_unquote,
27     compat_urllib_parse_urlencode,
28     compat_urllib_request,
29     compat_urlparse,
30     compat_xml_parse_error,
31 )
32 from ..downloader.f4m import (
33     get_base_url,
34     remove_encrypted_media,
35 )
36 from ..utils import (
37     NO_DEFAULT,
38     age_restricted,
39     base_url,
40     bug_reports_message,
41     clean_html,
42     compiled_regex_type,
43     determine_ext,
44     determine_protocol,
45     error_to_compat_str,
46     ExtractorError,
47     extract_attributes,
48     fix_xml_ampersands,
49     float_or_none,
50     GeoRestrictedError,
51     GeoUtils,
52     int_or_none,
53     js_to_json,
54     mimetype2ext,
55     orderedSet,
56     parse_codecs,
57     parse_duration,
58     parse_iso8601,
59     parse_m3u8_attributes,
60     RegexNotFoundError,
61     sanitized_Request,
62     sanitize_filename,
63     unescapeHTML,
64     unified_strdate,
65     unified_timestamp,
66     update_Request,
67     update_url_query,
68     urljoin,
69     url_basename,
70     xpath_element,
71     xpath_text,
72     xpath_with_ns,
73 )
74
75
76 class InfoExtractor(object):
77     """Information Extractor class.
78
79     Information extractors are the classes that, given a URL, extract
80     information about the video (or videos) the URL refers to. This
81     information includes the real video URL, the video title, author and
82     others. The information is stored in a dictionary which is then
83     passed to the YoutubeDL. The YoutubeDL processes this
84     information possibly downloading the video to the file system, among
85     other possible outcomes.
86
87     The type field determines the type of the result.
88     By far the most common value (and the default if _type is missing) is
89     "video", which indicates a single video.
90
91     For a video, the dictionaries must include the following fields:
92
93     id:             Video identifier.
94     title:          Video title, unescaped.
95
96     Additionally, it must contain either a formats entry or a url one:
97
98     formats:        A list of dictionaries for each format available, ordered
99                     from worst to best quality.
100
101                     Potential fields:
102                     * url        Mandatory. The URL of the video file
103                     * manifest_url
104                                  The URL of the manifest file in case of
105                                  fragmented media (DASH, hls, hds)
106                     * ext        Will be calculated from URL if missing
107                     * format     A human-readable description of the format
108                                  ("mp4 container with h264/opus").
109                                  Calculated from the format_id, width, height.
110                                  and format_note fields if missing.
111                     * format_id  A short description of the format
112                                  ("mp4_h264_opus" or "19").
113                                 Technically optional, but strongly recommended.
114                     * format_note Additional info about the format
115                                  ("3D" or "DASH video")
116                     * width      Width of the video, if known
117                     * height     Height of the video, if known
118                     * resolution Textual description of width and height
119                     * tbr        Average bitrate of audio and video in KBit/s
120                     * abr        Average audio bitrate in KBit/s
121                     * acodec     Name of the audio codec in use
122                     * asr        Audio sampling rate in Hertz
123                     * vbr        Average video bitrate in KBit/s
124                     * fps        Frame rate
125                     * vcodec     Name of the video codec in use
126                     * container  Name of the container format
127                     * filesize   The number of bytes, if known in advance
128                     * filesize_approx  An estimate for the number of bytes
129                     * player_url SWF Player URL (used for rtmpdump).
130                     * protocol   The protocol that will be used for the actual
131                                  download, lower-case.
132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
133                                  "m3u8", "m3u8_native" or "http_dash_segments".
134                     * fragment_base_url
135                                  Base URL for fragments. Each fragment's path
136                                  value (if present) will be relative to
137                                  this URL.
138                     * fragments  A list of fragments of a fragmented media.
139                                  Each fragment entry must contain either an url
140                                  or a path. If an url is present it should be
141                                  considered by a client. Otherwise both path and
142                                  fragment_base_url must be present. Here is
143                                  the list of all potential fields:
144                                  * "url" - fragment's URL
145                                  * "path" - fragment's path relative to
146                                             fragment_base_url
147                                  * "duration" (optional, int or float)
148                                  * "filesize" (optional, int)
149                     * preference Order number of this format. If this field is
150                                  present and not None, the formats get sorted
151                                  by this field, regardless of all other values.
152                                  -1 for default (order by other properties),
153                                  -2 or smaller for less than default.
154                                  < -1000 to hide the format (if there is
155                                     another one which is strictly better)
156                     * language   Language code, e.g. "de" or "en-US".
157                     * language_preference  Is this in the language mentioned in
158                                  the URL?
159                                  10 if it's what the URL is about,
160                                  -1 for default (don't know),
161                                  -10 otherwise, other values reserved for now.
162                     * quality    Order number of the video quality of this
163                                  format, irrespective of the file format.
164                                  -1 for default (order by other properties),
165                                  -2 or smaller for less than default.
166                     * source_preference  Order number for this video source
167                                   (quality takes higher priority)
168                                  -1 for default (order by other properties),
169                                  -2 or smaller for less than default.
170                     * http_headers  A dictionary of additional HTTP headers
171                                  to add to the request.
172                     * stretched_ratio  If given and not 1, indicates that the
173                                  video's pixels are not square.
174                                  width : height ratio as float.
175                     * no_resume  The server does not support resuming the
176                                  (HTTP or RTMP) download. Boolean.
177                     * downloader_options  A dictionary of downloader options as
178                                  described in FileDownloader
179
180     url:            Final video URL.
181     ext:            Video filename extension.
182     format:         The video format, defaults to ext (used for --get-format)
183     player_url:     SWF Player URL (used for rtmpdump).
184
185     The following fields are optional:
186
187     alt_title:      A secondary title of the video.
188     display_id      An alternative identifier for the video, not necessarily
189                     unique, but available before title. Typically, id is
190                     something like "4234987", title "Dancing naked mole rats",
191                     and display_id "dancing-naked-mole-rats"
192     thumbnails:     A list of dictionaries, with the following entries:
193                         * "id" (optional, string) - Thumbnail format ID
194                         * "url"
195                         * "preference" (optional, int) - quality of the image
196                         * "width" (optional, int)
197                         * "height" (optional, int)
198                         * "resolution" (optional, string "{width}x{height}",
199                                         deprecated)
200                         * "filesize" (optional, int)
201     thumbnail:      Full URL to a video thumbnail image.
202     description:    Full video description.
203     uploader:       Full name of the video uploader.
204     license:        License name the video is licensed under.
205     creator:        The creator of the video.
206     release_date:   The date (YYYYMMDD) when the video was released.
207     timestamp:      UNIX timestamp of the moment the video became available.
208     upload_date:    Video upload date (YYYYMMDD).
209                     If not explicitly set, calculated from timestamp.
210     uploader_id:    Nickname or id of the video uploader.
211     uploader_url:   Full URL to a personal webpage of the video uploader.
212     location:       Physical location where the video was filmed.
213     subtitles:      The available subtitles as a dictionary in the format
214                     {tag: subformats}. "tag" is usually a language code, and
215                     "subformats" is a list sorted from lower to higher
216                     preference, each element is a dictionary with the "ext"
217                     entry and one of:
218                         * "data": The subtitles file contents
219                         * "url": A URL pointing to the subtitles file
220                     "ext" will be calculated from URL if missing
221     automatic_captions: Like 'subtitles', used by the YoutubeIE for
222                     automatically generated captions
223     duration:       Length of the video in seconds, as an integer or float.
224     view_count:     How many users have watched the video on the platform.
225     like_count:     Number of positive ratings of the video
226     dislike_count:  Number of negative ratings of the video
227     repost_count:   Number of reposts of the video
228     average_rating: Average rating given by users, the scale used depends on the webpage
229     comment_count:  Number of comments on the video
230     comments:       A list of comments, each with one or more of the following
231                     properties (all but one of text or html optional):
232                         * "author" - human-readable name of the comment author
233                         * "author_id" - user ID of the comment author
234                         * "id" - Comment ID
235                         * "html" - Comment as HTML
236                         * "text" - Plain text of the comment
237                         * "timestamp" - UNIX timestamp of comment
238                         * "parent" - ID of the comment this one is replying to.
239                                      Set to "root" to indicate that this is a
240                                      comment to the original video.
241     age_limit:      Age restriction for the video, as an integer (years)
242     webpage_url:    The URL to the video webpage, if given to youtube-dl it
243                     should allow to get the same result again. (It will be set
244                     by YoutubeDL if it's missing)
245     categories:     A list of categories that the video falls in, for example
246                     ["Sports", "Berlin"]
247     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
248     is_live:        True, False, or None (=unknown). Whether this video is a
249                     live stream that goes on instead of a fixed-length video.
250     start_time:     Time in seconds where the reproduction should start, as
251                     specified in the URL.
252     end_time:       Time in seconds where the reproduction should end, as
253                     specified in the URL.
254     chapters:       A list of dictionaries, with the following entries:
255                         * "start_time" - The start time of the chapter in seconds
256                         * "end_time" - The end time of the chapter in seconds
257                         * "title" (optional, string)
258
259     The following fields should only be used when the video belongs to some logical
260     chapter or section:
261
262     chapter:        Name or title of the chapter the video belongs to.
263     chapter_number: Number of the chapter the video belongs to, as an integer.
264     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
265
266     The following fields should only be used when the video is an episode of some
267     series, programme or podcast:
268
269     series:         Title of the series or programme the video episode belongs to.
270     season:         Title of the season the video episode belongs to.
271     season_number:  Number of the season the video episode belongs to, as an integer.
272     season_id:      Id of the season the video episode belongs to, as a unicode string.
273     episode:        Title of the video episode. Unlike mandatory video title field,
274                     this field should denote the exact title of the video episode
275                     without any kind of decoration.
276     episode_number: Number of the video episode within a season, as an integer.
277     episode_id:     Id of the video episode, as a unicode string.
278
279     The following fields should only be used when the media is a track or a part of
280     a music album:
281
282     track:          Title of the track.
283     track_number:   Number of the track within an album or a disc, as an integer.
284     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
285                     as a unicode string.
286     artist:         Artist(s) of the track.
287     genre:          Genre(s) of the track.
288     album:          Title of the album the track belongs to.
289     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
290     album_artist:   List of all artists appeared on the album (e.g.
291                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
292                     and compilations).
293     disc_number:    Number of the disc or other physical medium the track belongs to,
294                     as an integer.
295     release_year:   Year (YYYY) when the album was released.
296
297     Unless mentioned otherwise, the fields should be Unicode strings.
298
299     Unless mentioned otherwise, None is equivalent to absence of information.
300
301
302     _type "playlist" indicates multiple videos.
303     There must be a key "entries", which is a list, an iterable, or a PagedList
304     object, each element of which is a valid dictionary by this specification.
305
306     Additionally, playlists can have "id", "title", "description", "uploader",
307     "uploader_id", "uploader_url" attributes with the same semantics as videos
308     (see above).
309
310
311     _type "multi_video" indicates that there are multiple videos that
312     form a single show, for example multiple acts of an opera or TV episode.
313     It must have an entries key like a playlist and contain all the keys
314     required for a video at the same time.
315
316
317     _type "url" indicates that the video must be extracted from another
318     location, possibly by a different extractor. Its only required key is:
319     "url" - the next URL to extract.
320     The key "ie_key" can be set to the class name (minus the trailing "IE",
321     e.g. "Youtube") if the extractor class is known in advance.
322     Additionally, the dictionary may have any properties of the resolved entity
323     known in advance, for example "title" if the title of the referred video is
324     known ahead of time.
325
326
327     _type "url_transparent" entities have the same specification as "url", but
328     indicate that the given additional information is more precise than the one
329     associated with the resolved URL.
330     This is useful when a site employs a video service that hosts the video and
331     its technical metadata, but that video service does not embed a useful
332     title, description etc.
333
334
335     Subclasses of this one should re-define the _real_initialize() and
336     _real_extract() methods and define a _VALID_URL regexp.
337     Probably, they should also be added to the list of extractors.
338
339     _GEO_BYPASS attribute may be set to False in order to disable
340     geo restriction bypass mechanisms for a particular extractor.
341     Though it won't disable explicit geo restriction bypass based on
342     country code provided with geo_bypass_country. (experimental)
343
344     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
345     countries for this extractor. One of these countries will be used by
346     geo restriction bypass mechanism right away in order to bypass
347     geo restriction, of course, if the mechanism is not disabled. (experimental)
348
349     NB: both these geo attributes are experimental and may change in future
350     or be completely removed.
351
352     Finally, the _WORKING attribute should be set to False for broken IEs
353     in order to warn the users and skip the tests.
354     """
355
    # Class-level defaults (per-instance state is reset in __init__):
    _ready = False              # True once _real_initialize() has run
    _downloader = None          # the YoutubeDL instance driving this extractor
    _x_forwarded_for_ip = None  # fake client IP for geo bypass, if one was generated
    _GEO_BYPASS = True          # set False to disable the X-Forwarded-For geo bypass
    _GEO_COUNTRIES = None       # presumably geo-unrestricted countries for this extractor
    _WORKING = True             # set False in broken extractors to warn users / skip tests
362
363     def __init__(self, downloader=None):
364         """Constructor. Receives an optional downloader."""
365         self._ready = False
366         self._x_forwarded_for_ip = None
367         self.set_downloader(downloader)
368
369     @classmethod
370     def suitable(cls, url):
371         """Receives a URL and returns True if suitable for this IE."""
372
373         # This does not use has/getattr intentionally - we want to know whether
374         # we have cached the regexp for *this* class, whereas getattr would also
375         # match the superclass
376         if '_VALID_URL_RE' not in cls.__dict__:
377             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
378         return cls._VALID_URL_RE.match(url) is not None
379
380     @classmethod
381     def _match_id(cls, url):
382         if '_VALID_URL_RE' not in cls.__dict__:
383             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
384         m = cls._VALID_URL_RE.match(url)
385         assert m
386         return compat_str(m.group('id'))
387
388     @classmethod
389     def working(cls):
390         """Getter method for _WORKING."""
391         return cls._WORKING
392
393     def initialize(self):
394         """Initializes an instance (authentication, etc)."""
395         self._initialize_geo_bypass(self._GEO_COUNTRIES)
396         if not self._ready:
397             self._real_initialize()
398             self._ready = True
399
    def _initialize_geo_bypass(self, countries):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES.

        You may also manually call it from extractor's code if geo countries
        information is not available beforehand (e.g. obtained during
        extraction) or due to some another reason.
        """
        # Only pick a fake IP once per extractor instance; later calls are no-ops.
        if not self._x_forwarded_for_ip:
            # An explicitly requested country (--geo-bypass-country) always wins.
            country_code = self._downloader.params.get('geo_bypass_country', None)
            # If there is no explicit country for geo bypass specified and
            # the extractor is known to be geo restricted let's fake IP
            # as X-Forwarded-For right away.
            if (not country_code and
                    self._GEO_BYPASS and
                    self._downloader.params.get('geo_bypass', True) and
                    countries):
                country_code = random.choice(countries)
            if country_code:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country_code.upper()))
433
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            # At most two attempts: the second one only happens when a geo
            # restriction error triggers the fake-IP retry below.
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        # Propagate the faked client IP so subsequent requests
                        # (e.g. the actual download) keep using it.
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            # Already a well-formed extractor error; re-raise untouched.
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
454
    def __maybe_fake_ip_and_retry(self, countries):
        # Set up a fake X-Forwarded-For IP and return True if extraction
        # should be retried; return False otherwise. Applies only when geo
        # bypass is enabled, no explicit country or fake IP is already in
        # effect, and a candidate country list is available.
        if (not self._downloader.params.get('geo_bypass_country', None) and
                self._GEO_BYPASS and
                self._downloader.params.get('geo_bypass', True) and
                not self._x_forwarded_for_ip and
                countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False
469
470     def set_downloader(self, downloader):
471         """Sets the downloader for this IE."""
472         self._downloader = downloader
473
474     def _real_initialize(self):
475         """Real initialization process. Redefine in subclasses."""
476         pass
477
478     def _real_extract(self, url):
479         """Real extraction process. Redefine in subclasses."""
480         pass
481
482     @classmethod
483     def ie_key(cls):
484         """A string for getting the InfoExtractor with get_info_extractor"""
485         return compat_str(cls.__name__[:-2])
486
487     @property
488     def IE_NAME(self):
489         return compat_str(type(self).__name__[:-2])
490
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        """ Returns the response handle """
        # note=None -> default "Downloading webpage" report;
        # note=False -> suppress progress output entirely.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            # Never override a header the caller set explicitly.
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        # Normalize the argument into a Request carrying data/headers/query.
        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # errnote=False -> fail completely silently (just return False).
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
532
533     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
534         """ Returns a tuple (page content as string, URL handle) """
535         # Strip hashes from the URL (#1038)
536         if isinstance(url_or_request, (compat_str, str)):
537             url_or_request = url_or_request.partition('#')[0]
538
539         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
540         if urlh is False:
541             assert not fatal
542             return False
543         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
544         return (content, urlh)
545
546     @staticmethod
547     def _guess_encoding_from_content(content_type, webpage_bytes):
548         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
549         if m:
550             encoding = m.group(1)
551         else:
552             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
553                           webpage_bytes[:1024])
554             if m:
555                 encoding = m.group(1).decode('ascii')
556             elif webpage_bytes.startswith(b'\xff\xfe'):
557                 encoding = 'utf-16'
558             else:
559                 encoding = 'utf-8'
560
561         return encoding
562
    def __check_blocked(self, content):
        # Raise an ExtractorError if the page is a known censorship/filtering
        # interstitial rather than the requested content; otherwise return None.
        first_block = content[:512]
        # Websense network filtering software (title checked in the whole
        # page, the vendor name only within the first 512 characters).
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian censorship block page.
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian government blocklist interstitial (TTK provider page).
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
                'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
590
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        # Read the response body from urlh and decode it to a string,
        # honoring the dump_intermediate_pages/write_pages debug options
        # and running the censorship-block detection on the result.
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps binary-ish bodies printable on the console.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Keep the filename within typical filesystem limits by
                # replacing the overlong tail with a hash of the full name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name: fall back to UTF-8 instead of failing.
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
627
628     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
629         """ Returns the data of the page as a string """
630         success = False
631         try_count = 0
632         while success is False:
633             try:
634                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
635                 success = True
636             except compat_http_client.IncompleteRead as e:
637                 try_count += 1
638                 if try_count >= tries:
639                     raise e
640                 self._sleep(timeout, video_id)
641         if res is False:
642             return res
643         else:
644             content, _ = res
645             return content
646
647     def _download_xml_handle(
648             self, url_or_request, video_id, note='Downloading XML',
649             errnote='Unable to download XML', transform_source=None,
650             fatal=True, encoding=None, data=None, headers={}, query={}):
651         """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
652         res = self._download_webpage_handle(
653             url_or_request, video_id, note, errnote, fatal=fatal,
654             encoding=encoding, data=data, headers=headers, query=query)
655         if res is False:
656             return res
657         xml_string, urlh = res
658         return self._parse_xml(
659             xml_string, video_id, transform_source=transform_source,
660             fatal=fatal), urlh
661
662     def _download_xml(self, url_or_request, video_id,
663                       note='Downloading XML', errnote='Unable to download XML',
664                       transform_source=None, fatal=True, encoding=None,
665                       data=None, headers={}, query={}):
666         """Return the xml as an xml.etree.ElementTree.Element"""
667         res = self._download_xml_handle(
668             url_or_request, video_id, note=note, errnote=errnote,
669             transform_source=transform_source, fatal=fatal, encoding=encoding,
670             data=data, headers=headers, query=query)
671         return res if res is False else res[0]
672
673     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
674         if transform_source:
675             xml_string = transform_source(xml_string)
676         try:
677             return compat_etree_fromstring(xml_string.encode('utf-8'))
678         except compat_xml_parse_error as ve:
679             errmsg = '%s: Failed to parse XML ' % video_id
680             if fatal:
681                 raise ExtractorError(errmsg, cause=ve)
682             else:
683                 self.report_warning(errmsg + str(ve))
684
685     def _download_json_handle(
686             self, url_or_request, video_id, note='Downloading JSON metadata',
687             errnote='Unable to download JSON metadata', transform_source=None,
688             fatal=True, encoding=None, data=None, headers={}, query={}):
689         """Return a tuple (JSON object, URL handle)"""
690         res = self._download_webpage_handle(
691             url_or_request, video_id, note, errnote, fatal=fatal,
692             encoding=encoding, data=data, headers=headers, query=query)
693         if res is False:
694             return res
695         json_string, urlh = res
696         return self._parse_json(
697             json_string, video_id, transform_source=transform_source,
698             fatal=fatal), urlh
699
700     def _download_json(
701             self, url_or_request, video_id, note='Downloading JSON metadata',
702             errnote='Unable to download JSON metadata', transform_source=None,
703             fatal=True, encoding=None, data=None, headers={}, query={}):
704         res = self._download_json_handle(
705             url_or_request, video_id, note=note, errnote=errnote,
706             transform_source=transform_source, fatal=fatal, encoding=encoding,
707             data=data, headers=headers, query=query)
708         return res if res is False else res[0]
709
710     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
711         if transform_source:
712             json_string = transform_source(json_string)
713         try:
714             return json.loads(json_string)
715         except ValueError as ve:
716             errmsg = '%s: Failed to parse JSON ' % video_id
717             if fatal:
718                 raise ExtractorError(errmsg, cause=ve)
719             else:
720                 self.report_warning(errmsg + str(ve))
721
722     def report_warning(self, msg, video_id=None):
723         idstr = '' if video_id is None else '%s: ' % video_id
724         self._downloader.report_warning(
725             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
726
727     def to_screen(self, msg):
728         """Print msg to screen, prefixing it with '[ie_name]'"""
729         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
730
731     def report_extraction(self, id_or_name):
732         """Report information extraction."""
733         self.to_screen('%s: Extracting information' % id_or_name)
734
735     def report_download_webpage(self, video_id):
736         """Report webpage download."""
737         self.to_screen('%s: Downloading webpage' % video_id)
738
739     def report_age_confirmation(self):
740         """Report attempt to confirm age."""
741         self.to_screen('Confirming age')
742
743     def report_login(self):
744         """Report attempt to log in."""
745         self.to_screen('Logging in')
746
747     @staticmethod
748     def raise_login_required(msg='This video is only available for registered users'):
749         raise ExtractorError(
750             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
751             expected=True)
752
753     @staticmethod
754     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
755         raise GeoRestrictedError(msg, countries=countries)
756
757     # Methods for following #608
758     @staticmethod
759     def url_result(url, ie=None, video_id=None, video_title=None):
760         """Returns a URL that points to a page that should be processed"""
761         # TODO: ie should be the class used for getting the info
762         video_info = {'_type': 'url',
763                       'url': url,
764                       'ie_key': ie}
765         if video_id is not None:
766             video_info['id'] = video_id
767         if video_title is not None:
768             video_info['title'] = video_title
769         return video_info
770
771     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
772         urls = orderedSet(
773             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
774             for m in matches)
775         return self.playlist_result(
776             urls, playlist_id=playlist_id, playlist_title=playlist_title)
777
778     @staticmethod
779     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
780         """Returns a playlist"""
781         video_info = {'_type': 'playlist',
782                       'entries': entries}
783         if playlist_id:
784             video_info['id'] = playlist_id
785         if playlist_title:
786             video_info['title'] = playlist_title
787         if playlist_description:
788             video_info['description'] = playlist_description
789         return video_info
790
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # A single pattern (string or precompiled) is searched directly;
        # a list of patterns is tried in order until one matches.
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Color the field name blue in error output when writing to a
        # color-capable terminal (not on Windows, not with --no-color)
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
824
825     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
826         """
827         Like _search_regex, but strips HTML tags and unescapes entities.
828         """
829         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
830         if res:
831             return clean_html(res).strip()
832         else:
833             return res
834
835     def _get_netrc_login_info(self, netrc_machine=None):
836         username = None
837         password = None
838         netrc_machine = netrc_machine or self._NETRC_MACHINE
839
840         if self._downloader.params.get('usenetrc', False):
841             try:
842                 info = netrc.netrc().authenticators(netrc_machine)
843                 if info is not None:
844                     username = info[0]
845                     password = info[2]
846                 else:
847                     raise netrc.NetrcParseError(
848                         'No authenticators for %s' % netrc_machine)
849             except (IOError, netrc.NetrcParseError) as err:
850                 self._downloader.report_warning(
851                     'parsing .netrc: %s' % error_to_compat_str(err))
852
853         return username, password
854
855     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
856         """
857         Get the login info as (username, password)
858         First look for the manually specified credentials using username_option
859         and password_option as keys in params dictionary. If no such credentials
860         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
861         value.
862         If there's no info available, return (None, None)
863         """
864         if self._downloader is None:
865             return (None, None)
866
867         downloader_params = self._downloader.params
868
869         # Attempt to use provided username and password or .netrc data
870         if downloader_params.get(username_option) is not None:
871             username = downloader_params[username_option]
872             password = downloader_params[password_option]
873         else:
874             username, password = self._get_netrc_login_info(netrc_machine)
875
876         return username, password
877
878     def _get_tfa_info(self, note='two-factor verification code'):
879         """
880         Get the two-factor authentication info
881         TODO - asking the user will be required for sms/phone verify
882         currently just uses the command line option
883         If there's no info available, return None
884         """
885         if self._downloader is None:
886             return None
887         downloader_params = self._downloader.params
888
889         if downloader_params.get('twofactor') is not None:
890             return downloader_params['twofactor']
891
892         return compat_getpass('Type %s and press [Return]: ' % note)
893
894     # Helper functions for extracting OpenGraph info
895     @staticmethod
896     def _og_regexes(prop):
897         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
898         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
899                        % {'prop': re.escape(prop)})
900         template = r'<meta[^>]+?%s[^>]+?%s'
901         return [
902             template % (property_re, content_re),
903             template % (content_re, property_re),
904         ]
905
906     @staticmethod
907     def _meta_regex(prop):
908         return r'''(?isx)<meta
909                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
910                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
911
912     def _og_search_property(self, prop, html, name=None, **kargs):
913         if not isinstance(prop, (list, tuple)):
914             prop = [prop]
915         if name is None:
916             name = 'OpenGraph %s' % prop[0]
917         og_regexes = []
918         for p in prop:
919             og_regexes.extend(self._og_regexes(p))
920         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
921         if escaped is None:
922             return None
923         return unescapeHTML(escaped)
924
925     def _og_search_thumbnail(self, html, **kargs):
926         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
927
928     def _og_search_description(self, html, **kargs):
929         return self._og_search_property('description', html, fatal=False, **kargs)
930
931     def _og_search_title(self, html, **kargs):
932         return self._og_search_property('title', html, **kargs)
933
934     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
935         regexes = self._og_regexes('video') + self._og_regexes('video:url')
936         if secure:
937             regexes = self._og_regexes('video:secure_url') + regexes
938         return self._html_search_regex(regexes, html, name, **kargs)
939
940     def _og_search_url(self, html, **kargs):
941         return self._og_search_property('url', html, **kargs)
942
943     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
944         if not isinstance(name, (list, tuple)):
945             name = [name]
946         if display_name is None:
947             display_name = name[0]
948         return self._html_search_regex(
949             [self._meta_regex(n) for n in name],
950             html, display_name, fatal=fatal, group='content', **kwargs)
951
952     def _dc_search_uploader(self, html):
953         return self._html_search_meta('dc.creator', html, 'uploader')
954
955     def _rta_search(self, html):
956         # See http://www.rtalabel.org/index.php?content=howtofaq#single
957         if re.search(r'(?ix)<meta\s+name="rating"\s+'
958                      r'     content="RTA-5042-1996-1400-1577-RTA"',
959                      html):
960             return 18
961         return 0
962
963     def _media_rating_search(self, html):
964         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
965         rating = self._html_search_meta('rating', html)
966
967         if not rating:
968             return None
969
970         RATING_TABLE = {
971             'safe for kids': 0,
972             'general': 8,
973             '14 years': 14,
974             'mature': 17,
975             'restricted': 19,
976         }
977         return RATING_TABLE.get(rating.lower())
978
979     def _family_friendly_search(self, html):
980         # See http://schema.org/VideoObject
981         family_friendly = self._html_search_meta(
982             'isFamilyFriendly', html, default=None)
983
984         if not family_friendly:
985             return None
986
987         RATING_TABLE = {
988             '1': 0,
989             'true': 0,
990             '0': 18,
991             'false': 18,
992         }
993         return RATING_TABLE.get(family_friendly.lower())
994
995     def _twitter_search_player(self, html):
996         return self._html_search_meta('twitter:player', html,
997                                       'twitter card player')
998
999     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1000         json_ld = self._search_regex(
1001             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
1002             html, 'JSON-LD', group='json_ld', **kwargs)
1003         default = kwargs.get('default', NO_DEFAULT)
1004         if not json_ld:
1005             return default if default is not NO_DEFAULT else {}
1006         # JSON-LD may be malformed and thus `fatal` should be respected.
1007         # At the same time `default` may be passed that assumes `fatal=False`
1008         # for _search_regex. Let's simulate the same behavior here as well.
1009         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1010         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1011
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Build an info dict from JSON-LD data (a string or an already
        decoded object).

        Only entries whose @context is schema.org are considered. When
        expected_type is given, an entry of another @type stops the scan.
        None-valued fields are stripped from the returned dict.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize a single entry to a one-element list
            json_ld = [json_ld]

        def extract_video_object(e):
            # Fill info from a schema.org VideoObject entry
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': e.get('contentUrl'),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })

        for e in json_ld:
            # Entries without a schema.org @context are skipped entirely
            if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type in ('TVEpisode', 'Episode'):
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    # Keep scanning: further entries may contribute fields
                    extract_video_object(e)
                    continue
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                # Any other matched entry type ends the scan
                break
        return dict((k, v) for k, v in info.items() if v is not None)
1070
1071     @staticmethod
1072     def _hidden_inputs(html):
1073         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1074         hidden_inputs = {}
1075         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1076             attrs = extract_attributes(input)
1077             if not input:
1078                 continue
1079             if attrs.get('type') not in ('hidden', 'submit'):
1080                 continue
1081             name = attrs.get('name') or attrs.get('id')
1082             value = attrs.get('value')
1083             if name and value is not None:
1084                 hidden_inputs[name] = value
1085         return hidden_inputs
1086
1087     def _form_hidden_inputs(self, form_id, html):
1088         form = self._search_regex(
1089             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1090             html, '%s form' % form_id, group='form')
1091         return self._hidden_inputs(form)
1092
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best quality.

        field_preference -- optional list/tuple of format-dict keys to
            sort by instead of the built-in heuristic key.

        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified sort keys; missing values sort last
                # ('' for format_id, -1 for numeric fields)
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare lexicographically; -1/'' stand in for missing
            # values so absent fields rank lowest
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1168
1169     def _check_formats(self, formats, video_id):
1170         if formats:
1171             formats[:] = filter(
1172                 lambda f: self._is_valid_url(
1173                     f['url'], video_id,
1174                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1175                 formats)
1176
1177     @staticmethod
1178     def _remove_duplicate_formats(formats):
1179         format_urls = set()
1180         unique_formats = []
1181         for f in formats:
1182             if f['url'] not in format_urls:
1183                 format_urls.add(f['url'])
1184                 unique_formats.append(f)
1185         formats[:] = unique_formats
1186
1187     def _is_valid_url(self, url, video_id, item='video', headers={}):
1188         url = self._proto_relative_url(url, scheme='http:')
1189         # For now assume non HTTP(S) URLs always valid
1190         if not (url.startswith('http://') or url.startswith('https://')):
1191             return True
1192         try:
1193             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1194             return True
1195         except ExtractorError as e:
1196             if isinstance(e.cause, compat_urllib_error.URLError):
1197                 self.to_screen(
1198                     '%s: %s URL is invalid, skipping' % (video_id, item))
1199                 return False
1200             raise
1201
1202     def http_scheme(self):
1203         """ Either "http:" or "https:", depending on the user's preferences """
1204         return (
1205             'http:'
1206             if self._downloader.params.get('prefer_insecure', False)
1207             else 'https:')
1208
1209     def _proto_relative_url(self, url, scheme=None):
1210         if url is None:
1211             return url
1212         if url.startswith('//'):
1213             if scheme is None:
1214                 scheme = self.http_scheme()
1215             return scheme + url
1216         else:
1217             return url
1218
1219     def _sleep(self, timeout, video_id, msg_template=None):
1220         if msg_template is None:
1221             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1222         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1223         self.to_screen(msg)
1224         time.sleep(timeout)
1225
1226     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1227                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1228                              fatal=True, m3u8_id=None):
1229         manifest = self._download_xml(
1230             manifest_url, video_id, 'Downloading f4m manifest',
1231             'Unable to download f4m manifest',
1232             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1233             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1234             transform_source=transform_source,
1235             fatal=fatal)
1236
1237         if manifest is False:
1238             return []
1239
1240         return self._parse_f4m_formats(
1241             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1242             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1243
1244     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1245                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1246                            fatal=True, m3u8_id=None):
1247         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1248         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1249         if akamai_pv is not None and ';' in akamai_pv.text:
1250             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1251             if playerVerificationChallenge.strip() != '':
1252                 return []
1253
1254         formats = []
1255         manifest_version = '1.0'
1256         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1257         if not media_nodes:
1258             manifest_version = '2.0'
1259             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1260         # Remove unsupported DRM protected media from final formats
1261         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1262         media_nodes = remove_encrypted_media(media_nodes)
1263         if not media_nodes:
1264             return formats
1265
1266         manifest_base_url = get_base_url(manifest)
1267
1268         bootstrap_info = xpath_element(
1269             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1270             'bootstrap info', default=None)
1271
1272         vcodec = None
1273         mime_type = xpath_text(
1274             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1275             'base URL', default=None)
1276         if mime_type and mime_type.startswith('audio/'):
1277             vcodec = 'none'
1278
1279         for i, media_el in enumerate(media_nodes):
1280             tbr = int_or_none(media_el.attrib.get('bitrate'))
1281             width = int_or_none(media_el.attrib.get('width'))
1282             height = int_or_none(media_el.attrib.get('height'))
1283             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1284             # If <bootstrapInfo> is present, the specified f4m is a
1285             # stream-level manifest, and only set-level manifests may refer to
1286             # external resources.  See section 11.4 and section 4 of F4M spec
1287             if bootstrap_info is None:
1288                 media_url = None
1289                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1290                 if manifest_version == '2.0':
1291                     media_url = media_el.attrib.get('href')
1292                 if media_url is None:
1293                     media_url = media_el.attrib.get('url')
1294                 if not media_url:
1295                     continue
1296                 manifest_url = (
1297                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1298                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1299                 # If media_url is itself a f4m manifest do the recursive extraction
1300                 # since bitrates in parent manifest (this one) and media_url manifest
1301                 # may differ leading to inability to resolve the format by requested
1302                 # bitrate in f4m downloader
1303                 ext = determine_ext(manifest_url)
1304                 if ext == 'f4m':
1305                     f4m_formats = self._extract_f4m_formats(
1306                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1307                         transform_source=transform_source, fatal=fatal)
1308                     # Sometimes stream-level manifest contains single media entry that
1309                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1310                     # At the same time parent's media entry in set-level manifest may
1311                     # contain it. We will copy it from parent in such cases.
1312                     if len(f4m_formats) == 1:
1313                         f = f4m_formats[0]
1314                         f.update({
1315                             'tbr': f.get('tbr') or tbr,
1316                             'width': f.get('width') or width,
1317                             'height': f.get('height') or height,
1318                             'format_id': f.get('format_id') if not tbr else format_id,
1319                             'vcodec': vcodec,
1320                         })
1321                     formats.extend(f4m_formats)
1322                     continue
1323                 elif ext == 'm3u8':
1324                     formats.extend(self._extract_m3u8_formats(
1325                         manifest_url, video_id, 'mp4', preference=preference,
1326                         m3u8_id=m3u8_id, fatal=fatal))
1327                     continue
1328             formats.append({
1329                 'format_id': format_id,
1330                 'url': manifest_url,
1331                 'manifest_url': manifest_url,
1332                 'ext': 'flv' if bootstrap_info is not None else None,
1333                 'protocol': 'f4m',
1334                 'tbr': tbr,
1335                 'width': width,
1336                 'height': height,
1337                 'vcodec': vcodec,
1338                 'preference': preference,
1339             })
1340         return formats
1341
1342     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1343         return {
1344             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1345             'url': m3u8_url,
1346             'ext': ext,
1347             'protocol': 'm3u8',
1348             'preference': preference - 100 if preference else -100,
1349             'resolution': 'multiple',
1350             'format_note': 'Quality selection URL',
1351         }
1352
1353     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1354                               entry_protocol='m3u8', preference=None,
1355                               m3u8_id=None, note=None, errnote=None,
1356                               fatal=True, live=False):
1357         res = self._download_webpage_handle(
1358             m3u8_url, video_id,
1359             note=note or 'Downloading m3u8 information',
1360             errnote=errnote or 'Failed to download m3u8 information',
1361             fatal=fatal)
1362
1363         if res is False:
1364             return []
1365
1366         m3u8_doc, urlh = res
1367         m3u8_url = urlh.geturl()
1368
1369         return self._parse_m3u8_formats(
1370             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1371             preference=preference, m3u8_id=m3u8_id, live=live)
1372
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse formats from an m3u8 (HLS) playlist document.

        Returns [] for DRM-protected playlists (Adobe Flash Access, Apple
        FairPlay). A media playlist (one carrying #EXT-X-TARGETDURATION) is
        returned as a single format entry pointing at m3u8_url; a master
        playlist yields one format per variant stream plus one format for
        every audio/video rendition that exposes its own URI.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Resolve a possibly relative playlist entry against the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of #EXT-X-MEDIA attribute dicts belonging to that group.
        groups = {}
        # Attributes of the most recently seen #EXT-X-STREAM-INF tag; the
        # next non-tag line is the variant stream URI they apply to.
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                # A rendition with its own URI becomes a standalone format.
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Non-tag line: the URI of the variant stream described by the
                # preceding #EXT-X-STREAM-INF.
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing audio group an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1531
1532     @staticmethod
1533     def _xpath_ns(path, namespace=None):
1534         if not namespace:
1535             return path
1536         out = []
1537         for c in path.split('/'):
1538             if not c or c == '.':
1539                 out.append(c)
1540             else:
1541                 out.append('{%s}%s' % (namespace, c))
1542         return '/'.join(out)
1543
1544     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1545         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1546
1547         if smil is False:
1548             assert not fatal
1549             return []
1550
1551         namespace = self._parse_smil_namespace(smil)
1552
1553         return self._parse_smil_formats(
1554             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1555
1556     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1557         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1558         if smil is False:
1559             return {}
1560         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1561
1562     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1563         return self._download_xml(
1564             smil_url, video_id, 'Downloading SMIL file',
1565             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1566
1567     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1568         namespace = self._parse_smil_namespace(smil)
1569
1570         formats = self._parse_smil_formats(
1571             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1572         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1573
1574         video_id = os.path.splitext(url_basename(smil_url))[0]
1575         title = None
1576         description = None
1577         upload_date = None
1578         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1579             name = meta.attrib.get('name')
1580             content = meta.attrib.get('content')
1581             if not name or not content:
1582                 continue
1583             if not title and name == 'title':
1584                 title = content
1585             elif not description and name in ('description', 'abstract'):
1586                 description = content
1587             elif not upload_date and name == 'date':
1588                 upload_date = unified_strdate(content)
1589
1590         thumbnails = [{
1591             'id': image.get('type'),
1592             'url': image.get('src'),
1593             'width': int_or_none(image.get('width')),
1594             'height': int_or_none(image.get('height')),
1595         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1596
1597         return {
1598             'id': video_id,
1599             'title': title or video_id,
1600             'description': description,
1601             'upload_date': upload_date,
1602             'thumbnails': thumbnails,
1603             'formats': formats,
1604             'subtitles': subtitles,
1605         }
1606
1607     def _parse_smil_namespace(self, smil):
1608         return self._search_regex(
1609             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1610
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract formats from a parsed SMIL document.

        Sources come from <video> and <audio> elements; depending on the
        proto attribute and the source's extension, each yields RTMP,
        HLS (m3u8), HDS (f4m) or plain HTTP format entries.
        """
        # Base for resolving relative sources: the first head/meta carrying
        # base/httpBase wins, otherwise the manifest URL itself.
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        # Per-protocol counters used to build unique format_ids when no
        # bitrate is available.
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []  # already-seen sources, for de-duplication
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    # Allow the caller to rewrite the streamer/play_path pair.
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A lone media-playlist entry carries no quality metadata of
                # its own; copy it over from the SMIL medium.
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # Default HDS query parameters used by Flowplayer setups.
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            # NOTE(review): validates the raw src rather than the resolved
            # src_url -- confirm this is intentional.
            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        return formats
1704
1705     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1706         urls = []
1707         subtitles = {}
1708         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1709             src = textstream.get('src')
1710             if not src or src in urls:
1711                 continue
1712             urls.append(src)
1713             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1714             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1715             subtitles.setdefault(lang, []).append({
1716                 'url': src,
1717                 'ext': ext,
1718             })
1719         return subtitles
1720
1721     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1722         xspf = self._download_xml(
1723             xspf_url, playlist_id, 'Downloading xpsf playlist',
1724             'Unable to download xspf manifest', fatal=fatal)
1725         if xspf is False:
1726             return []
1727         return self._parse_xspf(
1728             xspf, playlist_id, xspf_url=xspf_url,
1729             xspf_base_url=base_url(xspf_url))
1730
1731     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1732         NS_MAP = {
1733             'xspf': 'http://xspf.org/ns/0/',
1734             's1': 'http://static.streamone.nl/player/ns/0',
1735         }
1736
1737         entries = []
1738         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1739             title = xpath_text(
1740                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1741             description = xpath_text(
1742                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1743             thumbnail = xpath_text(
1744                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1745             duration = float_or_none(
1746                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1747
1748             formats = []
1749             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1750                 format_url = urljoin(xspf_base_url, location.text)
1751                 if not format_url:
1752                     continue
1753                 formats.append({
1754                     'url': format_url,
1755                     'manifest_url': xspf_url,
1756                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1757                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1758                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1759                 })
1760             self._sort_formats(formats)
1761
1762             entries.append({
1763                 'id': playlist_id,
1764                 'title': title,
1765                 'description': description,
1766                 'thumbnail': thumbnail,
1767                 'duration': duration,
1768                 'formats': formats,
1769             })
1770         return entries
1771
1772     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1773         res = self._download_xml_handle(
1774             mpd_url, video_id,
1775             note=note or 'Downloading MPD manifest',
1776             errnote=errnote or 'Failed to download MPD manifest',
1777             fatal=fatal)
1778         if res is False:
1779             return []
1780         mpd_doc, urlh = res
1781         mpd_base_url = base_url(urlh.geturl())
1782
1783         return self._parse_mpd_formats(
1784             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1785             formats_dict=formats_dict, mpd_url=mpd_url)
1786
1787     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1788         """
1789         Parse formats from MPD manifest.
1790         References:
1791          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1792             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1793          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1794         """
1795         if mpd_doc.get('type') == 'dynamic':
1796             return []
1797
1798         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1799
1800         def _add_ns(path):
1801             return self._xpath_ns(path, namespace)
1802
1803         def is_drm_protected(element):
1804             return element.find(_add_ns('ContentProtection')) is not None
1805
1806         def extract_multisegment_info(element, ms_parent_info):
1807             ms_info = ms_parent_info.copy()
1808
1809             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1810             # common attributes and elements.  We will only extract relevant
1811             # for us.
1812             def extract_common(source):
1813                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1814                 if segment_timeline is not None:
1815                     s_e = segment_timeline.findall(_add_ns('S'))
1816                     if s_e:
1817                         ms_info['total_number'] = 0
1818                         ms_info['s'] = []
1819                         for s in s_e:
1820                             r = int(s.get('r', 0))
1821                             ms_info['total_number'] += 1 + r
1822                             ms_info['s'].append({
1823                                 't': int(s.get('t', 0)),
1824                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1825                                 'd': int(s.attrib['d']),
1826                                 'r': r,
1827                             })
1828                 start_number = source.get('startNumber')
1829                 if start_number:
1830                     ms_info['start_number'] = int(start_number)
1831                 timescale = source.get('timescale')
1832                 if timescale:
1833                     ms_info['timescale'] = int(timescale)
1834                 segment_duration = source.get('duration')
1835                 if segment_duration:
1836                     ms_info['segment_duration'] = float(segment_duration)
1837
1838             def extract_Initialization(source):
1839                 initialization = source.find(_add_ns('Initialization'))
1840                 if initialization is not None:
1841                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1842
1843             segment_list = element.find(_add_ns('SegmentList'))
1844             if segment_list is not None:
1845                 extract_common(segment_list)
1846                 extract_Initialization(segment_list)
1847                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1848                 if segment_urls_e:
1849                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1850             else:
1851                 segment_template = element.find(_add_ns('SegmentTemplate'))
1852                 if segment_template is not None:
1853                     extract_common(segment_template)
1854                     media = segment_template.get('media')
1855                     if media:
1856                         ms_info['media'] = media
1857                     initialization = segment_template.get('initialization')
1858                     if initialization:
1859                         ms_info['initialization'] = initialization
1860                     else:
1861                         extract_Initialization(segment_template)
1862             return ms_info
1863
1864         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1865         formats = []
1866         for period in mpd_doc.findall(_add_ns('Period')):
1867             period_duration = parse_duration(period.get('duration')) or mpd_duration
1868             period_ms_info = extract_multisegment_info(period, {
1869                 'start_number': 1,
1870                 'timescale': 1,
1871             })
1872             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1873                 if is_drm_protected(adaptation_set):
1874                     continue
1875                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1876                 for representation in adaptation_set.findall(_add_ns('Representation')):
1877                     if is_drm_protected(representation):
1878                         continue
1879                     representation_attrib = adaptation_set.attrib.copy()
1880                     representation_attrib.update(representation.attrib)
1881                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1882                     mime_type = representation_attrib['mimeType']
1883                     content_type = mime_type.split('/')[0]
1884                     if content_type == 'text':
1885                         # TODO implement WebVTT downloading
1886                         pass
1887                     elif content_type in ('video', 'audio'):
1888                         base_url = ''
1889                         for element in (representation, adaptation_set, period, mpd_doc):
1890                             base_url_e = element.find(_add_ns('BaseURL'))
1891                             if base_url_e is not None:
1892                                 base_url = base_url_e.text + base_url
1893                                 if re.match(r'^https?://', base_url):
1894                                     break
1895                         if mpd_base_url and not re.match(r'^https?://', base_url):
1896                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1897                                 mpd_base_url += '/'
1898                             base_url = mpd_base_url + base_url
1899                         representation_id = representation_attrib.get('id')
1900                         lang = representation_attrib.get('lang')
1901                         url_el = representation.find(_add_ns('BaseURL'))
1902                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1903                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1904                         f = {
1905                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1906                             'url': base_url,
1907                             'manifest_url': mpd_url,
1908                             'ext': mimetype2ext(mime_type),
1909                             'width': int_or_none(representation_attrib.get('width')),
1910                             'height': int_or_none(representation_attrib.get('height')),
1911                             'tbr': float_or_none(bandwidth, 1000),
1912                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1913                             'fps': int_or_none(representation_attrib.get('frameRate')),
1914                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1915                             'format_note': 'DASH %s' % content_type,
1916                             'filesize': filesize,
1917                             'container': mimetype2ext(mime_type) + '_dash',
1918                         }
1919                         f.update(parse_codecs(representation_attrib.get('codecs')))
1920                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1921
1922                         def prepare_template(template_name, identifiers):
1923                             t = representation_ms_info[template_name]
1924                             t = t.replace('$RepresentationID$', representation_id)
1925                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1926                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1927                             t.replace('$$', '$')
1928                             return t
1929
1930                         # @initialization is a regular template like @media one
1931                         # so it should be handled just the same way (see
1932                         # https://github.com/rg3/youtube-dl/issues/11605)
1933                         if 'initialization' in representation_ms_info:
1934                             initialization_template = prepare_template(
1935                                 'initialization',
1936                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1937                                 # $Time$ shall not be included for @initialization thus
1938                                 # only $Bandwidth$ remains
1939                                 ('Bandwidth', ))
1940                             representation_ms_info['initialization_url'] = initialization_template % {
1941                                 'Bandwidth': bandwidth,
1942                             }
1943
1944                         def location_key(location):
1945                             return 'url' if re.match(r'^https?://', location) else 'path'
1946
1947                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1948
1949                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1950                             media_location_key = location_key(media_template)
1951
1952                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1953                             # can't be used at the same time
1954                             if '%(Number' in media_template and 's' not in representation_ms_info:
1955                                 segment_duration = None
1956                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1957                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1958                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1959                                 representation_ms_info['fragments'] = [{
1960                                     media_location_key: media_template % {
1961                                         'Number': segment_number,
1962                                         'Bandwidth': bandwidth,
1963                                     },
1964                                     'duration': segment_duration,
1965                                 } for segment_number in range(
1966                                     representation_ms_info['start_number'],
1967                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1968                             else:
1969                                 # $Number*$ or $Time$ in media template with S list available
1970                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1971                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1972                                 representation_ms_info['fragments'] = []
1973                                 segment_time = 0
1974                                 segment_d = None
1975                                 segment_number = representation_ms_info['start_number']
1976
1977                                 def add_segment_url():
1978                                     segment_url = media_template % {
1979                                         'Time': segment_time,
1980                                         'Bandwidth': bandwidth,
1981                                         'Number': segment_number,
1982                                     }
1983                                     representation_ms_info['fragments'].append({
1984                                         media_location_key: segment_url,
1985                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1986                                     })
1987
1988                                 for num, s in enumerate(representation_ms_info['s']):
1989                                     segment_time = s.get('t') or segment_time
1990                                     segment_d = s['d']
1991                                     add_segment_url()
1992                                     segment_number += 1
1993                                     for r in range(s.get('r', 0)):
1994                                         segment_time += segment_d
1995                                         add_segment_url()
1996                                         segment_number += 1
1997                                     segment_time += segment_d
1998                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1999                             # No media template
2000                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2001                             # or any YouTube dashsegments video
2002                             fragments = []
2003                             segment_index = 0
2004                             timescale = representation_ms_info['timescale']
2005                             for s in representation_ms_info['s']:
2006                                 duration = float_or_none(s['d'], timescale)
2007                                 for r in range(s.get('r', 0) + 1):
2008                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2009                                     fragments.append({
2010                                         location_key(segment_uri): segment_uri,
2011                                         'duration': duration,
2012                                     })
2013                                     segment_index += 1
2014                             representation_ms_info['fragments'] = fragments
2015                         elif 'segment_urls' in representation_ms_info:
2016                             # Segment URLs with no SegmentTimeline
2017                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2018                             # https://github.com/rg3/youtube-dl/pull/14844
2019                             fragments = []
2020                             segment_duration = float_or_none(
2021                                 representation_ms_info['segment_duration'],
2022                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2023                             for segment_url in representation_ms_info['segment_urls']:
2024                                 fragment = {
2025                                     location_key(segment_url): segment_url,
2026                                 }
2027                                 if segment_duration:
2028                                     fragment['duration'] = segment_duration
2029                                 fragments.append(fragment)
2030                             representation_ms_info['fragments'] = fragments
2031                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2032                         # No fragments key is present in this case.
2033                         if 'fragments' in representation_ms_info:
2034                             f.update({
2035                                 'fragment_base_url': base_url,
2036                                 'fragments': [],
2037                                 'protocol': 'http_dash_segments',
2038                             })
2039                             if 'initialization_url' in representation_ms_info:
2040                                 initialization_url = representation_ms_info['initialization_url']
2041                                 if not f.get('url'):
2042                                     f['url'] = initialization_url
2043                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2044                             f['fragments'].extend(representation_ms_info['fragments'])
2045                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2046                         # is not necessarily unique within a Period thus formats with
2047                         # the same `format_id` are quite possible. There are numerous examples
2048                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2049                         # https://github.com/rg3/youtube-dl/issues/13919)
2050                         full_info = formats_dict.get(representation_id, {}).copy()
2051                         full_info.update(f)
2052                         formats.append(full_info)
2053                     else:
2054                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2055         return formats
2056
2057     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2058         res = self._download_xml_handle(
2059             ism_url, video_id,
2060             note=note or 'Downloading ISM manifest',
2061             errnote=errnote or 'Failed to download ISM manifest',
2062             fatal=fatal)
2063         if res is False:
2064             return []
2065         ism_doc, urlh = res
2066
2067         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2068
2069     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2070         """
2071         Parse formats from ISM manifest.
2072         References:
2073          1. [MS-SSTR]: Smooth Streaming Protocol,
2074             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2075         """
2076         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2077             return []
2078
2079         duration = int(ism_doc.attrib['Duration'])
2080         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2081
2082         formats = []
2083         for stream in ism_doc.findall('StreamIndex'):
2084             stream_type = stream.get('Type')
2085             if stream_type not in ('video', 'audio'):
2086                 continue
2087             url_pattern = stream.attrib['Url']
2088             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2089             stream_name = stream.get('Name')
2090             for track in stream.findall('QualityLevel'):
2091                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2092                 # TODO: add support for WVC1 and WMAP
2093                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2094                     self.report_warning('%s is not a supported codec' % fourcc)
2095                     continue
2096                 tbr = int(track.attrib['Bitrate']) // 1000
2097                 # [1] does not mention Width and Height attributes. However,
2098                 # they're often present while MaxWidth and MaxHeight are
2099                 # missing, so should be used as fallbacks
2100                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2101                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2102                 sampling_rate = int_or_none(track.get('SamplingRate'))
2103
2104                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2105                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2106
2107                 fragments = []
2108                 fragment_ctx = {
2109                     'time': 0,
2110                 }
2111                 stream_fragments = stream.findall('c')
2112                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2113                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2114                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2115                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2116                     if not fragment_ctx['duration']:
2117                         try:
2118                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2119                         except IndexError:
2120                             next_fragment_time = duration
2121                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2122                     for _ in range(fragment_repeat):
2123                         fragments.append({
2124                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2125                             'duration': fragment_ctx['duration'] / stream_timescale,
2126                         })
2127                         fragment_ctx['time'] += fragment_ctx['duration']
2128
2129                 format_id = []
2130                 if ism_id:
2131                     format_id.append(ism_id)
2132                 if stream_name:
2133                     format_id.append(stream_name)
2134                 format_id.append(compat_str(tbr))
2135
2136                 formats.append({
2137                     'format_id': '-'.join(format_id),
2138                     'url': ism_url,
2139                     'manifest_url': ism_url,
2140                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2141                     'width': width,
2142                     'height': height,
2143                     'tbr': tbr,
2144                     'asr': sampling_rate,
2145                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2146                     'acodec': 'none' if stream_type == 'video' else fourcc,
2147                     'protocol': 'ism',
2148                     'fragments': fragments,
2149                     '_download_params': {
2150                         'duration': duration,
2151                         'timescale': stream_timescale,
2152                         'width': width or 0,
2153                         'height': height or 0,
2154                         'fourcc': fourcc,
2155                         'codec_private_data': track.get('CodecPrivateData'),
2156                         'sampling_rate': sampling_rate,
2157                         'channels': int_or_none(track.get('Channels', 2)),
2158                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2159                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2160                     },
2161                 })
2162         return formats
2163
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags in webpage.

        Each entry is a dict with 'formats', 'subtitles' and 'thumbnail'
        keys; m3u8/mpd sources are expanded through the corresponding
        manifest extractors. Only tags that yield at least one format or
        subtitle produce an entry.
        """
        def absolute_url(item_url):
            # Resolve a possibly relative URL against the page URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Derive ext/codec fields from a MIME type string such as
            # 'video/mp4; codecs="avc1.4d401e, mp4a.40.2"'
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats): manifest URLs (m3u8/mpd) are
            # expanded into multiple formats, any other URL yields a single
            # direct format
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                # Formats from the tag's own src attribute
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # Formats from nested <source> tags
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # res attribute is not standard but seen several times
                        # in the wild
                        f.update({
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # Subtitles/captions from nested <track> tags
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2257
2258     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2259         formats = []
2260         hdcore_sign = 'hdcore=3.7.0'
2261         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2262         hds_host = hosts.get('hds')
2263         if hds_host:
2264             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2265         if 'hdcore=' not in f4m_url:
2266             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2267         f4m_formats = self._extract_f4m_formats(
2268             f4m_url, video_id, f4m_id='hds', fatal=False)
2269         for entry in f4m_formats:
2270             entry.update({'extra_param_to_segment_url': hdcore_sign})
2271         formats.extend(f4m_formats)
2272         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2273         hls_host = hosts.get('hls')
2274         if hls_host:
2275             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2276         formats.extend(self._extract_m3u8_formats(
2277             m3u8_url, video_id, 'mp4', 'm3u8_native',
2278             m3u8_id='hls', fatal=False))
2279         return formats
2280
2281     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2282         query = compat_urlparse.urlparse(url).query
2283         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2284         mobj = re.search(
2285             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2286         url_base = mobj.group('url')
2287         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2288         formats = []
2289
2290         def manifest_url(manifest):
2291             m_url = '%s/%s' % (http_base_url, manifest)
2292             if query:
2293                 m_url += '?%s' % query
2294             return m_url
2295
2296         if 'm3u8' not in skip_protocols:
2297             formats.extend(self._extract_m3u8_formats(
2298                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2299                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2300         if 'f4m' not in skip_protocols:
2301             formats.extend(self._extract_f4m_formats(
2302                 manifest_url('manifest.f4m'),
2303                 video_id, f4m_id='hds', fatal=False))
2304         if 'dash' not in skip_protocols:
2305             formats.extend(self._extract_mpd_formats(
2306                 manifest_url('manifest.mpd'),
2307                 video_id, mpd_id='dash', fatal=False))
2308         if re.search(r'(?:/smil:|\.smil)', url_base):
2309             if 'smil' not in skip_protocols:
2310                 rtmp_formats = self._extract_smil_formats(
2311                     manifest_url('jwplayer.smil'),
2312                     video_id, fatal=False)
2313                 for rtmp_format in rtmp_formats:
2314                     rtsp_format = rtmp_format.copy()
2315                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2316                     del rtsp_format['play_path']
2317                     del rtsp_format['ext']
2318                     rtsp_format.update({
2319                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2320                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2321                         'protocol': 'rtsp',
2322                     })
2323                     formats.extend([rtmp_format, rtsp_format])
2324         else:
2325             for protocol in ('rtmp', 'rtsp'):
2326                 if protocol not in skip_protocols:
2327                     formats.append({
2328                         'url': '%s:%s' % (protocol, url_base),
2329                         'format_id': protocol,
2330                         'protocol': protocol,
2331                     })
2332         return formats
2333
2334     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2335         mobj = re.search(
2336             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2337             webpage)
2338         if mobj:
2339             try:
2340                 jwplayer_data = self._parse_json(mobj.group('options'),
2341                                                  video_id=video_id,
2342                                                  transform_source=transform_source)
2343             except ExtractorError:
2344                 pass
2345             else:
2346                 if isinstance(jwplayer_data, dict):
2347                     return jwplayer_data
2348
2349     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2350         jwplayer_data = self._find_jwplayer_data(
2351             webpage, video_id, transform_source=js_to_json)
2352         return self._parse_jwplayer_data(
2353             jwplayer_data, video_id, *args, **kwargs)
2354
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer setup/config dict into an info dict.

        Returns the single entry dict when the playlist has exactly one
        item, otherwise a playlist result. When require_title is True a
        missing 'title' raises KeyError (direct indexing below).
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            # Falls back to the playlist item's own mediaid when no
            # video_id was passed in (KeyError if neither exists)
            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks keyed by label ('en' default)
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A single YouTube source is delegated to the YouTube extractor
            # via a url_transparent result instead of being listed directly
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2422
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer 'sources' list into a list of format dicts.

        Manifest sources (HLS/DASH/SMIL) are expanded through the
        corresponding extractors; other sources map to a single format.
        Duplicate source URLs are processed only once.
        """
        urls = []
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = self._proto_relative_url(source.get('file'))
            if not source_url:
                continue
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
            if source_url in urls:
                # Skip duplicate source URLs
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        # Split 'rtmp://host/app/mp4:path' into the RTMP URL
                        # and the play path (keeping the 'mp4:' prefix)
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2489
2490     def _live_title(self, name):
2491         """ Generate the title for a live video """
2492         now = datetime.datetime.now()
2493         now_str = now.strftime('%Y-%m-%d %H:%M')
2494         return name + ' ' + now_str
2495
2496     def _int(self, v, name, fatal=False, **kwargs):
2497         res = int_or_none(v, **kwargs)
2498         if 'get_attr' in kwargs:
2499             print(getattr(v, kwargs['get_attr']))
2500         if res is None:
2501             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2502             if fatal:
2503                 raise ExtractorError(msg)
2504             else:
2505                 self._downloader.report_warning(msg)
2506         return res
2507
2508     def _float(self, v, name, fatal=False, **kwargs):
2509         res = float_or_none(v, **kwargs)
2510         if res is None:
2511             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2512             if fatal:
2513                 raise ExtractorError(msg)
2514             else:
2515                 self._downloader.report_warning(msg)
2516         return res
2517
2518     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2519                     path='/', secure=False, discard=False, rest={}, **kwargs):
2520         cookie = compat_cookiejar.Cookie(
2521             0, name, value, port, port is not None, domain, True,
2522             domain.startswith('.'), path, True, secure, expire_time,
2523             discard, None, None, rest)
2524         self._downloader.cookiejar.set_cookie(cookie)
2525
2526     def _get_cookies(self, url):
2527         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2528         req = sanitized_Request(url)
2529         self._downloader.cookiejar.add_cookie_header(req)
2530         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2531
2532     def get_testcases(self, include_onlymatching=False):
2533         t = getattr(self, '_TEST', None)
2534         if t:
2535             assert not hasattr(self, '_TESTS'), \
2536                 '%s has _TEST and _TESTS' % type(self).__name__
2537             tests = [t]
2538         else:
2539             tests = getattr(self, '_TESTS', [])
2540         for t in tests:
2541             if not include_onlymatching and t.get('only_matching', False):
2542                 continue
2543             t['name'] = type(self).__name__[:-len('IE')]
2544             yield t
2545
2546     def is_suitable(self, age_limit):
2547         """ Test whether the extractor is generally suitable for the given
2548         age limit (i.e. pornographic sites are not, all others usually are) """
2549
2550         any_restricted = False
2551         for tc in self.get_testcases(include_onlymatching=False):
2552             if tc.get('playlist', []):
2553                 tc = tc['playlist'][0]
2554             is_restricted = age_restricted(
2555                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2556             if not is_restricted:
2557                 return True
2558             any_restricted = any_restricted or is_restricted
2559         return not any_restricted
2560
2561     def extract_subtitles(self, *args, **kwargs):
2562         if (self._downloader.params.get('writesubtitles', False) or
2563                 self._downloader.params.get('listsubtitles')):
2564             return self._get_subtitles(*args, **kwargs)
2565         return {}
2566
2567     def _get_subtitles(self, *args, **kwargs):
2568         raise NotImplementedError('This method must be implemented by subclasses')
2569
2570     @staticmethod
2571     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2572         """ Merge subtitle items for one language. Items with duplicated URLs
2573         will be dropped. """
2574         list1_urls = set([item['url'] for item in subtitle_list1])
2575         ret = list(subtitle_list1)
2576         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2577         return ret
2578
2579     @classmethod
2580     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2581         """ Merge two subtitle dictionaries, language by language. """
2582         ret = dict(subtitle_dict1)
2583         for lang in subtitle_dict2:
2584             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2585         return ret
2586
2587     def extract_automatic_captions(self, *args, **kwargs):
2588         if (self._downloader.params.get('writeautomaticsub', False) or
2589                 self._downloader.params.get('listsubtitles')):
2590             return self._get_automatic_captions(*args, **kwargs)
2591         return {}
2592
2593     def _get_automatic_captions(self, *args, **kwargs):
2594         raise NotImplementedError('This method must be implemented by subclasses')
2595
2596     def mark_watched(self, *args, **kwargs):
2597         if (self._downloader.params.get('mark_watched', False) and
2598                 (self._get_login_info()[0] is not None or
2599                     self._downloader.params.get('cookiefile') is not None)):
2600             self._mark_watched(*args, **kwargs)
2601
2602     def _mark_watched(self, *args, **kwargs):
2603         raise NotImplementedError('This method must be implemented by subclasses')
2604
2605     def geo_verification_headers(self):
2606         headers = {}
2607         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2608         if geo_verification_proxy:
2609             headers['Ytdl-request-proxy'] = geo_verification_proxy
2610         return headers
2611
2612     def _generic_id(self, url):
2613         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2614
2615     def _generic_title(self, url):
2616         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2617
2618
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (default of 1), a positive number, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """A URL is suitable iff it matches the search-key pattern."""
        return bool(re.match(cls._make_valid_url(), url))

    def _real_extract(self, query):
        """Dispatch the search query to _get_n_results with the result
        count decoded from the prefix ('' -> 1, 'all' -> _MAX_RESULTS,
        otherwise the given number capped at _MAX_RESULTS)."""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = match.group('prefix')
        term = match.group('query')
        if not prefix:
            return self._get_n_results(term, 1)
        if prefix == 'all':
            return self._get_n_results(term, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, term))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(term, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY