_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30     compat_xml_parse_error,
  31 )
  32 from ..downloader.f4m import remove_encrypted_media
  33 from ..utils import (
  34     NO_DEFAULT,
  35     age_restricted,
  36     base_url,
  37     bug_reports_message,
  38     clean_html,
  39     compiled_regex_type,
  40     determine_ext,
  41     determine_protocol,
  42     error_to_compat_str,
  43     ExtractorError,
  44     extract_attributes,
  45     fix_xml_ampersands,
  46     float_or_none,
  47     GeoRestrictedError,
  48     GeoUtils,
  49     int_or_none,
  50     js_to_json,
  51     mimetype2ext,
  52     orderedSet,
  53     parse_codecs,
  54     parse_duration,
  55     parse_iso8601,
  56     parse_m3u8_attributes,
  57     RegexNotFoundError,
  58     sanitized_Request,
  59     sanitize_filename,
  60     unescapeHTML,
  61     unified_strdate,
  62     unified_timestamp,
  63     update_Request,
  64     update_url_query,
  65     urljoin,
  66     url_basename,
  67     xpath_element,
  68     xpath_text,
  69     xpath_with_ns,
  70 )
  71
  72
  73 class InfoExtractor(object):
  74     """Information Extractor class.
  75
  76     Information extractors are the classes that, given a URL, extract
  77     information about the video (or videos) the URL refers to. This
  78     information includes the real video URL, the video title, author and
  79     others. The information is stored in a dictionary which is then
  80     passed to the YoutubeDL. The YoutubeDL processes this
  81     information possibly downloading the video to the file system, among
  82     other possible outcomes.
  83
  84     The type field determines the type of the result.
  85     By far the most common value (and the default if _type is missing) is
  86     "video", which indicates a single video.
  87
  88     For a video, the dictionaries must include the following fields:
  89
  90     id:             Video identifier.
  91     title:          Video title, unescaped.
  92
  93     Additionally, it must contain either a formats entry or a url one:
  94
  95     formats:        A list of dictionaries for each format available, ordered
  96                     from worst to best quality.
  97
  98                     Potential fields:
  99                     * url        Mandatory. The URL of the video file
 100                     * manifest_url
 101                                  The URL of the manifest file in case of
 102                                  fragmented media (DASH, hls, hds)
 103                     * ext        Will be calculated from URL if missing
 104                     * format     A human-readable description of the format
 105                                  ("mp4 container with h264/opus").
 106                                  Calculated from the format_id, width, height.
 107                                  and format_note fields if missing.
 108                     * format_id  A short description of the format
 109                                  ("mp4_h264_opus" or "19").
 110                                 Technically optional, but strongly recommended.
 111                     * format_note Additional info about the format
 112                                  ("3D" or "DASH video")
 113                     * width      Width of the video, if known
 114                     * height     Height of the video, if known
 115                     * resolution Textual description of width and height
 116                     * tbr        Average bitrate of audio and video in KBit/s
 117                     * abr        Average audio bitrate in KBit/s
 118                     * acodec     Name of the audio codec in use
 119                     * asr        Audio sampling rate in Hertz
 120                     * vbr        Average video bitrate in KBit/s
 121                     * fps        Frame rate
 122                     * vcodec     Name of the video codec in use
 123                     * container  Name of the container format
 124                     * filesize   The number of bytes, if known in advance
 125                     * filesize_approx  An estimate for the number of bytes
 126                     * player_url SWF Player URL (used for rtmpdump).
 127                     * protocol   The protocol that will be used for the actual
 128                                  download, lower-case.
 129                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 130                                  "m3u8", "m3u8_native" or "http_dash_segments".
 131                     * fragment_base_url
 132                                  Base URL for fragments. Each fragment's path
 133                                  value (if present) will be relative to
 134                                  this URL.
 135                     * fragments  A list of fragments of a fragmented media.
 136                                  Each fragment entry must contain either an url
 137                                  or a path. If an url is present it should be
 138                                  considered by a client. Otherwise both path and
 139                                  fragment_base_url must be present. Here is
 140                                  the list of all potential fields:
 141                                  * "url" - fragment's URL
 142                                  * "path" - fragment's path relative to
 143                                             fragment_base_url
 144                                  * "duration" (optional, int or float)
 145                                  * "filesize" (optional, int)
 146                     * preference Order number of this format. If this field is
 147                                  present and not None, the formats get sorted
 148                                  by this field, regardless of all other values.
 149                                  -1 for default (order by other properties),
 150                                  -2 or smaller for less than default.
 151                                  < -1000 to hide the format (if there is
 152                                     another one which is strictly better)
 153                     * language   Language code, e.g. "de" or "en-US".
 154                     * language_preference  Is this in the language mentioned in
 155                                  the URL?
 156                                  10 if it's what the URL is about,
 157                                  -1 for default (don't know),
 158                                  -10 otherwise, other values reserved for now.
 159                     * quality    Order number of the video quality of this
 160                                  format, irrespective of the file format.
 161                                  -1 for default (order by other properties),
 162                                  -2 or smaller for less than default.
 163                     * source_preference  Order number for this video source
 164                                   (quality takes higher priority)
 165                                  -1 for default (order by other properties),
 166                                  -2 or smaller for less than default.
 167                     * http_headers  A dictionary of additional HTTP headers
 168                                  to add to the request.
 169                     * stretched_ratio  If given and not 1, indicates that the
 170                                  video's pixels are not square.
 171                                  width : height ratio as float.
 172                     * no_resume  The server does not support resuming the
 173                                  (HTTP or RTMP) download. Boolean.
 174
 175     url:            Final video URL.
 176     ext:            Video filename extension.
 177     format:         The video format, defaults to ext (used for --get-format)
 178     player_url:     SWF Player URL (used for rtmpdump).
 179
 180     The following fields are optional:
 181
 182     alt_title:      A secondary title of the video.
 183     display_id      An alternative identifier for the video, not necessarily
 184                     unique, but available before title. Typically, id is
 185                     something like "4234987", title "Dancing naked mole rats",
 186                     and display_id "dancing-naked-mole-rats"
 187     thumbnails:     A list of dictionaries, with the following entries:
 188                         * "id" (optional, string) - Thumbnail format ID
 189                         * "url"
 190                         * "preference" (optional, int) - quality of the image
 191                         * "width" (optional, int)
 192                         * "height" (optional, int)
  193                         * "resolution" (optional, string "{width}x{height}",
 194                                         deprecated)
 195                         * "filesize" (optional, int)
 196     thumbnail:      Full URL to a video thumbnail image.
 197     description:    Full video description.
 198     uploader:       Full name of the video uploader.
 199     license:        License name the video is licensed under.
 200     creator:        The creator of the video.
 201     release_date:   The date (YYYYMMDD) when the video was released.
 202     timestamp:      UNIX timestamp of the moment the video became available.
 203     upload_date:    Video upload date (YYYYMMDD).
 204                     If not explicitly set, calculated from timestamp.
 205     uploader_id:    Nickname or id of the video uploader.
 206     uploader_url:   Full URL to a personal webpage of the video uploader.
 207     location:       Physical location where the video was filmed.
 208     subtitles:      The available subtitles as a dictionary in the format
 209                     {tag: subformats}. "tag" is usually a language code, and
 210                     "subformats" is a list sorted from lower to higher
 211                     preference, each element is a dictionary with the "ext"
 212                     entry and one of:
 213                         * "data": The subtitles file contents
 214                         * "url": A URL pointing to the subtitles file
 215                     "ext" will be calculated from URL if missing
 216     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 217                     automatically generated captions
 218     duration:       Length of the video in seconds, as an integer or float.
 219     view_count:     How many users have watched the video on the platform.
 220     like_count:     Number of positive ratings of the video
 221     dislike_count:  Number of negative ratings of the video
 222     repost_count:   Number of reposts of the video
  223     average_rating: Average rating given by users, the scale used depends on the webpage
 224     comment_count:  Number of comments on the video
 225     comments:       A list of comments, each with one or more of the following
 226                     properties (all but one of text or html optional):
 227                         * "author" - human-readable name of the comment author
 228                         * "author_id" - user ID of the comment author
 229                         * "id" - Comment ID
 230                         * "html" - Comment as HTML
 231                         * "text" - Plain text of the comment
 232                         * "timestamp" - UNIX timestamp of comment
 233                         * "parent" - ID of the comment this one is replying to.
 234                                      Set to "root" to indicate that this is a
 235                                      comment to the original video.
 236     age_limit:      Age restriction for the video, as an integer (years)
 237     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 238                     should allow to get the same result again. (It will be set
 239                     by YoutubeDL if it's missing)
 240     categories:     A list of categories that the video falls in, for example
 241                     ["Sports", "Berlin"]
 242     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 243     is_live:        True, False, or None (=unknown). Whether this video is a
 244                     live stream that goes on instead of a fixed-length video.
 245     start_time:     Time in seconds where the reproduction should start, as
 246                     specified in the URL.
 247     end_time:       Time in seconds where the reproduction should end, as
 248                     specified in the URL.
 249     chapters:       A list of dictionaries, with the following entries:
 250                         * "start_time" - The start time of the chapter in seconds
 251                         * "end_time" - The end time of the chapter in seconds
 252                         * "title" (optional, string)
 253
 254     The following fields should only be used when the video belongs to some logical
 255     chapter or section:
 256
 257     chapter:        Name or title of the chapter the video belongs to.
 258     chapter_number: Number of the chapter the video belongs to, as an integer.
 259     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 260
 261     The following fields should only be used when the video is an episode of some
 262     series, programme or podcast:
 263
 264     series:         Title of the series or programme the video episode belongs to.
 265     season:         Title of the season the video episode belongs to.
 266     season_number:  Number of the season the video episode belongs to, as an integer.
 267     season_id:      Id of the season the video episode belongs to, as a unicode string.
 268     episode:        Title of the video episode. Unlike mandatory video title field,
 269                     this field should denote the exact title of the video episode
 270                     without any kind of decoration.
 271     episode_number: Number of the video episode within a season, as an integer.
 272     episode_id:     Id of the video episode, as a unicode string.
 273
 274     The following fields should only be used when the media is a track or a part of
 275     a music album:
 276
 277     track:          Title of the track.
 278     track_number:   Number of the track within an album or a disc, as an integer.
 279     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 280                     as a unicode string.
 281     artist:         Artist(s) of the track.
 282     genre:          Genre(s) of the track.
 283     album:          Title of the album the track belongs to.
 284     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 285     album_artist:   List of all artists appeared on the album (e.g.
 286                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 287                     and compilations).
 288     disc_number:    Number of the disc or other physical medium the track belongs to,
 289                     as an integer.
 290     release_year:   Year (YYYY) when the album was released.
 291
 292     Unless mentioned otherwise, the fields should be Unicode strings.
 293
 294     Unless mentioned otherwise, None is equivalent to absence of information.
 295
 296
 297     _type "playlist" indicates multiple videos.
 298     There must be a key "entries", which is a list, an iterable, or a PagedList
 299     object, each element of which is a valid dictionary by this specification.
 300
 301     Additionally, playlists can have "title", "description" and "id" attributes
 302     with the same semantics as videos (see above).
 303
 304
 305     _type "multi_video" indicates that there are multiple videos that
  306     form a single show, for example multiple acts of an opera or TV episode.
 307     It must have an entries key like a playlist and contain all the keys
 308     required for a video at the same time.
 309
 310
 311     _type "url" indicates that the video must be extracted from another
 312     location, possibly by a different extractor. Its only required key is:
 313     "url" - the next URL to extract.
 314     The key "ie_key" can be set to the class name (minus the trailing "IE",
 315     e.g. "Youtube") if the extractor class is known in advance.
 316     Additionally, the dictionary may have any properties of the resolved entity
 317     known in advance, for example "title" if the title of the referred video is
 318     known ahead of time.
 319
 320
 321     _type "url_transparent" entities have the same specification as "url", but
 322     indicate that the given additional information is more precise than the one
 323     associated with the resolved URL.
 324     This is useful when a site employs a video service that hosts the video and
 325     its technical metadata, but that video service does not embed a useful
 326     title, description etc.
 327
 328
 329     Subclasses of this one should re-define the _real_initialize() and
 330     _real_extract() methods and define a _VALID_URL regexp.
 331     Probably, they should also be added to the list of extractors.
 332
 333     _GEO_BYPASS attribute may be set to False in order to disable
 334     geo restriction bypass mechanisms for a particular extractor.
 335     Though it won't disable explicit geo restriction bypass based on
 336     country code provided with geo_bypass_country. (experimental)
 337
 338     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 339     countries for this extractor. One of these countries will be used by
 340     geo restriction bypass mechanism right away in order to bypass
 341     geo restriction, of course, if the mechanism is not disabled. (experimental)
 342
 343     NB: both these geo attributes are experimental and may change in future
 344     or be completely removed.
 345
 346     Finally, the _WORKING attribute should be set to False for broken IEs
 347     in order to warn the users and skip the tests.
 348     """
 349
 350     _ready = False
 351     _downloader = None
 352     _x_forwarded_for_ip = None
 353     _GEO_BYPASS = True
 354     _GEO_COUNTRIES = None
 355     _WORKING = True
 356
 357     def __init__(self, downloader=None):
 358         """Constructor. Receives an optional downloader."""
 359         self._ready = False
 360         self._x_forwarded_for_ip = None
 361         self.set_downloader(downloader)
 362
 363     @classmethod
 364     def suitable(cls, url):
 365         """Receives a URL and returns True if suitable for this IE."""
 366
 367         # This does not use has/getattr intentionally - we want to know whether
 368         # we have cached the regexp for *this* class, whereas getattr would also
 369         # match the superclass
 370         if '_VALID_URL_RE' not in cls.__dict__:
 371             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 372         return cls._VALID_URL_RE.match(url) is not None
 373
 374     @classmethod
 375     def _match_id(cls, url):
 376         if '_VALID_URL_RE' not in cls.__dict__:
 377             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 378         m = cls._VALID_URL_RE.match(url)
 379         assert m
 380         return compat_str(m.group('id'))
 381
 382     @classmethod
 383     def working(cls):
 384         """Getter method for _WORKING."""
 385         return cls._WORKING
 386
 387     def initialize(self):
 388         """Initializes an instance (authentication, etc)."""
 389         self._initialize_geo_bypass(self._GEO_COUNTRIES)
 390         if not self._ready:
 391             self._real_initialize()
 392             self._ready = True
 393
 394     def _initialize_geo_bypass(self, countries):
 395         """
 396         Initialize geo restriction bypass mechanism.
 397
 398         This method is used to initialize geo bypass mechanism based on faking
 399         X-Forwarded-For HTTP header. A random country from provided country list
 400         is selected and a random IP belonging to this country is generated. This
 401         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 402         HTTP requests.
 403
 404         This method will be used for initial geo bypass mechanism initialization
 405         during the instance initialization with _GEO_COUNTRIES.
 406
 407         You may also manually call it from extractor's code if geo countries
 408         information is not available beforehand (e.g. obtained during
 409         extraction) or due to some another reason.
 410         """
 411         if not self._x_forwarded_for_ip:
 412             country_code = self._downloader.params.get('geo_bypass_country', None)
 413             # If there is no explicit country for geo bypass specified and
 414             # the extractor is known to be geo restricted let's fake IP
 415             # as X-Forwarded-For right away.
 416             if (not country_code and
 417                     self._GEO_BYPASS and
 418                     self._downloader.params.get('geo_bypass', True) and
 419                     countries):
 420                 country_code = random.choice(countries)
 421             if country_code:
 422                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 423                 if self._downloader.params.get('verbose', False):
 424                     self._downloader.to_screen(
 425                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 426                         % (self._x_forwarded_for_ip, country_code.upper()))
 427
 428     def extract(self, url):
 429         """Extracts URL information and returns it in list of dicts."""
 430         try:
 431             for _ in range(2):
 432                 try:
 433                     self.initialize()
 434                     ie_result = self._real_extract(url)
 435                     if self._x_forwarded_for_ip:
 436                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 437                     return ie_result
 438                 except GeoRestrictedError as e:
 439                     if self.__maybe_fake_ip_and_retry(e.countries):
 440                         continue
 441                     raise
 442         except ExtractorError:
 443             raise
 444         except compat_http_client.IncompleteRead as e:
 445             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 446         except (KeyError, StopIteration) as e:
 447             raise ExtractorError('An extractor error has occurred.', cause=e)
 448
 449     def __maybe_fake_ip_and_retry(self, countries):
 450         if (not self._downloader.params.get('geo_bypass_country', None) and
 451                 self._GEO_BYPASS and
 452                 self._downloader.params.get('geo_bypass', True) and
 453                 not self._x_forwarded_for_ip and
 454                 countries):
 455             country_code = random.choice(countries)
 456             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 457             if self._x_forwarded_for_ip:
 458                 self.report_warning(
 459                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 460                     % (self._x_forwarded_for_ip, country_code.upper()))
 461                 return True
 462         return False
 463
 464     def set_downloader(self, downloader):
 465         """Sets the downloader for this IE."""
 466         self._downloader = downloader
 467
 468     def _real_initialize(self):
 469         """Real initialization process. Redefine in subclasses."""
 470         pass
 471
 472     def _real_extract(self, url):
 473         """Real extraction process. Redefine in subclasses."""
 474         pass
 475
 476     @classmethod
 477     def ie_key(cls):
 478         """A string for getting the InfoExtractor with get_info_extractor"""
 479         return compat_str(cls.__name__[:-2])
 480
 481     @property
 482     def IE_NAME(self):
 483         return compat_str(type(self).__name__[:-2])
 484
 485     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 486         """ Returns the response handle """
 487         if note is None:
 488             self.report_download_webpage(video_id)
 489         elif note is not False:
 490             if video_id is None:
 491                 self.to_screen('%s' % (note,))
 492             else:
 493                 self.to_screen('%s: %s' % (video_id, note))
 494         if isinstance(url_or_request, compat_urllib_request.Request):
 495             url_or_request = update_Request(
 496                 url_or_request, data=data, headers=headers, query=query)
 497         else:
 498             if query:
 499                 url_or_request = update_url_query(url_or_request, query)
 500             if data is not None or headers:
 501                 url_or_request = sanitized_Request(url_or_request, data, headers)
 502         try:
 503             return self._downloader.urlopen(url_or_request)
 504         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 505             if errnote is False:
 506                 return False
 507             if errnote is None:
 508                 errnote = 'Unable to download webpage'
 509
 510             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 511             if fatal:
 512                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 513             else:
 514                 self._downloader.report_warning(errmsg)
 515                 return False
 516
 517     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 518         """ Returns a tuple (page content as string, URL handle) """
 519         # Strip hashes from the URL (#1038)
 520         if isinstance(url_or_request, (compat_str, str)):
 521             url_or_request = url_or_request.partition('#')[0]
 522
 523         # Some sites check X-Forwarded-For HTTP header in order to figure out
 524         # the origin of the client behind proxy. This allows bypassing geo
 525         # restriction by faking this header's value to IP that belongs to some
 526         # geo unrestricted country. We will do so once we encounter any
 527         # geo restriction error.
 528         if self._x_forwarded_for_ip:
 529             if 'X-Forwarded-For' not in headers:
 530                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 531
 532         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 533         if urlh is False:
 534             assert not fatal
 535             return False
 536         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 537         return (content, urlh)
 538
 539     @staticmethod
 540     def _guess_encoding_from_content(content_type, webpage_bytes):
 541         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 542         if m:
 543             encoding = m.group(1)
 544         else:
 545             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 546                           webpage_bytes[:1024])
 547             if m:
 548                 encoding = m.group(1).decode('ascii')
 549             elif webpage_bytes.startswith(b'\xff\xfe'):
 550                 encoding = 'utf-16'
 551             else:
 552                 encoding = 'utf-8'
 553
 554         return encoding
 555
 556     def __check_blocked(self, content):
 557         first_block = content[:512]
 558         if ('<title>Access to this site is blocked</title>' in content and
 559                 'Websense' in first_block):
 560             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 561             blocked_iframe = self._html_search_regex(
 562                 r'<iframe src="([^"]+)"', content,
 563                 'Websense information URL', default=None)
 564             if blocked_iframe:
 565                 msg += ' Visit %s for more details' % blocked_iframe
 566             raise ExtractorError(msg, expected=True)
 567         if '<title>The URL you requested has been blocked</title>' in first_block:
 568             msg = (
 569                 'Access to this webpage has been blocked by Indian censorship. '
 570                 'Use a VPN or proxy server (with --proxy) to route around it.')
 571             block_msg = self._html_search_regex(
 572                 r'</h1><p>(.*?)</p>',
 573                 content, 'block message', default=None)
 574             if block_msg:
 575                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 576             raise ExtractorError(msg, expected=True)
 577         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 578                 'blocklist.rkn.gov.ru' in content):
 579             raise ExtractorError(
 580                 'Access to this webpage has been blocked by decision of the Russian government. '
 581                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 582                 expected=True)
 583
 584     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 585         content_type = urlh.headers.get('Content-Type', '')
 586         webpage_bytes = urlh.read()
 587         if prefix is not None:
 588             webpage_bytes = prefix + webpage_bytes
 589         if not encoding:
 590             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 591         if self._downloader.params.get('dump_intermediate_pages', False):
 592             try:
 593                 url = url_or_request.get_full_url()
 594             except AttributeError:
 595                 url = url_or_request
 596             self.to_screen('Dumping request to ' + url)
 597             dump = base64.b64encode(webpage_bytes).decode('ascii')
 598             self._downloader.to_screen(dump)
 599         if self._downloader.params.get('write_pages', False):
 600             try:
 601                 url = url_or_request.get_full_url()
 602             except AttributeError:
 603                 url = url_or_request
 604             basen = '%s_%s' % (video_id, url)
 605             if len(basen) > 240:
 606                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 607                 basen = basen[:240 - len(h)] + h
 608             raw_filename = basen + '.dump'
 609             filename = sanitize_filename(raw_filename, restricted=True)
 610             self.to_screen('Saving request to ' + filename)
 611             # Working around MAX_PATH limitation on Windows (see
 612             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 613             if compat_os_name == 'nt':
 614                 absfilepath = os.path.abspath(filename)
 615                 if len(absfilepath) > 259:
 616                     filename = '\\\\?\\' + absfilepath
 617             with open(filename, 'wb') as outf:
 618                 outf.write(webpage_bytes)
 619
 620         try:
 621             content = webpage_bytes.decode(encoding, 'replace')
 622         except LookupError:
 623             content = webpage_bytes.decode('utf-8', 'replace')
 624
 625         self.__check_blocked(content)
 626
 627         return content
 628
 629     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 630         """ Returns the data of the page as a string """
 631         success = False
 632         try_count = 0
 633         while success is False:
 634             try:
 635                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 636                 success = True
 637             except compat_http_client.IncompleteRead as e:
 638                 try_count += 1
 639                 if try_count >= tries:
 640                     raise e
 641                 self._sleep(timeout, video_id)
 642         if res is False:
 643             return res
 644         else:
 645             content, _ = res
 646             return content
 647
 648     def _download_xml(self, url_or_request, video_id,
 649                       note='Downloading XML', errnote='Unable to download XML',
 650                       transform_source=None, fatal=True, encoding=None,
 651                       data=None, headers={}, query={}):
 652         """Return the xml as an xml.etree.ElementTree.Element"""
 653         xml_string = self._download_webpage(
 654             url_or_request, video_id, note, errnote, fatal=fatal,
 655             encoding=encoding, data=data, headers=headers, query=query)
 656         if xml_string is False:
 657             return xml_string
 658         return self._parse_xml(
 659             xml_string, video_id, transform_source=transform_source,
 660             fatal=fatal)
 661
 662     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 663         if transform_source:
 664             xml_string = transform_source(xml_string)
 665         try:
 666             return compat_etree_fromstring(xml_string.encode('utf-8'))
 667         except compat_xml_parse_error as ve:
 668             errmsg = '%s: Failed to parse XML ' % video_id
 669             if fatal:
 670                 raise ExtractorError(errmsg, cause=ve)
 671             else:
 672                 self.report_warning(errmsg + str(ve))
 673
 674     def _download_json(self, url_or_request, video_id,
 675                        note='Downloading JSON metadata',
 676                        errnote='Unable to download JSON metadata',
 677                        transform_source=None,
 678                        fatal=True, encoding=None, data=None, headers={}, query={}):
 679         json_string = self._download_webpage(
 680             url_or_request, video_id, note, errnote, fatal=fatal,
 681             encoding=encoding, data=data, headers=headers, query=query)
 682         if (not fatal) and json_string is False:
 683             return None
 684         return self._parse_json(
 685             json_string, video_id, transform_source=transform_source, fatal=fatal)
 686
 687     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 688         if transform_source:
 689             json_string = transform_source(json_string)
 690         try:
 691             return json.loads(json_string)
 692         except ValueError as ve:
 693             errmsg = '%s: Failed to parse JSON ' % video_id
 694             if fatal:
 695                 raise ExtractorError(errmsg, cause=ve)
 696             else:
 697                 self.report_warning(errmsg + str(ve))
 698
 699     def report_warning(self, msg, video_id=None):
 700         idstr = '' if video_id is None else '%s: ' % video_id
 701         self._downloader.report_warning(
 702             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 703
 704     def to_screen(self, msg):
 705         """Print msg to screen, prefixing it with '[ie_name]'"""
 706         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 707
 708     def report_extraction(self, id_or_name):
 709         """Report information extraction."""
 710         self.to_screen('%s: Extracting information' % id_or_name)
 711
 712     def report_download_webpage(self, video_id):
 713         """Report webpage download."""
 714         self.to_screen('%s: Downloading webpage' % video_id)
 715
 716     def report_age_confirmation(self):
 717         """Report attempt to confirm age."""
 718         self.to_screen('Confirming age')
 719
 720     def report_login(self):
 721         """Report attempt to log in."""
 722         self.to_screen('Logging in')
 723
 724     @staticmethod
 725     def raise_login_required(msg='This video is only available for registered users'):
 726         raise ExtractorError(
 727             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 728             expected=True)
 729
 730     @staticmethod
 731     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 732         raise GeoRestrictedError(msg, countries=countries)
 733
 734     # Methods for following #608
 735     @staticmethod
 736     def url_result(url, ie=None, video_id=None, video_title=None):
 737         """Returns a URL that points to a page that should be processed"""
 738         # TODO: ie should be the class used for getting the info
 739         video_info = {'_type': 'url',
 740                       'url': url,
 741                       'ie_key': ie}
 742         if video_id is not None:
 743             video_info['id'] = video_id
 744         if video_title is not None:
 745             video_info['title'] = video_title
 746         return video_info
 747
 748     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 749         urls = orderedSet(
 750             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 751             for m in matches)
 752         return self.playlist_result(
 753             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 754
 755     @staticmethod
 756     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 757         """Returns a playlist"""
 758         video_info = {'_type': 'playlist',
 759                       'entries': entries}
 760         if playlist_id:
 761             video_info['id'] = playlist_id
 762         if playlist_title:
 763             video_info['title'] = playlist_title
 764         if playlist_description:
 765             video_info['description'] = playlist_description
 766         return video_info
 767
 768     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 769         """
 770         Perform a regex search on the given string, using a single or a list of
 771         patterns returning the first matching group.
 772         In case of failure return a default value or raise a WARNING or a
 773         RegexNotFoundError, depending on fatal, specifying the field name.
 774         """
 775         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 776             mobj = re.search(pattern, string, flags)
 777         else:
 778             for p in pattern:
 779                 mobj = re.search(p, string, flags)
 780                 if mobj:
 781                     break
 782
 783         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 784             _name = '\033[0;34m%s\033[0m' % name
 785         else:
 786             _name = name
 787
 788         if mobj:
 789             if group is None:
 790                 # return the first matching group
 791                 return next(g for g in mobj.groups() if g is not None)
 792             else:
 793                 return mobj.group(group)
 794         elif default is not NO_DEFAULT:
 795             return default
 796         elif fatal:
 797             raise RegexNotFoundError('Unable to extract %s' % _name)
 798         else:
 799             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 800             return None
 801
 802     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 803         """
 804         Like _search_regex, but strips HTML tags and unescapes entities.
 805         """
 806         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 807         if res:
 808             return clean_html(res).strip()
 809         else:
 810             return res
 811
 812     def _get_netrc_login_info(self, netrc_machine=None):
 813         username = None
 814         password = None
 815         netrc_machine = netrc_machine or self._NETRC_MACHINE
 816
 817         if self._downloader.params.get('usenetrc', False):
 818             try:
 819                 info = netrc.netrc().authenticators(netrc_machine)
 820                 if info is not None:
 821                     username = info[0]
 822                     password = info[2]
 823                 else:
 824                     raise netrc.NetrcParseError(
 825                         'No authenticators for %s' % netrc_machine)
 826             except (IOError, netrc.NetrcParseError) as err:
 827                 self._downloader.report_warning(
 828                     'parsing .netrc: %s' % error_to_compat_str(err))
 829
 830         return username, password
 831
 832     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 833         """
 834         Get the login info as (username, password)
 835         First look for the manually specified credentials using username_option
 836         and password_option as keys in params dictionary. If no such credentials
 837         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 838         value.
 839         If there's no info available, return (None, None)
 840         """
 841         if self._downloader is None:
 842             return (None, None)
 843
 844         downloader_params = self._downloader.params
 845
 846         # Attempt to use provided username and password or .netrc data
 847         if downloader_params.get(username_option) is not None:
 848             username = downloader_params[username_option]
 849             password = downloader_params[password_option]
 850         else:
 851             username, password = self._get_netrc_login_info(netrc_machine)
 852
 853         return username, password
 854
 855     def _get_tfa_info(self, note='two-factor verification code'):
 856         """
 857         Get the two-factor authentication info
 858         TODO - asking the user will be required for sms/phone verify
 859         currently just uses the command line option
 860         If there's no info available, return None
 861         """
 862         if self._downloader is None:
 863             return None
 864         downloader_params = self._downloader.params
 865
 866         if downloader_params.get('twofactor') is not None:
 867             return downloader_params['twofactor']
 868
 869         return compat_getpass('Type %s and press [Return]: ' % note)
 870
 871     # Helper functions for extracting OpenGraph info
 872     @staticmethod
 873     def _og_regexes(prop):
 874         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 875         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 876                        % {'prop': re.escape(prop)})
 877         template = r'<meta[^>]+?%s[^>]+?%s'
 878         return [
 879             template % (property_re, content_re),
 880             template % (content_re, property_re),
 881         ]
 882
 883     @staticmethod
 884     def _meta_regex(prop):
 885         return r'''(?isx)<meta
 886                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 887                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 888
 889     def _og_search_property(self, prop, html, name=None, **kargs):
 890         if not isinstance(prop, (list, tuple)):
 891             prop = [prop]
 892         if name is None:
 893             name = 'OpenGraph %s' % prop[0]
 894         og_regexes = []
 895         for p in prop:
 896             og_regexes.extend(self._og_regexes(p))
 897         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 898         if escaped is None:
 899             return None
 900         return unescapeHTML(escaped)
 901
 902     def _og_search_thumbnail(self, html, **kargs):
 903         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 904
 905     def _og_search_description(self, html, **kargs):
 906         return self._og_search_property('description', html, fatal=False, **kargs)
 907
 908     def _og_search_title(self, html, **kargs):
 909         return self._og_search_property('title', html, **kargs)
 910
 911     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 912         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 913         if secure:
 914             regexes = self._og_regexes('video:secure_url') + regexes
 915         return self._html_search_regex(regexes, html, name, **kargs)
 916
 917     def _og_search_url(self, html, **kargs):
 918         return self._og_search_property('url', html, **kargs)
 919
 920     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 921         if not isinstance(name, (list, tuple)):
 922             name = [name]
 923         if display_name is None:
 924             display_name = name[0]
 925         return self._html_search_regex(
 926             [self._meta_regex(n) for n in name],
 927             html, display_name, fatal=fatal, group='content', **kwargs)
 928
 929     def _dc_search_uploader(self, html):
 930         return self._html_search_meta('dc.creator', html, 'uploader')
 931
 932     def _rta_search(self, html):
 933         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 934         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 935                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 936                      html):
 937             return 18
 938         return 0
 939
 940     def _media_rating_search(self, html):
 941         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 942         rating = self._html_search_meta('rating', html)
 943
 944         if not rating:
 945             return None
 946
 947         RATING_TABLE = {
 948             'safe for kids': 0,
 949             'general': 8,
 950             '14 years': 14,
 951             'mature': 17,
 952             'restricted': 19,
 953         }
 954         return RATING_TABLE.get(rating.lower())
 955
 956     def _family_friendly_search(self, html):
 957         # See http://schema.org/VideoObject
 958         family_friendly = self._html_search_meta(
 959             'isFamilyFriendly', html, default=None)
 960
 961         if not family_friendly:
 962             return None
 963
 964         RATING_TABLE = {
 965             '1': 0,
 966             'true': 0,
 967             '0': 18,
 968             'false': 18,
 969         }
 970         return RATING_TABLE.get(family_friendly.lower())
 971
 972     def _twitter_search_player(self, html):
 973         return self._html_search_meta('twitter:player', html,
 974                                       'twitter card player')
 975
 976     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 977         json_ld = self._search_regex(
 978             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 979             html, 'JSON-LD', group='json_ld', **kwargs)
 980         default = kwargs.get('default', NO_DEFAULT)
 981         if not json_ld:
 982             return default if default is not NO_DEFAULT else {}
 983         # JSON-LD may be malformed and thus `fatal` should be respected.
 984         # At the same time `default` may be passed that assumes `fatal=False`
 985         # for _search_regex. Let's simulate the same behavior here as well.
 986         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 987         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 988
 989     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 990         if isinstance(json_ld, compat_str):
 991             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 992         if not json_ld:
 993             return {}
 994         info = {}
 995         if not isinstance(json_ld, (list, tuple, dict)):
 996             return info
 997         if isinstance(json_ld, dict):
 998             json_ld = [json_ld]
 999
1000         def extract_video_object(e):
1001             assert e['@type'] == 'VideoObject'
1002             info.update({
1003                 'url': e.get('contentUrl'),
1004                 'title': unescapeHTML(e.get('name')),
1005                 'description': unescapeHTML(e.get('description')),
1006                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1007                 'duration': parse_duration(e.get('duration')),
1008                 'timestamp': unified_timestamp(e.get('uploadDate')),
1009                 'filesize': float_or_none(e.get('contentSize')),
1010                 'tbr': int_or_none(e.get('bitrate')),
1011                 'width': int_or_none(e.get('width')),
1012                 'height': int_or_none(e.get('height')),
1013                 'view_count': int_or_none(e.get('interactionCount')),
1014             })
1015
1016         for e in json_ld:
1017             if e.get('@context') == 'http://schema.org':
1018                 item_type = e.get('@type')
1019                 if expected_type is not None and expected_type != item_type:
1020                     return info
1021                 if item_type in ('TVEpisode', 'Episode'):
1022                     info.update({
1023                         'episode': unescapeHTML(e.get('name')),
1024                         'episode_number': int_or_none(e.get('episodeNumber')),
1025                         'description': unescapeHTML(e.get('description')),
1026                     })
1027                     part_of_season = e.get('partOfSeason')
1028                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1029                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1030                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1031                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1032                         info['series'] = unescapeHTML(part_of_series.get('name'))
1033                 elif item_type == 'Article':
1034                     info.update({
1035                         'timestamp': parse_iso8601(e.get('datePublished')),
1036                         'title': unescapeHTML(e.get('headline')),
1037                         'description': unescapeHTML(e.get('articleBody')),
1038                     })
1039                 elif item_type == 'VideoObject':
1040                     extract_video_object(e)
1041                     continue
1042                 video = e.get('video')
1043                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1044                     extract_video_object(video)
1045                 break
1046         return dict((k, v) for k, v in info.items() if v is not None)
1047
1048     @staticmethod
1049     def _hidden_inputs(html):
1050         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1051         hidden_inputs = {}
1052         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1053             attrs = extract_attributes(input)
1054             if not input:
1055                 continue
1056             if attrs.get('type') not in ('hidden', 'submit'):
1057                 continue
1058             name = attrs.get('name') or attrs.get('id')
1059             value = attrs.get('value')
1060             if name and value is not None:
1061                 hidden_inputs[name] = value
1062         return hidden_inputs
1063
1064     def _form_hidden_inputs(self, form_id, html):
1065         form = self._search_regex(
1066             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1067             html, '%s form' % form_id, group='form')
1068         return self._hidden_inputs(form)
1069
1070     def _sort_formats(self, formats, field_preference=None):
1071         if not formats:
1072             raise ExtractorError('No video formats found')
1073
1074         for f in formats:
1075             # Automatically determine tbr when missing based on abr and vbr (improves
1076             # formats sorting in some cases)
1077             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1078                 f['tbr'] = f['abr'] + f['vbr']
1079
1080         def _formats_key(f):
1081             # TODO remove the following workaround
1082             from ..utils import determine_ext
1083             if not f.get('ext') and 'url' in f:
1084                 f['ext'] = determine_ext(f['url'])
1085
1086             if isinstance(field_preference, (list, tuple)):
1087                 return tuple(
1088                     f.get(field)
1089                     if f.get(field) is not None
1090                     else ('' if field == 'format_id' else -1)
1091                     for field in field_preference)
1092
1093             preference = f.get('preference')
1094             if preference is None:
1095                 preference = 0
1096                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1097                     preference -= 0.5
1098
1099             protocol = f.get('protocol') or determine_protocol(f)
1100             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1101
1102             if f.get('vcodec') == 'none':  # audio only
1103                 preference -= 50
1104                 if self._downloader.params.get('prefer_free_formats'):
1105                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1106                 else:
1107                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1108                 ext_preference = 0
1109                 try:
1110                     audio_ext_preference = ORDER.index(f['ext'])
1111                 except ValueError:
1112                     audio_ext_preference = -1
1113             else:
1114                 if f.get('acodec') == 'none':  # video only
1115                     preference -= 40
1116                 if self._downloader.params.get('prefer_free_formats'):
1117                     ORDER = ['flv', 'mp4', 'webm']
1118                 else:
1119                     ORDER = ['webm', 'flv', 'mp4']
1120                 try:
1121                     ext_preference = ORDER.index(f['ext'])
1122                 except ValueError:
1123                     ext_preference = -1
1124                 audio_ext_preference = 0
1125
1126             return (
1127                 preference,
1128                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1129                 f.get('quality') if f.get('quality') is not None else -1,
1130                 f.get('tbr') if f.get('tbr') is not None else -1,
1131                 f.get('filesize') if f.get('filesize') is not None else -1,
1132                 f.get('vbr') if f.get('vbr') is not None else -1,
1133                 f.get('height') if f.get('height') is not None else -1,
1134                 f.get('width') if f.get('width') is not None else -1,
1135                 proto_preference,
1136                 ext_preference,
1137                 f.get('abr') if f.get('abr') is not None else -1,
1138                 audio_ext_preference,
1139                 f.get('fps') if f.get('fps') is not None else -1,
1140                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1141                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1142                 f.get('format_id') if f.get('format_id') is not None else '',
1143             )
1144         formats.sort(key=_formats_key)
1145
1146     def _check_formats(self, formats, video_id):
1147         if formats:
1148             formats[:] = filter(
1149                 lambda f: self._is_valid_url(
1150                     f['url'], video_id,
1151                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1152                 formats)
1153
1154     @staticmethod
1155     def _remove_duplicate_formats(formats):
1156         format_urls = set()
1157         unique_formats = []
1158         for f in formats:
1159             if f['url'] not in format_urls:
1160                 format_urls.add(f['url'])
1161                 unique_formats.append(f)
1162         formats[:] = unique_formats
1163
1164     def _is_valid_url(self, url, video_id, item='video', headers={}):
1165         url = self._proto_relative_url(url, scheme='http:')
1166         # For now assume non HTTP(S) URLs always valid
1167         if not (url.startswith('http://') or url.startswith('https://')):
1168             return True
1169         try:
1170             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1171             return True
1172         except ExtractorError as e:
1173             if isinstance(e.cause, compat_urllib_error.URLError):
1174                 self.to_screen(
1175                     '%s: %s URL is invalid, skipping' % (video_id, item))
1176                 return False
1177             raise
1178
1179     def http_scheme(self):
1180         """ Either "http:" or "https:", depending on the user's preferences """
1181         return (
1182             'http:'
1183             if self._downloader.params.get('prefer_insecure', False)
1184             else 'https:')
1185
1186     def _proto_relative_url(self, url, scheme=None):
1187         if url is None:
1188             return url
1189         if url.startswith('//'):
1190             if scheme is None:
1191                 scheme = self.http_scheme()
1192             return scheme + url
1193         else:
1194             return url
1195
1196     def _sleep(self, timeout, video_id, msg_template=None):
1197         if msg_template is None:
1198             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1199         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1200         self.to_screen(msg)
1201         time.sleep(timeout)
1202
1203     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1204                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1205                              fatal=True, m3u8_id=None):
1206         manifest = self._download_xml(
1207             manifest_url, video_id, 'Downloading f4m manifest',
1208             'Unable to download f4m manifest',
1209             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1210             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1211             transform_source=transform_source,
1212             fatal=fatal)
1213
1214         if manifest is False:
1215             return []
1216
1217         return self._parse_f4m_formats(
1218             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1219             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1220
1221     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1222                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1223                            fatal=True, m3u8_id=None):
1224         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1225         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1226         if akamai_pv is not None and ';' in akamai_pv.text:
1227             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1228             if playerVerificationChallenge.strip() != '':
1229                 return []
1230
1231         formats = []
1232         manifest_version = '1.0'
1233         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1234         if not media_nodes:
1235             manifest_version = '2.0'
1236             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1237         # Remove unsupported DRM protected media from final formats
1238         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1239         media_nodes = remove_encrypted_media(media_nodes)
1240         if not media_nodes:
1241             return formats
1242         base_url = xpath_text(
1243             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1244             'base URL', default=None)
1245         if base_url:
1246             base_url = base_url.strip()
1247
1248         bootstrap_info = xpath_element(
1249             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1250             'bootstrap info', default=None)
1251
1252         vcodec = None
1253         mime_type = xpath_text(
1254             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1255             'base URL', default=None)
1256         if mime_type and mime_type.startswith('audio/'):
1257             vcodec = 'none'
1258
1259         for i, media_el in enumerate(media_nodes):
1260             tbr = int_or_none(media_el.attrib.get('bitrate'))
1261             width = int_or_none(media_el.attrib.get('width'))
1262             height = int_or_none(media_el.attrib.get('height'))
1263             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1264             # If <bootstrapInfo> is present, the specified f4m is a
1265             # stream-level manifest, and only set-level manifests may refer to
1266             # external resources.  See section 11.4 and section 4 of F4M spec
1267             if bootstrap_info is None:
1268                 media_url = None
1269                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1270                 if manifest_version == '2.0':
1271                     media_url = media_el.attrib.get('href')
1272                 if media_url is None:
1273                     media_url = media_el.attrib.get('url')
1274                 if not media_url:
1275                     continue
1276                 manifest_url = (
1277                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1278                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1279                 # If media_url is itself a f4m manifest do the recursive extraction
1280                 # since bitrates in parent manifest (this one) and media_url manifest
1281                 # may differ leading to inability to resolve the format by requested
1282                 # bitrate in f4m downloader
1283                 ext = determine_ext(manifest_url)
1284                 if ext == 'f4m':
1285                     f4m_formats = self._extract_f4m_formats(
1286                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1287                         transform_source=transform_source, fatal=fatal)
1288                     # Sometimes stream-level manifest contains single media entry that
1289                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1290                     # At the same time parent's media entry in set-level manifest may
1291                     # contain it. We will copy it from parent in such cases.
1292                     if len(f4m_formats) == 1:
1293                         f = f4m_formats[0]
1294                         f.update({
1295                             'tbr': f.get('tbr') or tbr,
1296                             'width': f.get('width') or width,
1297                             'height': f.get('height') or height,
1298                             'format_id': f.get('format_id') if not tbr else format_id,
1299                             'vcodec': vcodec,
1300                         })
1301                     formats.extend(f4m_formats)
1302                     continue
1303                 elif ext == 'm3u8':
1304                     formats.extend(self._extract_m3u8_formats(
1305                         manifest_url, video_id, 'mp4', preference=preference,
1306                         m3u8_id=m3u8_id, fatal=fatal))
1307                     continue
1308             formats.append({
1309                 'format_id': format_id,
1310                 'url': manifest_url,
1311                 'manifest_url': manifest_url,
1312                 'ext': 'flv' if bootstrap_info is not None else None,
1313                 'tbr': tbr,
1314                 'width': width,
1315                 'height': height,
1316                 'vcodec': vcodec,
1317                 'preference': preference,
1318             })
1319         return formats
1320
1321     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1322         return {
1323             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1324             'url': m3u8_url,
1325             'ext': ext,
1326             'protocol': 'm3u8',
1327             'preference': preference - 100 if preference else -100,
1328             'resolution': 'multiple',
1329             'format_note': 'Quality selection URL',
1330         }
1331
1332     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1333                               entry_protocol='m3u8', preference=None,
1334                               m3u8_id=None, note=None, errnote=None,
1335                               fatal=True, live=False):
1336         res = self._download_webpage_handle(
1337             m3u8_url, video_id,
1338             note=note or 'Downloading m3u8 information',
1339             errnote=errnote or 'Failed to download m3u8 information',
1340             fatal=fatal)
1341
1342         if res is False:
1343             return []
1344
1345         m3u8_doc, urlh = res
1346         m3u8_url = urlh.geturl()
1347
1348         return self._parse_m3u8_formats(
1349             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1350             preference=preference, m3u8_id=m3u8_id, live=live)
1351
1352     def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1353                             entry_protocol='m3u8', preference=None,
1354                             m3u8_id=None, live=False):
1355         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1356             return []
1357
1358         formats = []
1359
1360         format_url = lambda u: (
1361             u
1362             if re.match(r'^https?://', u)
1363             else compat_urlparse.urljoin(m3u8_url, u))
1364
1365         # References:
1366         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1367         # 2. https://github.com/rg3/youtube-dl/issues/12211
1368
1369         # We should try extracting formats only from master playlists [1, 4.3.4],
1370         # i.e. playlists that describe available qualities. On the other hand
1371         # media playlists [1, 4.3.3] should be returned as is since they contain
1372         # just the media without qualities renditions.
1373         # Fortunately, master playlist can be easily distinguished from media
1374         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1375         # master playlist tags MUST NOT appear in a media playist and vice versa.
1376         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1377         # media playlist and MUST NOT appear in master playlist thus we can
1378         # clearly detect media playlist with this criterion.
1379
1380         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1381             return [{
1382                 'url': m3u8_url,
1383                 'format_id': m3u8_id,
1384                 'ext': ext,
1385                 'protocol': entry_protocol,
1386                 'preference': preference,
1387             }]
1388
1389         groups = {}
1390         last_stream_inf = {}
1391
1392         def extract_media(x_media_line):
1393             media = parse_m3u8_attributes(x_media_line)
1394             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1395             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1396             if not (media_type and group_id and name):
1397                 return
1398             groups.setdefault(group_id, []).append(media)
1399             if media_type not in ('VIDEO', 'AUDIO'):
1400                 return
1401             media_url = media.get('URI')
1402             if media_url:
1403                 format_id = []
1404                 for v in (group_id, name):
1405                     if v:
1406                         format_id.append(v)
1407                 f = {
1408                     'format_id': '-'.join(format_id),
1409                     'url': format_url(media_url),
1410                     'manifest_url': m3u8_url,
1411                     'language': media.get('LANGUAGE'),
1412                     'ext': ext,
1413                     'protocol': entry_protocol,
1414                     'preference': preference,
1415                 }
1416                 if media_type == 'AUDIO':
1417                     f['vcodec'] = 'none'
1418                 formats.append(f)
1419
1420         def build_stream_name():
1421             # Despite specification does not mention NAME attribute for
1422             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1423             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1424             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1425             stream_name = last_stream_inf.get('NAME')
1426             if stream_name:
1427                 return stream_name
1428             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1429             # from corresponding rendition group
1430             stream_group_id = last_stream_inf.get('VIDEO')
1431             if not stream_group_id:
1432                 return
1433             stream_group = groups.get(stream_group_id)
1434             if not stream_group:
1435                 return stream_group_id
1436             rendition = stream_group[0]
1437             return rendition.get('NAME') or stream_group_id
1438
1439         for line in m3u8_doc.splitlines():
1440             if line.startswith('#EXT-X-STREAM-INF:'):
1441                 last_stream_inf = parse_m3u8_attributes(line)
1442             elif line.startswith('#EXT-X-MEDIA:'):
1443                 extract_media(line)
1444             elif line.startswith('#') or not line.strip():
1445                 continue
1446             else:
1447                 tbr = float_or_none(
1448                     last_stream_inf.get('AVERAGE-BANDWIDTH') or
1449                     last_stream_inf.get('BANDWIDTH'), scale=1000)
1450                 format_id = []
1451                 if m3u8_id:
1452                     format_id.append(m3u8_id)
1453                 stream_name = build_stream_name()
1454                 # Bandwidth of live streams may differ over time thus making
1455                 # format_id unpredictable. So it's better to keep provided
1456                 # format_id intact.
1457                 if not live:
1458                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1459                 manifest_url = format_url(line.strip())
1460                 f = {
1461                     'format_id': '-'.join(format_id),
1462                     'url': manifest_url,
1463                     'manifest_url': m3u8_url,
1464                     'tbr': tbr,
1465                     'ext': ext,
1466                     'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1467                     'protocol': entry_protocol,
1468                     'preference': preference,
1469                 }
1470                 resolution = last_stream_inf.get('RESOLUTION')
1471                 if resolution:
1472                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1473                     if mobj:
1474                         f['width'] = int(mobj.group('width'))
1475                         f['height'] = int(mobj.group('height'))
1476                 # Unified Streaming Platform
1477                 mobj = re.search(
1478                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1479                 if mobj:
1480                     abr, vbr = mobj.groups()
1481                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1482                     f.update({
1483                         'vbr': vbr,
1484                         'abr': abr,
1485                     })
1486                 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1487                 f.update(codecs)
1488                 audio_group_id = last_stream_inf.get('AUDIO')
1489                 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1490                 # references a rendition group MUST have a CODECS attribute.
1491                 # However, this is not always respected, for example, [2]
1492                 # contains EXT-X-STREAM-INF tag which references AUDIO
1493                 # rendition group but does not have CODECS and despite
1494                 # referencing audio group an audio group, it represents
1495                 # a complete (with audio and video) format. So, for such cases
1496                 # we will ignore references to rendition groups and treat them
1497                 # as complete formats.
1498                 if audio_group_id and codecs and f.get('vcodec') != 'none':
1499                     audio_group = groups.get(audio_group_id)
1500                     if audio_group and audio_group[0].get('URI'):
1501                         # TODO: update acodec for audio only formats with
1502                         # the same GROUP-ID
1503                         f['acodec'] = 'none'
1504                 formats.append(f)
1505                 last_stream_inf = {}
1506         return formats
1507
1508     @staticmethod
1509     def _xpath_ns(path, namespace=None):
1510         if not namespace:
1511             return path
1512         out = []
1513         for c in path.split('/'):
1514             if not c or c == '.':
1515                 out.append(c)
1516             else:
1517                 out.append('{%s}%s' % (namespace, c))
1518         return '/'.join(out)
1519
1520     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1521         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1522
1523         if smil is False:
1524             assert not fatal
1525             return []
1526
1527         namespace = self._parse_smil_namespace(smil)
1528
1529         return self._parse_smil_formats(
1530             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1531
1532     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1533         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1534         if smil is False:
1535             return {}
1536         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1537
1538     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1539         return self._download_xml(
1540             smil_url, video_id, 'Downloading SMIL file',
1541             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1542
1543     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1544         namespace = self._parse_smil_namespace(smil)
1545
1546         formats = self._parse_smil_formats(
1547             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1548         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1549
1550         video_id = os.path.splitext(url_basename(smil_url))[0]
1551         title = None
1552         description = None
1553         upload_date = None
1554         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1555             name = meta.attrib.get('name')
1556             content = meta.attrib.get('content')
1557             if not name or not content:
1558                 continue
1559             if not title and name == 'title':
1560                 title = content
1561             elif not description and name in ('description', 'abstract'):
1562                 description = content
1563             elif not upload_date and name == 'date':
1564                 upload_date = unified_strdate(content)
1565
1566         thumbnails = [{
1567             'id': image.get('type'),
1568             'url': image.get('src'),
1569             'width': int_or_none(image.get('width')),
1570             'height': int_or_none(image.get('height')),
1571         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1572
1573         return {
1574             'id': video_id,
1575             'title': title or video_id,
1576             'description': description,
1577             'upload_date': upload_date,
1578             'thumbnails': thumbnails,
1579             'formats': formats,
1580             'subtitles': subtitles,
1581         }
1582
1583     def _parse_smil_namespace(self, smil):
1584         return self._search_regex(
1585             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1586
1587     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1588         base = smil_url
1589         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1590             b = meta.get('base') or meta.get('httpBase')
1591             if b:
1592                 base = b
1593                 break
1594
1595         formats = []
1596         rtmp_count = 0
1597         http_count = 0
1598         m3u8_count = 0
1599
1600         srcs = []
1601         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1602         for medium in media:
1603             src = medium.get('src')
1604             if not src or src in srcs:
1605                 continue
1606             srcs.append(src)
1607
1608             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1609             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1610             width = int_or_none(medium.get('width'))
1611             height = int_or_none(medium.get('height'))
1612             proto = medium.get('proto')
1613             ext = medium.get('ext')
1614             src_ext = determine_ext(src)
1615             streamer = medium.get('streamer') or base
1616
1617             if proto == 'rtmp' or streamer.startswith('rtmp'):
1618                 rtmp_count += 1
1619                 formats.append({
1620                     'url': streamer,
1621                     'play_path': src,
1622                     'ext': 'flv',
1623                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1624                     'tbr': bitrate,
1625                     'filesize': filesize,
1626                     'width': width,
1627                     'height': height,
1628                 })
1629                 if transform_rtmp_url:
1630                     streamer, src = transform_rtmp_url(streamer, src)
1631                     formats[-1].update({
1632                         'url': streamer,
1633                         'play_path': src,
1634                     })
1635                 continue
1636
1637             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1638             src_url = src_url.strip()
1639
1640             if proto == 'm3u8' or src_ext == 'm3u8':
1641                 m3u8_formats = self._extract_m3u8_formats(
1642                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1643                 if len(m3u8_formats) == 1:
1644                     m3u8_count += 1
1645                     m3u8_formats[0].update({
1646                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1647                         'tbr': bitrate,
1648                         'width': width,
1649                         'height': height,
1650                     })
1651                 formats.extend(m3u8_formats)
1652                 continue
1653
1654             if src_ext == 'f4m':
1655                 f4m_url = src_url
1656                 if not f4m_params:
1657                     f4m_params = {
1658                         'hdcore': '3.2.0',
1659                         'plugin': 'flowplayer-3.2.0.1',
1660                     }
1661                 f4m_url += '&' if '?' in f4m_url else '?'
1662                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1663                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1664                 continue
1665
1666             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1667                 http_count += 1
1668                 formats.append({
1669                     'url': src_url,
1670                     'ext': ext or src_ext or 'flv',
1671                     'format_id': 'http-%d' % (bitrate or http_count),
1672                     'tbr': bitrate,
1673                     'filesize': filesize,
1674                     'width': width,
1675                     'height': height,
1676                 })
1677                 continue
1678
1679         return formats
1680
1681     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1682         urls = []
1683         subtitles = {}
1684         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1685             src = textstream.get('src')
1686             if not src or src in urls:
1687                 continue
1688             urls.append(src)
1689             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1690             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1691             subtitles.setdefault(lang, []).append({
1692                 'url': src,
1693                 'ext': ext,
1694             })
1695         return subtitles
1696
1697     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1698         xspf = self._download_xml(
1699             playlist_url, playlist_id, 'Downloading xpsf playlist',
1700             'Unable to download xspf manifest', fatal=fatal)
1701         if xspf is False:
1702             return []
1703         return self._parse_xspf(xspf, playlist_id)
1704
1705     def _parse_xspf(self, playlist, playlist_id):
1706         NS_MAP = {
1707             'xspf': 'http://xspf.org/ns/0/',
1708             's1': 'http://static.streamone.nl/player/ns/0',
1709         }
1710
1711         entries = []
1712         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1713             title = xpath_text(
1714                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1715             description = xpath_text(
1716                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1717             thumbnail = xpath_text(
1718                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1719             duration = float_or_none(
1720                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1721
1722             formats = [{
1723                 'url': location.text,
1724                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1725                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1726                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1727             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1728             self._sort_formats(formats)
1729
1730             entries.append({
1731                 'id': playlist_id,
1732                 'title': title,
1733                 'description': description,
1734                 'thumbnail': thumbnail,
1735                 'duration': duration,
1736                 'formats': formats,
1737             })
1738         return entries
1739
1740     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1741         res = self._download_webpage_handle(
1742             mpd_url, video_id,
1743             note=note or 'Downloading MPD manifest',
1744             errnote=errnote or 'Failed to download MPD manifest',
1745             fatal=fatal)
1746         if res is False:
1747             return []
1748         mpd, urlh = res
1749         mpd_base_url = base_url(urlh.geturl())
1750
1751         return self._parse_mpd_formats(
1752             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1753             formats_dict=formats_dict, mpd_url=mpd_url)
1754
1755     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1756         """
1757         Parse formats from MPD manifest.
1758         References:
1759          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1760             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1761          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1762         """
1763         if mpd_doc.get('type') == 'dynamic':
1764             return []
1765
1766         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1767
1768         def _add_ns(path):
1769             return self._xpath_ns(path, namespace)
1770
1771         def is_drm_protected(element):
1772             return element.find(_add_ns('ContentProtection')) is not None
1773
1774         def extract_multisegment_info(element, ms_parent_info):
1775             ms_info = ms_parent_info.copy()
1776
1777             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1778             # common attributes and elements.  We will only extract relevant
1779             # for us.
1780             def extract_common(source):
1781                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1782                 if segment_timeline is not None:
1783                     s_e = segment_timeline.findall(_add_ns('S'))
1784                     if s_e:
1785                         ms_info['total_number'] = 0
1786                         ms_info['s'] = []
1787                         for s in s_e:
1788                             r = int(s.get('r', 0))
1789                             ms_info['total_number'] += 1 + r
1790                             ms_info['s'].append({
1791                                 't': int(s.get('t', 0)),
1792                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1793                                 'd': int(s.attrib['d']),
1794                                 'r': r,
1795                             })
1796                 start_number = source.get('startNumber')
1797                 if start_number:
1798                     ms_info['start_number'] = int(start_number)
1799                 timescale = source.get('timescale')
1800                 if timescale:
1801                     ms_info['timescale'] = int(timescale)
1802                 segment_duration = source.get('duration')
1803                 if segment_duration:
1804                     ms_info['segment_duration'] = float(segment_duration)
1805
1806             def extract_Initialization(source):
1807                 initialization = source.find(_add_ns('Initialization'))
1808                 if initialization is not None:
1809                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1810
1811             segment_list = element.find(_add_ns('SegmentList'))
1812             if segment_list is not None:
1813                 extract_common(segment_list)
1814                 extract_Initialization(segment_list)
1815                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1816                 if segment_urls_e:
1817                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1818             else:
1819                 segment_template = element.find(_add_ns('SegmentTemplate'))
1820                 if segment_template is not None:
1821                     extract_common(segment_template)
1822                     media = segment_template.get('media')
1823                     if media:
1824                         ms_info['media'] = media
1825                     initialization = segment_template.get('initialization')
1826                     if initialization:
1827                         ms_info['initialization'] = initialization
1828                     else:
1829                         extract_Initialization(segment_template)
1830             return ms_info
1831
1832         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1833         formats = []
1834         for period in mpd_doc.findall(_add_ns('Period')):
1835             period_duration = parse_duration(period.get('duration')) or mpd_duration
1836             period_ms_info = extract_multisegment_info(period, {
1837                 'start_number': 1,
1838                 'timescale': 1,
1839             })
1840             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1841                 if is_drm_protected(adaptation_set):
1842                     continue
1843                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1844                 for representation in adaptation_set.findall(_add_ns('Representation')):
1845                     if is_drm_protected(representation):
1846                         continue
1847                     representation_attrib = adaptation_set.attrib.copy()
1848                     representation_attrib.update(representation.attrib)
1849                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1850                     mime_type = representation_attrib['mimeType']
1851                     content_type = mime_type.split('/')[0]
1852                     if content_type == 'text':
1853                         # TODO implement WebVTT downloading
1854                         pass
1855                     elif content_type in ('video', 'audio'):
1856                         base_url = ''
1857                         for element in (representation, adaptation_set, period, mpd_doc):
1858                             base_url_e = element.find(_add_ns('BaseURL'))
1859                             if base_url_e is not None:
1860                                 base_url = base_url_e.text + base_url
1861                                 if re.match(r'^https?://', base_url):
1862                                     break
1863                         if mpd_base_url and not re.match(r'^https?://', base_url):
1864                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1865                                 mpd_base_url += '/'
1866                             base_url = mpd_base_url + base_url
1867                         representation_id = representation_attrib.get('id')
1868                         lang = representation_attrib.get('lang')
1869                         url_el = representation.find(_add_ns('BaseURL'))
1870                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1871                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1872                         f = {
1873                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1874                             'url': base_url,
1875                             'manifest_url': mpd_url,
1876                             'ext': mimetype2ext(mime_type),
1877                             'width': int_or_none(representation_attrib.get('width')),
1878                             'height': int_or_none(representation_attrib.get('height')),
1879                             'tbr': float_or_none(bandwidth, 1000),
1880                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1881                             'fps': int_or_none(representation_attrib.get('frameRate')),
1882                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1883                             'format_note': 'DASH %s' % content_type,
1884                             'filesize': filesize,
1885                         }
1886                         f.update(parse_codecs(representation_attrib.get('codecs')))
1887                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1888
1889                         def prepare_template(template_name, identifiers):
1890                             t = representation_ms_info[template_name]
1891                             t = t.replace('$RepresentationID$', representation_id)
1892                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1893                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1894                             t.replace('$$', '$')
1895                             return t
1896
1897                         # @initialization is a regular template like @media one
1898                         # so it should be handled just the same way (see
1899                         # https://github.com/rg3/youtube-dl/issues/11605)
1900                         if 'initialization' in representation_ms_info:
1901                             initialization_template = prepare_template(
1902                                 'initialization',
1903                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1904                                 # $Time$ shall not be included for @initialization thus
1905                                 # only $Bandwidth$ remains
1906                                 ('Bandwidth', ))
1907                             representation_ms_info['initialization_url'] = initialization_template % {
1908                                 'Bandwidth': bandwidth,
1909                             }
1910
1911                         def location_key(location):
1912                             return 'url' if re.match(r'^https?://', location) else 'path'
1913
1914                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1915
1916                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1917                             media_location_key = location_key(media_template)
1918
1919                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1920                             # can't be used at the same time
1921                             if '%(Number' in media_template and 's' not in representation_ms_info:
1922                                 segment_duration = None
1923                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1924                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1925                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1926                                 representation_ms_info['fragments'] = [{
1927                                     media_location_key: media_template % {
1928                                         'Number': segment_number,
1929                                         'Bandwidth': bandwidth,
1930                                     },
1931                                     'duration': segment_duration,
1932                                 } for segment_number in range(
1933                                     representation_ms_info['start_number'],
1934                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1935                             else:
1936                                 # $Number*$ or $Time$ in media template with S list available
1937                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1938                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1939                                 representation_ms_info['fragments'] = []
1940                                 segment_time = 0
1941                                 segment_d = None
1942                                 segment_number = representation_ms_info['start_number']
1943
1944                                 def add_segment_url():
1945                                     segment_url = media_template % {
1946                                         'Time': segment_time,
1947                                         'Bandwidth': bandwidth,
1948                                         'Number': segment_number,
1949                                     }
1950                                     representation_ms_info['fragments'].append({
1951                                         media_location_key: segment_url,
1952                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1953                                     })
1954
1955                                 for num, s in enumerate(representation_ms_info['s']):
1956                                     segment_time = s.get('t') or segment_time
1957                                     segment_d = s['d']
1958                                     add_segment_url()
1959                                     segment_number += 1
1960                                     for r in range(s.get('r', 0)):
1961                                         segment_time += segment_d
1962                                         add_segment_url()
1963                                         segment_number += 1
1964                                     segment_time += segment_d
1965                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1966                             # No media template
1967                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1968                             # or any YouTube dashsegments video
1969                             fragments = []
1970                             segment_index = 0
1971                             timescale = representation_ms_info['timescale']
1972                             for s in representation_ms_info['s']:
1973                                 duration = float_or_none(s['d'], timescale)
1974                                 for r in range(s.get('r', 0) + 1):
1975                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
1976                                     fragments.append({
1977                                         location_key(segment_uri): segment_uri,
1978                                         'duration': duration,
1979                                     })
1980                                     segment_index += 1
1981                             representation_ms_info['fragments'] = fragments
1982                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1983                         # No fragments key is present in this case.
1984                         if 'fragments' in representation_ms_info:
1985                             f.update({
1986                                 'fragment_base_url': base_url,
1987                                 'fragments': [],
1988                                 'protocol': 'http_dash_segments',
1989                             })
1990                             if 'initialization_url' in representation_ms_info:
1991                                 initialization_url = representation_ms_info['initialization_url']
1992                                 if not f.get('url'):
1993                                     f['url'] = initialization_url
1994                                 f['fragments'].append({location_key(initialization_url): initialization_url})
1995                             f['fragments'].extend(representation_ms_info['fragments'])
1996                         try:
1997                             existing_format = next(
1998                                 fo for fo in formats
1999                                 if fo['format_id'] == representation_id)
2000                         except StopIteration:
2001                             full_info = formats_dict.get(representation_id, {}).copy()
2002                             full_info.update(f)
2003                             formats.append(full_info)
2004                         else:
2005                             existing_format.update(f)
2006                     else:
2007                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2008         return formats
2009
2010     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2011         res = self._download_webpage_handle(
2012             ism_url, video_id,
2013             note=note or 'Downloading ISM manifest',
2014             errnote=errnote or 'Failed to download ISM manifest',
2015             fatal=fatal)
2016         if res is False:
2017             return []
2018         ism, urlh = res
2019
2020         return self._parse_ism_formats(
2021             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
2022
2023     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2024         """
2025         Parse formats from ISM manifest.
2026         References:
2027          1. [MS-SSTR]: Smooth Streaming Protocol,
2028             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2029         """
2030         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2031             return []
2032
2033         duration = int(ism_doc.attrib['Duration'])
2034         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2035
2036         formats = []
2037         for stream in ism_doc.findall('StreamIndex'):
2038             stream_type = stream.get('Type')
2039             if stream_type not in ('video', 'audio'):
2040                 continue
2041             url_pattern = stream.attrib['Url']
2042             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2043             stream_name = stream.get('Name')
2044             for track in stream.findall('QualityLevel'):
2045                 fourcc = track.get('FourCC')
2046                 # TODO: add support for WVC1 and WMAP
2047                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2048                     self.report_warning('%s is not a supported codec' % fourcc)
2049                     continue
2050                 tbr = int(track.attrib['Bitrate']) // 1000
2051                 # [1] does not mention Width and Height attributes. However,
2052                 # they're often present while MaxWidth and MaxHeight are
2053                 # missing, so should be used as fallbacks
2054                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2055                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2056                 sampling_rate = int_or_none(track.get('SamplingRate'))
2057
2058                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2059                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2060
2061                 fragments = []
2062                 fragment_ctx = {
2063                     'time': 0,
2064                 }
2065                 stream_fragments = stream.findall('c')
2066                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2067                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2068                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2069                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2070                     if not fragment_ctx['duration']:
2071                         try:
2072                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2073                         except IndexError:
2074                             next_fragment_time = duration
2075                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2076                     for _ in range(fragment_repeat):
2077                         fragments.append({
2078                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2079                             'duration': fragment_ctx['duration'] / stream_timescale,
2080                         })
2081                         fragment_ctx['time'] += fragment_ctx['duration']
2082
2083                 format_id = []
2084                 if ism_id:
2085                     format_id.append(ism_id)
2086                 if stream_name:
2087                     format_id.append(stream_name)
2088                 format_id.append(compat_str(tbr))
2089
2090                 formats.append({
2091                     'format_id': '-'.join(format_id),
2092                     'url': ism_url,
2093                     'manifest_url': ism_url,
2094                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2095                     'width': width,
2096                     'height': height,
2097                     'tbr': tbr,
2098                     'asr': sampling_rate,
2099                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2100                     'acodec': 'none' if stream_type == 'video' else fourcc,
2101                     'protocol': 'ism',
2102                     'fragments': fragments,
2103                     '_download_params': {
2104                         'duration': duration,
2105                         'timescale': stream_timescale,
2106                         'width': width or 0,
2107                         'height': height or 0,
2108                         'fourcc': fourcc,
2109                         'codec_private_data': track.get('CodecPrivateData'),
2110                         'sampling_rate': sampling_rate,
2111                         'channels': int_or_none(track.get('Channels', 2)),
2112                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2113                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2114                     },
2115                 })
2116         return formats
2117
2118     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2119         def absolute_url(video_url):
2120             return compat_urlparse.urljoin(base_url, video_url)
2121
2122         def parse_content_type(content_type):
2123             if not content_type:
2124                 return {}
2125             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2126             if ctr:
2127                 mimetype, codecs = ctr.groups()
2128                 f = parse_codecs(codecs)
2129                 f['ext'] = mimetype2ext(mimetype)
2130                 return f
2131             return {}
2132
2133         def _media_formats(src, cur_media_type, type_info={}):
2134             full_url = absolute_url(src)
2135             ext = type_info.get('ext') or determine_ext(full_url)
2136             if ext == 'm3u8':
2137                 is_plain_url = False
2138                 formats = self._extract_m3u8_formats(
2139                     full_url, video_id, ext='mp4',
2140                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2141                     preference=preference, fatal=False)
2142             elif ext == 'mpd':
2143                 is_plain_url = False
2144                 formats = self._extract_mpd_formats(
2145                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2146             else:
2147                 is_plain_url = True
2148                 formats = [{
2149                     'url': full_url,
2150                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2151                 }]
2152             return is_plain_url, formats
2153
2154         entries = []
2155         # amp-video and amp-audio are very similar to their HTML5 counterparts
2156         # so we wll include them right here (see
2157         # https://www.ampproject.org/docs/reference/components/amp-video)
2158         media_tags = [(media_tag, media_type, '')
2159                       for media_tag, media_type
2160                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2161         media_tags.extend(re.findall(
2162             # We only allow video|audio followed by a whitespace or '>'.
2163             # Allowing more characters may end up in significant slow down (see
2164             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2165             # http://www.porntrex.com/maps/videositemap.xml).
2166             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2167         for media_tag, media_type, media_content in media_tags:
2168             media_info = {
2169                 'formats': [],
2170                 'subtitles': {},
2171             }
2172             media_attributes = extract_attributes(media_tag)
2173             src = media_attributes.get('src')
2174             if src:
2175                 _, formats = _media_formats(src, media_type)
2176                 media_info['formats'].extend(formats)
2177             media_info['thumbnail'] = media_attributes.get('poster')
2178             if media_content:
2179                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2180                     source_attributes = extract_attributes(source_tag)
2181                     src = source_attributes.get('src')
2182                     if not src:
2183                         continue
2184                     f = parse_content_type(source_attributes.get('type'))
2185                     is_plain_url, formats = _media_formats(src, media_type, f)
2186                     if is_plain_url:
2187                         # res attribute is not standard but seen several times
2188                         # in the wild
2189                         f.update({
2190                             'height': int_or_none(source_attributes.get('res')),
2191                             'format_id': source_attributes.get('label'),
2192                         })
2193                         f.update(formats[0])
2194                         media_info['formats'].append(f)
2195                     else:
2196                         media_info['formats'].extend(formats)
2197                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2198                     track_attributes = extract_attributes(track_tag)
2199                     kind = track_attributes.get('kind')
2200                     if not kind or kind in ('subtitles', 'captions'):
2201                         src = track_attributes.get('src')
2202                         if not src:
2203                             continue
2204                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2205                         media_info['subtitles'].setdefault(lang, []).append({
2206                             'url': absolute_url(src),
2207                         })
2208             if media_info['formats'] or media_info['subtitles']:
2209                 entries.append(media_info)
2210         return entries
2211
2212     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2213         formats = []
2214         hdcore_sign = 'hdcore=3.7.0'
2215         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2216         hds_host = hosts.get('hds')
2217         if hds_host:
2218             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2219         if 'hdcore=' not in f4m_url:
2220             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2221         f4m_formats = self._extract_f4m_formats(
2222             f4m_url, video_id, f4m_id='hds', fatal=False)
2223         for entry in f4m_formats:
2224             entry.update({'extra_param_to_segment_url': hdcore_sign})
2225         formats.extend(f4m_formats)
2226         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2227         hls_host = hosts.get('hls')
2228         if hls_host:
2229             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2230         formats.extend(self._extract_m3u8_formats(
2231             m3u8_url, video_id, 'mp4', 'm3u8_native',
2232             m3u8_id='hls', fatal=False))
2233         return formats
2234
2235     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2236         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2237         url_base = self._search_regex(
2238             r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
2239         http_base_url = '%s:%s' % ('http', url_base)
2240         formats = []
2241         if 'm3u8' not in skip_protocols:
2242             formats.extend(self._extract_m3u8_formats(
2243                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2244                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2245         if 'f4m' not in skip_protocols:
2246             formats.extend(self._extract_f4m_formats(
2247                 http_base_url + '/manifest.f4m',
2248                 video_id, f4m_id='hds', fatal=False))
2249         if 'dash' not in skip_protocols:
2250             formats.extend(self._extract_mpd_formats(
2251                 http_base_url + '/manifest.mpd',
2252                 video_id, mpd_id='dash', fatal=False))
2253         if re.search(r'(?:/smil:|\.smil)', url_base):
2254             if 'smil' not in skip_protocols:
2255                 rtmp_formats = self._extract_smil_formats(
2256                     http_base_url + '/jwplayer.smil',
2257                     video_id, fatal=False)
2258                 for rtmp_format in rtmp_formats:
2259                     rtsp_format = rtmp_format.copy()
2260                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2261                     del rtsp_format['play_path']
2262                     del rtsp_format['ext']
2263                     rtsp_format.update({
2264                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2265                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2266                         'protocol': 'rtsp',
2267                     })
2268                     formats.extend([rtmp_format, rtsp_format])
2269         else:
2270             for protocol in ('rtmp', 'rtsp'):
2271                 if protocol not in skip_protocols:
2272                     formats.append({
2273                         'url': '%s:%s' % (protocol, url_base),
2274                         'format_id': protocol,
2275                         'protocol': protocol,
2276                     })
2277         return formats
2278
2279     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2280         mobj = re.search(
2281             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2282             webpage)
2283         if mobj:
2284             try:
2285                 jwplayer_data = self._parse_json(mobj.group('options'),
2286                                                  video_id=video_id,
2287                                                  transform_source=transform_source)
2288             except ExtractorError:
2289                 pass
2290             else:
2291                 if isinstance(jwplayer_data, dict):
2292                     return jwplayer_data
2293
2294     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2295         jwplayer_data = self._find_jwplayer_data(
2296             webpage, video_id, transform_source=js_to_json)
2297         return self._parse_jwplayer_data(
2298             jwplayer_data, video_id, *args, **kwargs)
2299
2300     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2301                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2302         # JWPlayer backward compatibility: flattened playlists
2303         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2304         if 'playlist' not in jwplayer_data:
2305             jwplayer_data = {'playlist': [jwplayer_data]}
2306
2307         entries = []
2308
2309         # JWPlayer backward compatibility: single playlist item
2310         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2311         if not isinstance(jwplayer_data['playlist'], list):
2312             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2313
2314         for video_data in jwplayer_data['playlist']:
2315             # JWPlayer backward compatibility: flattened sources
2316             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2317             if 'sources' not in video_data:
2318                 video_data['sources'] = [video_data]
2319
2320             this_video_id = video_id or video_data['mediaid']
2321
2322             formats = self._parse_jwplayer_formats(
2323                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2324                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2325             self._sort_formats(formats)
2326
2327             subtitles = {}
2328             tracks = video_data.get('tracks')
2329             if tracks and isinstance(tracks, list):
2330                 for track in tracks:
2331                     if not isinstance(track, dict):
2332                         continue
2333                     if track.get('kind') != 'captions':
2334                         continue
2335                     track_url = urljoin(base_url, track.get('file'))
2336                     if not track_url:
2337                         continue
2338                     subtitles.setdefault(track.get('label') or 'en', []).append({
2339                         'url': self._proto_relative_url(track_url)
2340                     })
2341
2342             entries.append({
2343                 'id': this_video_id,
2344                 'title': video_data['title'] if require_title else video_data.get('title'),
2345                 'description': video_data.get('description'),
2346                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2347                 'timestamp': int_or_none(video_data.get('pubdate')),
2348                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2349                 'subtitles': subtitles,
2350                 'formats': formats,
2351             })
2352         if len(entries) == 1:
2353             return entries[0]
2354         else:
2355             return self.playlist_result(entries)
2356
2357     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2358                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2359         urls = []
2360         formats = []
2361         for source in jwplayer_sources_data:
2362             if not isinstance(source, dict):
2363                 continue
2364             source_url = self._proto_relative_url(source.get('file'))
2365             if not source_url:
2366                 continue
2367             if base_url:
2368                 source_url = compat_urlparse.urljoin(base_url, source_url)
2369             if source_url in urls:
2370                 continue
2371             urls.append(source_url)
2372             source_type = source.get('type') or ''
2373             ext = mimetype2ext(source_type) or determine_ext(source_url)
2374             if source_type == 'hls' or ext == 'm3u8':
2375                 formats.extend(self._extract_m3u8_formats(
2376                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2377                     m3u8_id=m3u8_id, fatal=False))
2378             elif ext == 'mpd':
2379                 formats.extend(self._extract_mpd_formats(
2380                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2381             elif ext == 'smil':
2382                 formats.extend(self._extract_smil_formats(
2383                     source_url, video_id, fatal=False))
2384             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2385             elif source_type.startswith('audio') or ext in (
2386                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2387                 formats.append({
2388                     'url': source_url,
2389                     'vcodec': 'none',
2390                     'ext': ext,
2391                 })
2392             else:
2393                 height = int_or_none(source.get('height'))
2394                 if height is None:
2395                     # Often no height is provided but there is a label in
2396                     # format like "1080p", "720p SD", or 1080.
2397                     height = int_or_none(self._search_regex(
2398                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2399                         'height', default=None))
2400                 a_format = {
2401                     'url': source_url,
2402                     'width': int_or_none(source.get('width')),
2403                     'height': height,
2404                     'tbr': int_or_none(source.get('bitrate')),
2405                     'ext': ext,
2406                 }
2407                 if source_url.startswith('rtmp'):
2408                     a_format['ext'] = 'flv'
2409                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2410                     # of jwplayer.flash.swf
2411                     rtmp_url_parts = re.split(
2412                         r'((?:mp4|mp3|flv):)', source_url, 1)
2413                     if len(rtmp_url_parts) == 3:
2414                         rtmp_url, prefix, play_path = rtmp_url_parts
2415                         a_format.update({
2416                             'url': rtmp_url,
2417                             'play_path': prefix + play_path,
2418                         })
2419                     if rtmp_params:
2420                         a_format.update(rtmp_params)
2421                 formats.append(a_format)
2422         return formats
2423
2424     def _live_title(self, name):
2425         """ Generate the title for a live video """
2426         now = datetime.datetime.now()
2427         now_str = now.strftime('%Y-%m-%d %H:%M')
2428         return name + ' ' + now_str
2429
2430     def _int(self, v, name, fatal=False, **kwargs):
2431         res = int_or_none(v, **kwargs)
2432         if 'get_attr' in kwargs:
2433             print(getattr(v, kwargs['get_attr']))
2434         if res is None:
2435             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2436             if fatal:
2437                 raise ExtractorError(msg)
2438             else:
2439                 self._downloader.report_warning(msg)
2440         return res
2441
2442     def _float(self, v, name, fatal=False, **kwargs):
2443         res = float_or_none(v, **kwargs)
2444         if res is None:
2445             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2446             if fatal:
2447                 raise ExtractorError(msg)
2448             else:
2449                 self._downloader.report_warning(msg)
2450         return res
2451
2452     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2453                     path='/', secure=False, discard=False, rest={}, **kwargs):
2454         cookie = compat_cookiejar.Cookie(
2455             0, name, value, port, not port is None, domain, True,
2456             domain.startswith('.'), path, True, secure, expire_time,
2457             discard, None, None, rest)
2458         self._downloader.cookiejar.set_cookie(cookie)
2459
2460     def _get_cookies(self, url):
2461         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2462         req = sanitized_Request(url)
2463         self._downloader.cookiejar.add_cookie_header(req)
2464         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2465
2466     def get_testcases(self, include_onlymatching=False):
2467         t = getattr(self, '_TEST', None)
2468         if t:
2469             assert not hasattr(self, '_TESTS'), \
2470                 '%s has _TEST and _TESTS' % type(self).__name__
2471             tests = [t]
2472         else:
2473             tests = getattr(self, '_TESTS', [])
2474         for t in tests:
2475             if not include_onlymatching and t.get('only_matching', False):
2476                 continue
2477             t['name'] = type(self).__name__[:-len('IE')]
2478             yield t
2479
2480     def is_suitable(self, age_limit):
2481         """ Test whether the extractor is generally suitable for the given
2482         age limit (i.e. pornographic sites are not, all others usually are) """
2483
2484         any_restricted = False
2485         for tc in self.get_testcases(include_onlymatching=False):
2486             if tc.get('playlist', []):
2487                 tc = tc['playlist'][0]
2488             is_restricted = age_restricted(
2489                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2490             if not is_restricted:
2491                 return True
2492             any_restricted = any_restricted or is_restricted
2493         return not any_restricted
2494
2495     def extract_subtitles(self, *args, **kwargs):
2496         if (self._downloader.params.get('writesubtitles', False) or
2497                 self._downloader.params.get('listsubtitles')):
2498             return self._get_subtitles(*args, **kwargs)
2499         return {}
2500
2501     def _get_subtitles(self, *args, **kwargs):
2502         raise NotImplementedError('This method must be implemented by subclasses')
2503
2504     @staticmethod
2505     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2506         """ Merge subtitle items for one language. Items with duplicated URLs
2507         will be dropped. """
2508         list1_urls = set([item['url'] for item in subtitle_list1])
2509         ret = list(subtitle_list1)
2510         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2511         return ret
2512
2513     @classmethod
2514     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2515         """ Merge two subtitle dictionaries, language by language. """
2516         ret = dict(subtitle_dict1)
2517         for lang in subtitle_dict2:
2518             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2519         return ret
2520
2521     def extract_automatic_captions(self, *args, **kwargs):
2522         if (self._downloader.params.get('writeautomaticsub', False) or
2523                 self._downloader.params.get('listsubtitles')):
2524             return self._get_automatic_captions(*args, **kwargs)
2525         return {}
2526
2527     def _get_automatic_captions(self, *args, **kwargs):
2528         raise NotImplementedError('This method must be implemented by subclasses')
2529
2530     def mark_watched(self, *args, **kwargs):
2531         if (self._downloader.params.get('mark_watched', False) and
2532                 (self._get_login_info()[0] is not None or
2533                     self._downloader.params.get('cookiefile') is not None)):
2534             self._mark_watched(*args, **kwargs)
2535
2536     def _mark_watched(self, *args, **kwargs):
2537         raise NotImplementedError('This method must be implemented by subclasses')
2538
2539     def geo_verification_headers(self):
2540         headers = {}
2541         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2542         if geo_verification_proxy:
2543             headers['Ytdl-request-proxy'] = geo_verification_proxy
2544         return headers
2545
2546     def _generic_id(self, url):
2547         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2548
2549     def _generic_title(self, url):
2550         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2551
2552
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.

    They accept pseudo URLs of the form _SEARCH_KEY<prefix>:<query>, where
    <prefix> is empty (fetch one result), a positive integer without
    leading zeros (fetch that many results) or 'all' (fetch _MAX_RESULTS
    results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex pattern that search queries must match.

        The pattern exposes the named groups 'prefix' and 'query'.
        _SEARCH_KEY is inserted into the pattern as is.
        """
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url matches this extractor's search pattern."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        Raises ExtractorError when the query does not match the search
        pattern or asks for a non-positive number of results. A request
        for more than _MAX_RESULTS results is capped at _MAX_RESULTS,
        with a warning reported through the downloader.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')

        # No prefix: only the first result is wanted
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """Read-only access to the _SEARCH_KEY attribute."""
        return self._SEARCH_KEY