git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30     compat_xml_parse_error,
  31 )
  32 from ..downloader.f4m import (
  33     get_base_url,
  34     remove_encrypted_media,
  35 )
  36 from ..utils import (
  37     NO_DEFAULT,
  38     age_restricted,
  39     base_url,
  40     bug_reports_message,
  41     clean_html,
  42     compiled_regex_type,
  43     determine_ext,
  44     determine_protocol,
  45     error_to_compat_str,
  46     ExtractorError,
  47     extract_attributes,
  48     fix_xml_ampersands,
  49     float_or_none,
  50     GeoRestrictedError,
  51     GeoUtils,
  52     int_or_none,
  53     js_to_json,
  54     mimetype2ext,
  55     orderedSet,
  56     parse_codecs,
  57     parse_duration,
  58     parse_iso8601,
  59     parse_m3u8_attributes,
  60     RegexNotFoundError,
  61     sanitized_Request,
  62     sanitize_filename,
  63     unescapeHTML,
  64     unified_strdate,
  65     unified_timestamp,
  66     update_Request,
  67     update_url_query,
  68     urljoin,
  69     url_basename,
  70     xpath_element,
  71     xpath_text,
  72     xpath_with_ns,
  73 )
  74
  75
  76 class InfoExtractor(object):
  77     """Information Extractor class.
  78
  79     Information extractors are the classes that, given a URL, extract
  80     information about the video (or videos) the URL refers to. This
  81     information includes the real video URL, the video title, author and
  82     others. The information is stored in a dictionary which is then
  83     passed to the YoutubeDL. The YoutubeDL processes this
  84     information possibly downloading the video to the file system, among
  85     other possible outcomes.
  86
  87     The type field determines the type of the result.
  88     By far the most common value (and the default if _type is missing) is
  89     "video", which indicates a single video.
  90
  91     For a video, the dictionaries must include the following fields:
  92
  93     id:             Video identifier.
  94     title:          Video title, unescaped.
  95
  96     Additionally, it must contain either a formats entry or a url one:
  97
  98     formats:        A list of dictionaries for each format available, ordered
  99                     from worst to best quality.
 100
 101                     Potential fields:
 102                     * url        Mandatory. The URL of the video file
 103                     * manifest_url
 104                                  The URL of the manifest file in case of
 105                                  fragmented media (DASH, hls, hds)
 106                     * ext        Will be calculated from URL if missing
 107                     * format     A human-readable description of the format
 108                                  ("mp4 container with h264/opus").
 109                                  Calculated from the format_id, width, height,
 110                                  and format_note fields if missing.
 111                     * format_id  A short description of the format
 112                                  ("mp4_h264_opus" or "19").
 113                                 Technically optional, but strongly recommended.
 114                     * format_note Additional info about the format
 115                                  ("3D" or "DASH video")
 116                     * width      Width of the video, if known
 117                     * height     Height of the video, if known
 118                     * resolution Textual description of width and height
 119                     * tbr        Average bitrate of audio and video in KBit/s
 120                     * abr        Average audio bitrate in KBit/s
 121                     * acodec     Name of the audio codec in use
 122                     * asr        Audio sampling rate in Hertz
 123                     * vbr        Average video bitrate in KBit/s
 124                     * fps        Frame rate
 125                     * vcodec     Name of the video codec in use
 126                     * container  Name of the container format
 127                     * filesize   The number of bytes, if known in advance
 128                     * filesize_approx  An estimate for the number of bytes
 129                     * player_url SWF Player URL (used for rtmpdump).
 130                     * protocol   The protocol that will be used for the actual
 131                                  download, lower-case.
 132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 133                                  "m3u8", "m3u8_native" or "http_dash_segments".
 134                     * fragment_base_url
 135                                  Base URL for fragments. Each fragment's path
 136                                  value (if present) will be relative to
 137                                  this URL.
 138                     * fragments  A list of fragments of a fragmented media.
 139                                  Each fragment entry must contain either an url
 140                                  or a path. If an url is present it should be
 141                                  considered by a client. Otherwise both path and
 142                                  fragment_base_url must be present. Here is
 143                                  the list of all potential fields:
 144                                  * "url" - fragment's URL
 145                                  * "path" - fragment's path relative to
 146                                             fragment_base_url
 147                                  * "duration" (optional, int or float)
 148                                  * "filesize" (optional, int)
 149                     * preference Order number of this format. If this field is
 150                                  present and not None, the formats get sorted
 151                                  by this field, regardless of all other values.
 152                                  -1 for default (order by other properties),
 153                                  -2 or smaller for less than default.
 154                                  < -1000 to hide the format (if there is
 155                                     another one which is strictly better)
 156                     * language   Language code, e.g. "de" or "en-US".
 157                     * language_preference  Is this in the language mentioned in
 158                                  the URL?
 159                                  10 if it's what the URL is about,
 160                                  -1 for default (don't know),
 161                                  -10 otherwise, other values reserved for now.
 162                     * quality    Order number of the video quality of this
 163                                  format, irrespective of the file format.
 164                                  -1 for default (order by other properties),
 165                                  -2 or smaller for less than default.
 166                     * source_preference  Order number for this video source
 167                                   (quality takes higher priority)
 168                                  -1 for default (order by other properties),
 169                                  -2 or smaller for less than default.
 170                     * http_headers  A dictionary of additional HTTP headers
 171                                  to add to the request.
 172                     * stretched_ratio  If given and not 1, indicates that the
 173                                  video's pixels are not square.
 174                                  width : height ratio as float.
 175                     * no_resume  The server does not support resuming the
 176                                  (HTTP or RTMP) download. Boolean.
 177
 178     url:            Final video URL.
 179     ext:            Video filename extension.
 180     format:         The video format, defaults to ext (used for --get-format)
 181     player_url:     SWF Player URL (used for rtmpdump).
 182
 183     The following fields are optional:
 184
 185     alt_title:      A secondary title of the video.
 186     display_id      An alternative identifier for the video, not necessarily
 187                     unique, but available before title. Typically, id is
 188                     something like "4234987", title "Dancing naked mole rats",
 189                     and display_id "dancing-naked-mole-rats"
 190     thumbnails:     A list of dictionaries, with the following entries:
 191                         * "id" (optional, string) - Thumbnail format ID
 192                         * "url"
 193                         * "preference" (optional, int) - quality of the image
 194                         * "width" (optional, int)
 195                         * "height" (optional, int)
 196                         * "resolution" (optional, string "{width}x{height}",
 197                                         deprecated)
 198                         * "filesize" (optional, int)
 199     thumbnail:      Full URL to a video thumbnail image.
 200     description:    Full video description.
 201     uploader:       Full name of the video uploader.
 202     license:        License name the video is licensed under.
 203     creator:        The creator of the video.
 204     release_date:   The date (YYYYMMDD) when the video was released.
 205     timestamp:      UNIX timestamp of the moment the video became available.
 206     upload_date:    Video upload date (YYYYMMDD).
 207                     If not explicitly set, calculated from timestamp.
 208     uploader_id:    Nickname or id of the video uploader.
 209     uploader_url:   Full URL to a personal webpage of the video uploader.
 210     location:       Physical location where the video was filmed.
 211     subtitles:      The available subtitles as a dictionary in the format
 212                     {tag: subformats}. "tag" is usually a language code, and
 213                     "subformats" is a list sorted from lower to higher
 214                     preference, each element is a dictionary with the "ext"
 215                     entry and one of:
 216                         * "data": The subtitles file contents
 217                         * "url": A URL pointing to the subtitles file
 218                     "ext" will be calculated from URL if missing
 219     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 220                     automatically generated captions
 221     duration:       Length of the video in seconds, as an integer or float.
 222     view_count:     How many users have watched the video on the platform.
 223     like_count:     Number of positive ratings of the video
 224     dislike_count:  Number of negative ratings of the video
 225     repost_count:   Number of reposts of the video
 226     average_rating: Average rating given by users, the scale used depends on the webpage
 227     comment_count:  Number of comments on the video
 228     comments:       A list of comments, each with one or more of the following
 229                     properties (all but one of text or html optional):
 230                         * "author" - human-readable name of the comment author
 231                         * "author_id" - user ID of the comment author
 232                         * "id" - Comment ID
 233                         * "html" - Comment as HTML
 234                         * "text" - Plain text of the comment
 235                         * "timestamp" - UNIX timestamp of comment
 236                         * "parent" - ID of the comment this one is replying to.
 237                                      Set to "root" to indicate that this is a
 238                                      comment to the original video.
 239     age_limit:      Age restriction for the video, as an integer (years)
 240     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 241                     should allow to get the same result again. (It will be set
 242                     by YoutubeDL if it's missing)
 243     categories:     A list of categories that the video falls in, for example
 244                     ["Sports", "Berlin"]
 245     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 246     is_live:        True, False, or None (=unknown). Whether this video is a
 247                     live stream that goes on instead of a fixed-length video.
 248     start_time:     Time in seconds where the reproduction should start, as
 249                     specified in the URL.
 250     end_time:       Time in seconds where the reproduction should end, as
 251                     specified in the URL.
 252     chapters:       A list of dictionaries, with the following entries:
 253                         * "start_time" - The start time of the chapter in seconds
 254                         * "end_time" - The end time of the chapter in seconds
 255                         * "title" (optional, string)
 256
 257     The following fields should only be used when the video belongs to some logical
 258     chapter or section:
 259
 260     chapter:        Name or title of the chapter the video belongs to.
 261     chapter_number: Number of the chapter the video belongs to, as an integer.
 262     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 263
 264     The following fields should only be used when the video is an episode of some
 265     series, programme or podcast:
 266
 267     series:         Title of the series or programme the video episode belongs to.
 268     season:         Title of the season the video episode belongs to.
 269     season_number:  Number of the season the video episode belongs to, as an integer.
 270     season_id:      Id of the season the video episode belongs to, as a unicode string.
 271     episode:        Title of the video episode. Unlike mandatory video title field,
 272                     this field should denote the exact title of the video episode
 273                     without any kind of decoration.
 274     episode_number: Number of the video episode within a season, as an integer.
 275     episode_id:     Id of the video episode, as a unicode string.
 276
 277     The following fields should only be used when the media is a track or a part of
 278     a music album:
 279
 280     track:          Title of the track.
 281     track_number:   Number of the track within an album or a disc, as an integer.
 282     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 283                     as a unicode string.
 284     artist:         Artist(s) of the track.
 285     genre:          Genre(s) of the track.
 286     album:          Title of the album the track belongs to.
 287     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 288     album_artist:   List of all artists appeared on the album (e.g.
 289                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 290                     and compilations).
 291     disc_number:    Number of the disc or other physical medium the track belongs to,
 292                     as an integer.
 293     release_year:   Year (YYYY) when the album was released.
 294
 295     Unless mentioned otherwise, the fields should be Unicode strings.
 296
 297     Unless mentioned otherwise, None is equivalent to absence of information.
 298
 299
 300     _type "playlist" indicates multiple videos.
 301     There must be a key "entries", which is a list, an iterable, or a PagedList
 302     object, each element of which is a valid dictionary by this specification.
 303
 304     Additionally, playlists can have "title", "description" and "id" attributes
 305     with the same semantics as videos (see above).
 306
 307
 308     _type "multi_video" indicates that there are multiple videos that
 309     form a single show, for example multiple acts of an opera or TV episode.
 310     It must have an entries key like a playlist and contain all the keys
 311     required for a video at the same time.
 312
 313
 314     _type "url" indicates that the video must be extracted from another
 315     location, possibly by a different extractor. Its only required key is:
 316     "url" - the next URL to extract.
 317     The key "ie_key" can be set to the class name (minus the trailing "IE",
 318     e.g. "Youtube") if the extractor class is known in advance.
 319     Additionally, the dictionary may have any properties of the resolved entity
 320     known in advance, for example "title" if the title of the referred video is
 321     known ahead of time.
 322
 323
 324     _type "url_transparent" entities have the same specification as "url", but
 325     indicate that the given additional information is more precise than the one
 326     associated with the resolved URL.
 327     This is useful when a site employs a video service that hosts the video and
 328     its technical metadata, but that video service does not embed a useful
 329     title, description etc.
 330
 331
 332     Subclasses of this one should re-define the _real_initialize() and
 333     _real_extract() methods and define a _VALID_URL regexp.
 334     Probably, they should also be added to the list of extractors.
 335
 336     _GEO_BYPASS attribute may be set to False in order to disable
 337     geo restriction bypass mechanisms for a particular extractor.
 338     Though it won't disable explicit geo restriction bypass based on
 339     country code provided with geo_bypass_country. (experimental)
 340
 341     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 342     countries for this extractor. One of these countries will be used by
 343     geo restriction bypass mechanism right away in order to bypass
 344     geo restriction, of course, if the mechanism is not disabled. (experimental)
 345
 346     NB: both these geo attributes are experimental and may change in future
 347     or be completely removed.
 348
 349     Finally, the _WORKING attribute should be set to False for broken IEs
 350     in order to warn the users and skip the tests.
 351     """
 352
 353     _ready = False
 354     _downloader = None
 355     _x_forwarded_for_ip = None
 356     _GEO_BYPASS = True
 357     _GEO_COUNTRIES = None
 358     _WORKING = True
 359
 360     def __init__(self, downloader=None):
 361         """Constructor. Receives an optional downloader."""
 362         self._ready = False
 363         self._x_forwarded_for_ip = None
 364         self.set_downloader(downloader)
 365
 366     @classmethod
 367     def suitable(cls, url):
 368         """Receives a URL and returns True if suitable for this IE."""
 369
 370         # This does not use has/getattr intentionally - we want to know whether
 371         # we have cached the regexp for *this* class, whereas getattr would also
 372         # match the superclass
 373         if '_VALID_URL_RE' not in cls.__dict__:
 374             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 375         return cls._VALID_URL_RE.match(url) is not None
 376
 377     @classmethod
 378     def _match_id(cls, url):
 379         if '_VALID_URL_RE' not in cls.__dict__:
 380             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 381         m = cls._VALID_URL_RE.match(url)
 382         assert m
 383         return compat_str(m.group('id'))
 384
 385     @classmethod
 386     def working(cls):
 387         """Getter method for _WORKING."""
 388         return cls._WORKING
 389
 390     def initialize(self):
 391         """Initializes an instance (authentication, etc)."""
 392         self._initialize_geo_bypass(self._GEO_COUNTRIES)
 393         if not self._ready:
 394             self._real_initialize()
 395             self._ready = True
 396
 397     def _initialize_geo_bypass(self, countries):
 398         """
 399         Initialize geo restriction bypass mechanism.
 400
 401         This method is used to initialize geo bypass mechanism based on faking
 402         X-Forwarded-For HTTP header. A random country from provided country list
 403         is selected and a random IP belonging to this country is generated. This
 404         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 405         HTTP requests.
 406
 407         This method will be used for initial geo bypass mechanism initialization
 408         during the instance initialization with _GEO_COUNTRIES.
 409
 410         You may also manually call it from extractor's code if geo countries
 411         information is not available beforehand (e.g. obtained during
 412         extraction) or due to some another reason.
 413         """
 414         if not self._x_forwarded_for_ip:
 415             country_code = self._downloader.params.get('geo_bypass_country', None)
 416             # If there is no explicit country for geo bypass specified and
 417             # the extractor is known to be geo restricted let's fake IP
 418             # as X-Forwarded-For right away.
 419             if (not country_code and
 420                     self._GEO_BYPASS and
 421                     self._downloader.params.get('geo_bypass', True) and
 422                     countries):
 423                 country_code = random.choice(countries)
 424             if country_code:
 425                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 426                 if self._downloader.params.get('verbose', False):
 427                     self._downloader.to_screen(
 428                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 429                         % (self._x_forwarded_for_ip, country_code.upper()))
 430
 431     def extract(self, url):
 432         """Extracts URL information and returns it in list of dicts."""
 433         try:
 434             for _ in range(2):
 435                 try:
 436                     self.initialize()
 437                     ie_result = self._real_extract(url)
 438                     if self._x_forwarded_for_ip:
 439                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 440                     return ie_result
 441                 except GeoRestrictedError as e:
 442                     if self.__maybe_fake_ip_and_retry(e.countries):
 443                         continue
 444                     raise
 445         except ExtractorError:
 446             raise
 447         except compat_http_client.IncompleteRead as e:
 448             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 449         except (KeyError, StopIteration) as e:
 450             raise ExtractorError('An extractor error has occurred.', cause=e)
 451
 452     def __maybe_fake_ip_and_retry(self, countries):
 453         if (not self._downloader.params.get('geo_bypass_country', None) and
 454                 self._GEO_BYPASS and
 455                 self._downloader.params.get('geo_bypass', True) and
 456                 not self._x_forwarded_for_ip and
 457                 countries):
 458             country_code = random.choice(countries)
 459             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 460             if self._x_forwarded_for_ip:
 461                 self.report_warning(
 462                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 463                     % (self._x_forwarded_for_ip, country_code.upper()))
 464                 return True
 465         return False
 466
 467     def set_downloader(self, downloader):
 468         """Sets the downloader for this IE."""
 469         self._downloader = downloader
 470
 471     def _real_initialize(self):
 472         """Real initialization process. Redefine in subclasses."""
 473         pass
 474
 475     def _real_extract(self, url):
 476         """Real extraction process. Redefine in subclasses."""
 477         pass
 478
 479     @classmethod
 480     def ie_key(cls):
 481         """A string for getting the InfoExtractor with get_info_extractor"""
 482         return compat_str(cls.__name__[:-2])
 483
 484     @property
 485     def IE_NAME(self):
 486         return compat_str(type(self).__name__[:-2])
 487
 488     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 489         """ Returns the response handle """
 490         if note is None:
 491             self.report_download_webpage(video_id)
 492         elif note is not False:
 493             if video_id is None:
 494                 self.to_screen('%s' % (note,))
 495             else:
 496                 self.to_screen('%s: %s' % (video_id, note))
 497         if isinstance(url_or_request, compat_urllib_request.Request):
 498             url_or_request = update_Request(
 499                 url_or_request, data=data, headers=headers, query=query)
 500         else:
 501             if query:
 502                 url_or_request = update_url_query(url_or_request, query)
 503             if data is not None or headers:
 504                 url_or_request = sanitized_Request(url_or_request, data, headers)
 505         try:
 506             return self._downloader.urlopen(url_or_request)
 507         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 508             if errnote is False:
 509                 return False
 510             if errnote is None:
 511                 errnote = 'Unable to download webpage'
 512
 513             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 514             if fatal:
 515                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 516             else:
 517                 self._downloader.report_warning(errmsg)
 518                 return False
 519
 520     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 521         """ Returns a tuple (page content as string, URL handle) """
 522         # Strip hashes from the URL (#1038)
 523         if isinstance(url_or_request, (compat_str, str)):
 524             url_or_request = url_or_request.partition('#')[0]
 525
 526         # Some sites check X-Forwarded-For HTTP header in order to figure out
 527         # the origin of the client behind proxy. This allows bypassing geo
 528         # restriction by faking this header's value to IP that belongs to some
 529         # geo unrestricted country. We will do so once we encounter any
 530         # geo restriction error.
 531         if self._x_forwarded_for_ip:
 532             if 'X-Forwarded-For' not in headers:
 533                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 534
 535         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 536         if urlh is False:
 537             assert not fatal
 538             return False
 539         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 540         return (content, urlh)
 541
 542     @staticmethod
 543     def _guess_encoding_from_content(content_type, webpage_bytes):
 544         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 545         if m:
 546             encoding = m.group(1)
 547         else:
 548             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 549                           webpage_bytes[:1024])
 550             if m:
 551                 encoding = m.group(1).decode('ascii')
 552             elif webpage_bytes.startswith(b'\xff\xfe'):
 553                 encoding = 'utf-16'
 554             else:
 555                 encoding = 'utf-8'
 556
 557         return encoding
 558
 559     def __check_blocked(self, content):
 560         first_block = content[:512]
 561         if ('<title>Access to this site is blocked</title>' in content and
 562                 'Websense' in first_block):
 563             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 564             blocked_iframe = self._html_search_regex(
 565                 r'<iframe src="([^"]+)"', content,
 566                 'Websense information URL', default=None)
 567             if blocked_iframe:
 568                 msg += ' Visit %s for more details' % blocked_iframe
 569             raise ExtractorError(msg, expected=True)
 570         if '<title>The URL you requested has been blocked</title>' in first_block:
 571             msg = (
 572                 'Access to this webpage has been blocked by Indian censorship. '
 573                 'Use a VPN or proxy server (with --proxy) to route around it.')
 574             block_msg = self._html_search_regex(
 575                 r'</h1><p>(.*?)</p>',
 576                 content, 'block message', default=None)
 577             if block_msg:
 578                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 579             raise ExtractorError(msg, expected=True)
 580         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 581                 'blocklist.rkn.gov.ru' in content):
 582             raise ExtractorError(
 583                 'Access to this webpage has been blocked by decision of the Russian government. '
 584                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 585                 expected=True)
 586
 587     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 588         content_type = urlh.headers.get('Content-Type', '')
 589         webpage_bytes = urlh.read()
 590         if prefix is not None:
 591             webpage_bytes = prefix + webpage_bytes
 592         if not encoding:
 593             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 594         if self._downloader.params.get('dump_intermediate_pages', False):
 595             try:
 596                 url = url_or_request.get_full_url()
 597             except AttributeError:
 598                 url = url_or_request
 599             self.to_screen('Dumping request to ' + url)
 600             dump = base64.b64encode(webpage_bytes).decode('ascii')
 601             self._downloader.to_screen(dump)
 602         if self._downloader.params.get('write_pages', False):
 603             try:
 604                 url = url_or_request.get_full_url()
 605             except AttributeError:
 606                 url = url_or_request
 607             basen = '%s_%s' % (video_id, url)
 608             if len(basen) > 240:
 609                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 610                 basen = basen[:240 - len(h)] + h
 611             raw_filename = basen + '.dump'
 612             filename = sanitize_filename(raw_filename, restricted=True)
 613             self.to_screen('Saving request to ' + filename)
 614             # Working around MAX_PATH limitation on Windows (see
 615             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 616             if compat_os_name == 'nt':
 617                 absfilepath = os.path.abspath(filename)
 618                 if len(absfilepath) > 259:
 619                     filename = '\\\\?\\' + absfilepath
 620             with open(filename, 'wb') as outf:
 621                 outf.write(webpage_bytes)
 622
 623         try:
 624             content = webpage_bytes.decode(encoding, 'replace')
 625         except LookupError:
 626             content = webpage_bytes.decode('utf-8', 'replace')
 627
 628         self.__check_blocked(content)
 629
 630         return content
 631
 632     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 633         """ Returns the data of the page as a string """
 634         success = False
 635         try_count = 0
 636         while success is False:
 637             try:
 638                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 639                 success = True
 640             except compat_http_client.IncompleteRead as e:
 641                 try_count += 1
 642                 if try_count >= tries:
 643                     raise e
 644                 self._sleep(timeout, video_id)
 645         if res is False:
 646             return res
 647         else:
 648             content, _ = res
 649             return content
 650
 651     def _download_xml(self, url_or_request, video_id,
 652                       note='Downloading XML', errnote='Unable to download XML',
 653                       transform_source=None, fatal=True, encoding=None,
 654                       data=None, headers={}, query={}):
 655         """Return the xml as an xml.etree.ElementTree.Element"""
 656         xml_string = self._download_webpage(
 657             url_or_request, video_id, note, errnote, fatal=fatal,
 658             encoding=encoding, data=data, headers=headers, query=query)
 659         if xml_string is False:
 660             return xml_string
 661         return self._parse_xml(
 662             xml_string, video_id, transform_source=transform_source,
 663             fatal=fatal)
 664
 665     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 666         if transform_source:
 667             xml_string = transform_source(xml_string)
 668         try:
 669             return compat_etree_fromstring(xml_string.encode('utf-8'))
 670         except compat_xml_parse_error as ve:
 671             errmsg = '%s: Failed to parse XML ' % video_id
 672             if fatal:
 673                 raise ExtractorError(errmsg, cause=ve)
 674             else:
 675                 self.report_warning(errmsg + str(ve))
 676
 677     def _download_json(self, url_or_request, video_id,
 678                        note='Downloading JSON metadata',
 679                        errnote='Unable to download JSON metadata',
 680                        transform_source=None,
 681                        fatal=True, encoding=None, data=None, headers={}, query={}):
 682         json_string = self._download_webpage(
 683             url_or_request, video_id, note, errnote, fatal=fatal,
 684             encoding=encoding, data=data, headers=headers, query=query)
 685         if (not fatal) and json_string is False:
 686             return None
 687         return self._parse_json(
 688             json_string, video_id, transform_source=transform_source, fatal=fatal)
 689
 690     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 691         if transform_source:
 692             json_string = transform_source(json_string)
 693         try:
 694             return json.loads(json_string)
 695         except ValueError as ve:
 696             errmsg = '%s: Failed to parse JSON ' % video_id
 697             if fatal:
 698                 raise ExtractorError(errmsg, cause=ve)
 699             else:
 700                 self.report_warning(errmsg + str(ve))
 701
 702     def report_warning(self, msg, video_id=None):
 703         idstr = '' if video_id is None else '%s: ' % video_id
 704         self._downloader.report_warning(
 705             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 706
 707     def to_screen(self, msg):
 708         """Print msg to screen, prefixing it with '[ie_name]'"""
 709         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 710
 711     def report_extraction(self, id_or_name):
 712         """Report information extraction."""
 713         self.to_screen('%s: Extracting information' % id_or_name)
 714
 715     def report_download_webpage(self, video_id):
 716         """Report webpage download."""
 717         self.to_screen('%s: Downloading webpage' % video_id)
 718
 719     def report_age_confirmation(self):
 720         """Report attempt to confirm age."""
 721         self.to_screen('Confirming age')
 722
 723     def report_login(self):
 724         """Report attempt to log in."""
 725         self.to_screen('Logging in')
 726
 727     @staticmethod
 728     def raise_login_required(msg='This video is only available for registered users'):
 729         raise ExtractorError(
 730             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 731             expected=True)
 732
 733     @staticmethod
 734     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 735         raise GeoRestrictedError(msg, countries=countries)
 736
 737     # Methods for following #608
 738     @staticmethod
 739     def url_result(url, ie=None, video_id=None, video_title=None):
 740         """Returns a URL that points to a page that should be processed"""
 741         # TODO: ie should be the class used for getting the info
 742         video_info = {'_type': 'url',
 743                       'url': url,
 744                       'ie_key': ie}
 745         if video_id is not None:
 746             video_info['id'] = video_id
 747         if video_title is not None:
 748             video_info['title'] = video_title
 749         return video_info
 750
 751     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 752         urls = orderedSet(
 753             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 754             for m in matches)
 755         return self.playlist_result(
 756             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 757
 758     @staticmethod
 759     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 760         """Returns a playlist"""
 761         video_info = {'_type': 'playlist',
 762                       'entries': entries}
 763         if playlist_id:
 764             video_info['id'] = playlist_id
 765         if playlist_title:
 766             video_info['title'] = playlist_title
 767         if playlist_description:
 768             video_info['description'] = playlist_description
 769         return video_info
 770
 771     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 772         """
 773         Perform a regex search on the given string, using a single or a list of
 774         patterns returning the first matching group.
 775         In case of failure return a default value or raise a WARNING or a
 776         RegexNotFoundError, depending on fatal, specifying the field name.
 777         """
 778         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 779             mobj = re.search(pattern, string, flags)
 780         else:
 781             for p in pattern:
 782                 mobj = re.search(p, string, flags)
 783                 if mobj:
 784                     break
 785
 786         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 787             _name = '\033[0;34m%s\033[0m' % name
 788         else:
 789             _name = name
 790
 791         if mobj:
 792             if group is None:
 793                 # return the first matching group
 794                 return next(g for g in mobj.groups() if g is not None)
 795             else:
 796                 return mobj.group(group)
 797         elif default is not NO_DEFAULT:
 798             return default
 799         elif fatal:
 800             raise RegexNotFoundError('Unable to extract %s' % _name)
 801         else:
 802             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 803             return None
 804
 805     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 806         """
 807         Like _search_regex, but strips HTML tags and unescapes entities.
 808         """
 809         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 810         if res:
 811             return clean_html(res).strip()
 812         else:
 813             return res
 814
 815     def _get_netrc_login_info(self, netrc_machine=None):
 816         username = None
 817         password = None
 818         netrc_machine = netrc_machine or self._NETRC_MACHINE
 819
 820         if self._downloader.params.get('usenetrc', False):
 821             try:
 822                 info = netrc.netrc().authenticators(netrc_machine)
 823                 if info is not None:
 824                     username = info[0]
 825                     password = info[2]
 826                 else:
 827                     raise netrc.NetrcParseError(
 828                         'No authenticators for %s' % netrc_machine)
 829             except (IOError, netrc.NetrcParseError) as err:
 830                 self._downloader.report_warning(
 831                     'parsing .netrc: %s' % error_to_compat_str(err))
 832
 833         return username, password
 834
 835     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 836         """
 837         Get the login info as (username, password)
 838         First look for the manually specified credentials using username_option
 839         and password_option as keys in params dictionary. If no such credentials
 840         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 841         value.
 842         If there's no info available, return (None, None)
 843         """
 844         if self._downloader is None:
 845             return (None, None)
 846
 847         downloader_params = self._downloader.params
 848
 849         # Attempt to use provided username and password or .netrc data
 850         if downloader_params.get(username_option) is not None:
 851             username = downloader_params[username_option]
 852             password = downloader_params[password_option]
 853         else:
 854             username, password = self._get_netrc_login_info(netrc_machine)
 855
 856         return username, password
 857
 858     def _get_tfa_info(self, note='two-factor verification code'):
 859         """
 860         Get the two-factor authentication info
 861         TODO - asking the user will be required for sms/phone verify
 862         currently just uses the command line option
 863         If there's no info available, return None
 864         """
 865         if self._downloader is None:
 866             return None
 867         downloader_params = self._downloader.params
 868
 869         if downloader_params.get('twofactor') is not None:
 870             return downloader_params['twofactor']
 871
 872         return compat_getpass('Type %s and press [Return]: ' % note)
 873
 874     # Helper functions for extracting OpenGraph info
 875     @staticmethod
 876     def _og_regexes(prop):
 877         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 878         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 879                        % {'prop': re.escape(prop)})
 880         template = r'<meta[^>]+?%s[^>]+?%s'
 881         return [
 882             template % (property_re, content_re),
 883             template % (content_re, property_re),
 884         ]
 885
 886     @staticmethod
 887     def _meta_regex(prop):
 888         return r'''(?isx)<meta
 889                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 890                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 891
 892     def _og_search_property(self, prop, html, name=None, **kargs):
 893         if not isinstance(prop, (list, tuple)):
 894             prop = [prop]
 895         if name is None:
 896             name = 'OpenGraph %s' % prop[0]
 897         og_regexes = []
 898         for p in prop:
 899             og_regexes.extend(self._og_regexes(p))
 900         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 901         if escaped is None:
 902             return None
 903         return unescapeHTML(escaped)
 904
 905     def _og_search_thumbnail(self, html, **kargs):
 906         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 907
 908     def _og_search_description(self, html, **kargs):
 909         return self._og_search_property('description', html, fatal=False, **kargs)
 910
 911     def _og_search_title(self, html, **kargs):
 912         return self._og_search_property('title', html, **kargs)
 913
 914     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 915         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 916         if secure:
 917             regexes = self._og_regexes('video:secure_url') + regexes
 918         return self._html_search_regex(regexes, html, name, **kargs)
 919
 920     def _og_search_url(self, html, **kargs):
 921         return self._og_search_property('url', html, **kargs)
 922
 923     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 924         if not isinstance(name, (list, tuple)):
 925             name = [name]
 926         if display_name is None:
 927             display_name = name[0]
 928         return self._html_search_regex(
 929             [self._meta_regex(n) for n in name],
 930             html, display_name, fatal=fatal, group='content', **kwargs)
 931
 932     def _dc_search_uploader(self, html):
 933         return self._html_search_meta('dc.creator', html, 'uploader')
 934
 935     def _rta_search(self, html):
 936         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 937         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 938                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 939                      html):
 940             return 18
 941         return 0
 942
 943     def _media_rating_search(self, html):
 944         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 945         rating = self._html_search_meta('rating', html)
 946
 947         if not rating:
 948             return None
 949
 950         RATING_TABLE = {
 951             'safe for kids': 0,
 952             'general': 8,
 953             '14 years': 14,
 954             'mature': 17,
 955             'restricted': 19,
 956         }
 957         return RATING_TABLE.get(rating.lower())
 958
 959     def _family_friendly_search(self, html):
 960         # See http://schema.org/VideoObject
 961         family_friendly = self._html_search_meta(
 962             'isFamilyFriendly', html, default=None)
 963
 964         if not family_friendly:
 965             return None
 966
 967         RATING_TABLE = {
 968             '1': 0,
 969             'true': 0,
 970             '0': 18,
 971             'false': 18,
 972         }
 973         return RATING_TABLE.get(family_friendly.lower())
 974
 975     def _twitter_search_player(self, html):
 976         return self._html_search_meta('twitter:player', html,
 977                                       'twitter card player')
 978
 979     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 980         json_ld = self._search_regex(
 981             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 982             html, 'JSON-LD', group='json_ld', **kwargs)
 983         default = kwargs.get('default', NO_DEFAULT)
 984         if not json_ld:
 985             return default if default is not NO_DEFAULT else {}
 986         # JSON-LD may be malformed and thus `fatal` should be respected.
 987         # At the same time `default` may be passed that assumes `fatal=False`
 988         # for _search_regex. Let's simulate the same behavior here as well.
 989         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 990         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 991
 992     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 993         if isinstance(json_ld, compat_str):
 994             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 995         if not json_ld:
 996             return {}
 997         info = {}
 998         if not isinstance(json_ld, (list, tuple, dict)):
 999             return info
1000         if isinstance(json_ld, dict):
1001             json_ld = [json_ld]
1002
1003         def extract_video_object(e):
1004             assert e['@type'] == 'VideoObject'
1005             info.update({
1006                 'url': e.get('contentUrl'),
1007                 'title': unescapeHTML(e.get('name')),
1008                 'description': unescapeHTML(e.get('description')),
1009                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1010                 'duration': parse_duration(e.get('duration')),
1011                 'timestamp': unified_timestamp(e.get('uploadDate')),
1012                 'filesize': float_or_none(e.get('contentSize')),
1013                 'tbr': int_or_none(e.get('bitrate')),
1014                 'width': int_or_none(e.get('width')),
1015                 'height': int_or_none(e.get('height')),
1016                 'view_count': int_or_none(e.get('interactionCount')),
1017             })
1018
1019         for e in json_ld:
1020             if e.get('@context') == 'http://schema.org':
1021                 item_type = e.get('@type')
1022                 if expected_type is not None and expected_type != item_type:
1023                     return info
1024                 if item_type in ('TVEpisode', 'Episode'):
1025                     info.update({
1026                         'episode': unescapeHTML(e.get('name')),
1027                         'episode_number': int_or_none(e.get('episodeNumber')),
1028                         'description': unescapeHTML(e.get('description')),
1029                     })
1030                     part_of_season = e.get('partOfSeason')
1031                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1032                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1033                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1034                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1035                         info['series'] = unescapeHTML(part_of_series.get('name'))
1036                 elif item_type == 'Article':
1037                     info.update({
1038                         'timestamp': parse_iso8601(e.get('datePublished')),
1039                         'title': unescapeHTML(e.get('headline')),
1040                         'description': unescapeHTML(e.get('articleBody')),
1041                     })
1042                 elif item_type == 'VideoObject':
1043                     extract_video_object(e)
1044                     continue
1045                 video = e.get('video')
1046                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1047                     extract_video_object(video)
1048                 break
1049         return dict((k, v) for k, v in info.items() if v is not None)
1050
1051     @staticmethod
1052     def _hidden_inputs(html):
1053         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1054         hidden_inputs = {}
1055         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1056             attrs = extract_attributes(input)
1057             if not input:
1058                 continue
1059             if attrs.get('type') not in ('hidden', 'submit'):
1060                 continue
1061             name = attrs.get('name') or attrs.get('id')
1062             value = attrs.get('value')
1063             if name and value is not None:
1064                 hidden_inputs[name] = value
1065         return hidden_inputs
1066
1067     def _form_hidden_inputs(self, form_id, html):
1068         form = self._search_regex(
1069             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1070             html, '%s form' % form_id, group='form')
1071         return self._hidden_inputs(form)
1072
1073     def _sort_formats(self, formats, field_preference=None):
1074         if not formats:
1075             raise ExtractorError('No video formats found')
1076
1077         for f in formats:
1078             # Automatically determine tbr when missing based on abr and vbr (improves
1079             # formats sorting in some cases)
1080             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1081                 f['tbr'] = f['abr'] + f['vbr']
1082
1083         def _formats_key(f):
1084             # TODO remove the following workaround
1085             from ..utils import determine_ext
1086             if not f.get('ext') and 'url' in f:
1087                 f['ext'] = determine_ext(f['url'])
1088
1089             if isinstance(field_preference, (list, tuple)):
1090                 return tuple(
1091                     f.get(field)
1092                     if f.get(field) is not None
1093                     else ('' if field == 'format_id' else -1)
1094                     for field in field_preference)
1095
1096             preference = f.get('preference')
1097             if preference is None:
1098                 preference = 0
1099                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1100                     preference -= 0.5
1101
1102             protocol = f.get('protocol') or determine_protocol(f)
1103             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1104
1105             if f.get('vcodec') == 'none':  # audio only
1106                 preference -= 50
1107                 if self._downloader.params.get('prefer_free_formats'):
1108                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1109                 else:
1110                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1111                 ext_preference = 0
1112                 try:
1113                     audio_ext_preference = ORDER.index(f['ext'])
1114                 except ValueError:
1115                     audio_ext_preference = -1
1116             else:
1117                 if f.get('acodec') == 'none':  # video only
1118                     preference -= 40
1119                 if self._downloader.params.get('prefer_free_formats'):
1120                     ORDER = ['flv', 'mp4', 'webm']
1121                 else:
1122                     ORDER = ['webm', 'flv', 'mp4']
1123                 try:
1124                     ext_preference = ORDER.index(f['ext'])
1125                 except ValueError:
1126                     ext_preference = -1
1127                 audio_ext_preference = 0
1128
1129             return (
1130                 preference,
1131                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1132                 f.get('quality') if f.get('quality') is not None else -1,
1133                 f.get('tbr') if f.get('tbr') is not None else -1,
1134                 f.get('filesize') if f.get('filesize') is not None else -1,
1135                 f.get('vbr') if f.get('vbr') is not None else -1,
1136                 f.get('height') if f.get('height') is not None else -1,
1137                 f.get('width') if f.get('width') is not None else -1,
1138                 proto_preference,
1139                 ext_preference,
1140                 f.get('abr') if f.get('abr') is not None else -1,
1141                 audio_ext_preference,
1142                 f.get('fps') if f.get('fps') is not None else -1,
1143                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1144                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1145                 f.get('format_id') if f.get('format_id') is not None else '',
1146             )
1147         formats.sort(key=_formats_key)
1148
1149     def _check_formats(self, formats, video_id):
1150         if formats:
1151             formats[:] = filter(
1152                 lambda f: self._is_valid_url(
1153                     f['url'], video_id,
1154                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1155                 formats)
1156
1157     @staticmethod
1158     def _remove_duplicate_formats(formats):
1159         format_urls = set()
1160         unique_formats = []
1161         for f in formats:
1162             if f['url'] not in format_urls:
1163                 format_urls.add(f['url'])
1164                 unique_formats.append(f)
1165         formats[:] = unique_formats
1166
1167     def _is_valid_url(self, url, video_id, item='video', headers={}):
1168         url = self._proto_relative_url(url, scheme='http:')
1169         # For now assume non HTTP(S) URLs always valid
1170         if not (url.startswith('http://') or url.startswith('https://')):
1171             return True
1172         try:
1173             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1174             return True
1175         except ExtractorError as e:
1176             if isinstance(e.cause, compat_urllib_error.URLError):
1177                 self.to_screen(
1178                     '%s: %s URL is invalid, skipping' % (video_id, item))
1179                 return False
1180             raise
1181
1182     def http_scheme(self):
1183         """ Either "http:" or "https:", depending on the user's preferences """
1184         return (
1185             'http:'
1186             if self._downloader.params.get('prefer_insecure', False)
1187             else 'https:')
1188
1189     def _proto_relative_url(self, url, scheme=None):
1190         if url is None:
1191             return url
1192         if url.startswith('//'):
1193             if scheme is None:
1194                 scheme = self.http_scheme()
1195             return scheme + url
1196         else:
1197             return url
1198
1199     def _sleep(self, timeout, video_id, msg_template=None):
1200         if msg_template is None:
1201             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1202         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1203         self.to_screen(msg)
1204         time.sleep(timeout)
1205
1206     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1207                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1208                              fatal=True, m3u8_id=None):
1209         manifest = self._download_xml(
1210             manifest_url, video_id, 'Downloading f4m manifest',
1211             'Unable to download f4m manifest',
1212             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1213             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1214             transform_source=transform_source,
1215             fatal=fatal)
1216
1217         if manifest is False:
1218             return []
1219
1220         return self._parse_f4m_formats(
1221             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1222             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1223
1224     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1225                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1226                            fatal=True, m3u8_id=None):
1227         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1228         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1229         if akamai_pv is not None and ';' in akamai_pv.text:
1230             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1231             if playerVerificationChallenge.strip() != '':
1232                 return []
1233
1234         formats = []
1235         manifest_version = '1.0'
1236         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1237         if not media_nodes:
1238             manifest_version = '2.0'
1239             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1240         # Remove unsupported DRM protected media from final formats
1241         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1242         media_nodes = remove_encrypted_media(media_nodes)
1243         if not media_nodes:
1244             return formats
1245
1246         manifest_base_url = get_base_url(manifest)
1247
1248         bootstrap_info = xpath_element(
1249             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1250             'bootstrap info', default=None)
1251
1252         vcodec = None
1253         mime_type = xpath_text(
1254             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1255             'base URL', default=None)
1256         if mime_type and mime_type.startswith('audio/'):
1257             vcodec = 'none'
1258
1259         for i, media_el in enumerate(media_nodes):
1260             tbr = int_or_none(media_el.attrib.get('bitrate'))
1261             width = int_or_none(media_el.attrib.get('width'))
1262             height = int_or_none(media_el.attrib.get('height'))
1263             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1264             # If <bootstrapInfo> is present, the specified f4m is a
1265             # stream-level manifest, and only set-level manifests may refer to
1266             # external resources.  See section 11.4 and section 4 of F4M spec
1267             if bootstrap_info is None:
1268                 media_url = None
1269                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1270                 if manifest_version == '2.0':
1271                     media_url = media_el.attrib.get('href')
1272                 if media_url is None:
1273                     media_url = media_el.attrib.get('url')
1274                 if not media_url:
1275                     continue
1276                 manifest_url = (
1277                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1278                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1279                 # If media_url is itself a f4m manifest do the recursive extraction
1280                 # since bitrates in parent manifest (this one) and media_url manifest
1281                 # may differ leading to inability to resolve the format by requested
1282                 # bitrate in f4m downloader
1283                 ext = determine_ext(manifest_url)
1284                 if ext == 'f4m':
1285                     f4m_formats = self._extract_f4m_formats(
1286                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1287                         transform_source=transform_source, fatal=fatal)
1288                     # Sometimes stream-level manifest contains single media entry that
1289                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1290                     # At the same time parent's media entry in set-level manifest may
1291                     # contain it. We will copy it from parent in such cases.
1292                     if len(f4m_formats) == 1:
1293                         f = f4m_formats[0]
1294                         f.update({
1295                             'tbr': f.get('tbr') or tbr,
1296                             'width': f.get('width') or width,
1297                             'height': f.get('height') or height,
1298                             'format_id': f.get('format_id') if not tbr else format_id,
1299                             'vcodec': vcodec,
1300                         })
1301                     formats.extend(f4m_formats)
1302                     continue
1303                 elif ext == 'm3u8':
1304                     formats.extend(self._extract_m3u8_formats(
1305                         manifest_url, video_id, 'mp4', preference=preference,
1306                         m3u8_id=m3u8_id, fatal=fatal))
1307                     continue
1308             formats.append({
1309                 'format_id': format_id,
1310                 'url': manifest_url,
1311                 'manifest_url': manifest_url,
1312                 'ext': 'flv' if bootstrap_info is not None else None,
1313                 'protocol': 'f4m',
1314                 'tbr': tbr,
1315                 'width': width,
1316                 'height': height,
1317                 'vcodec': vcodec,
1318                 'preference': preference,
1319             })
1320         return formats
1321
1322     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1323         return {
1324             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1325             'url': m3u8_url,
1326             'ext': ext,
1327             'protocol': 'm3u8',
1328             'preference': preference - 100 if preference else -100,
1329             'resolution': 'multiple',
1330             'format_note': 'Quality selection URL',
1331         }
1332
1333     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1334                               entry_protocol='m3u8', preference=None,
1335                               m3u8_id=None, note=None, errnote=None,
1336                               fatal=True, live=False):
1337         res = self._download_webpage_handle(
1338             m3u8_url, video_id,
1339             note=note or 'Downloading m3u8 information',
1340             errnote=errnote or 'Failed to download m3u8 information',
1341             fatal=fatal)
1342
1343         if res is False:
1344             return []
1345
1346         m3u8_doc, urlh = res
1347         m3u8_url = urlh.geturl()
1348
1349         return self._parse_m3u8_formats(
1350             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1351             preference=preference, m3u8_id=m3u8_id, live=live)
1352
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Build a list of format dicts from the text of an m3u8 playlist.

        m3u8_doc is the playlist text and m3u8_url the URL it was fetched
        from (used to resolve relative URIs and stored as 'manifest_url').
        ext, entry_protocol and preference are copied verbatim into every
        produced format; m3u8_id, when set, prefixes every format_id.

        Returns an empty list for playlists carrying the Adobe Flash Access
        or Apple FairPlay markers checked below.  A playlist containing
        #EXT-X-TARGETDURATION is treated as a media playlist and yields a
        single format pointing at m3u8_url.  Otherwise one format is
        produced per #EXT-X-MEDIA tag of TYPE VIDEO/AUDIO that has a URI,
        and one per URI line, described by the #EXT-X-STREAM-INF tag that
        precedes it.

        When live is true, neither the stream name nor the bitrate is
        appended to the format_id of URI-line formats.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Absolute http(s) URLs are used as is, anything else is resolved
        # against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without quality renditions.
        # Fortunately, a master playlist can be easily distinguished from a media
        # playlist based on the availability of particular tags. As of
        # [1, 4.3.3, 4.3.4] master playlist tags MUST NOT appear in a media
        # playlist and vice versa.
        # As of [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in a master playlist, thus we can
        # clearly detect a media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of EXT-X-MEDIA attribute dicts, in order of appearance
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset to {} once
        # the URI line following it has been turned into a format
        last_stream_inf = {}

        def extract_media(x_media_line):
            """Record an EXT-X-MEDIA tag in groups and, for VIDEO/AUDIO
            renditions that carry a URI, append a format for it."""
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            # Every well-formed rendition is remembered, whatever its type
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            """Return a name for the current EXT-X-STREAM-INF, or None when
            it has neither a NAME nor a VIDEO group reference."""
            # Although the specification does not mention a NAME attribute
            # for the EXT-X-STREAM-INF tag, it still sometimes may be present
            # (see [1] or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from the corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                # Group not seen (yet): fall back to the group id itself
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                # Any other tag, comment or blank line
                continue
            else:
                # A URI line: described by the preceding EXT-X-STREAM-INF
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains an EXT-X-STREAM-INF tag which references an AUDIO
                # rendition group but does not have CODECS and, despite
                # referencing an audio group, it represents a complete
                # (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    # The first rendition of the referenced audio group has
                    # its own URI, so this variant is marked as having no audio
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                # The tag applies only to the URI line that follows it
                last_stream_inf = {}
        return formats
1511
1512     @staticmethod
1513     def _xpath_ns(path, namespace=None):
1514         if not namespace:
1515             return path
1516         out = []
1517         for c in path.split('/'):
1518             if not c or c == '.':
1519                 out.append(c)
1520             else:
1521                 out.append('{%s}%s' % (namespace, c))
1522         return '/'.join(out)
1523
1524     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1525         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1526
1527         if smil is False:
1528             assert not fatal
1529             return []
1530
1531         namespace = self._parse_smil_namespace(smil)
1532
1533         return self._parse_smil_formats(
1534             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1535
1536     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1537         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1538         if smil is False:
1539             return {}
1540         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1541
1542     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1543         return self._download_xml(
1544             smil_url, video_id, 'Downloading SMIL file',
1545             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1546
1547     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1548         namespace = self._parse_smil_namespace(smil)
1549
1550         formats = self._parse_smil_formats(
1551             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1552         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1553
1554         video_id = os.path.splitext(url_basename(smil_url))[0]
1555         title = None
1556         description = None
1557         upload_date = None
1558         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1559             name = meta.attrib.get('name')
1560             content = meta.attrib.get('content')
1561             if not name or not content:
1562                 continue
1563             if not title and name == 'title':
1564                 title = content
1565             elif not description and name in ('description', 'abstract'):
1566                 description = content
1567             elif not upload_date and name == 'date':
1568                 upload_date = unified_strdate(content)
1569
1570         thumbnails = [{
1571             'id': image.get('type'),
1572             'url': image.get('src'),
1573             'width': int_or_none(image.get('width')),
1574             'height': int_or_none(image.get('height')),
1575         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1576
1577         return {
1578             'id': video_id,
1579             'title': title or video_id,
1580             'description': description,
1581             'upload_date': upload_date,
1582             'thumbnails': thumbnails,
1583             'formats': formats,
1584             'subtitles': subtitles,
1585         }
1586
1587     def _parse_smil_namespace(self, smil):
1588         return self._search_regex(
1589             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1590
1591     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1592         base = smil_url
1593         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1594             b = meta.get('base') or meta.get('httpBase')
1595             if b:
1596                 base = b
1597                 break
1598
1599         formats = []
1600         rtmp_count = 0
1601         http_count = 0
1602         m3u8_count = 0
1603
1604         srcs = []
1605         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1606         for medium in media:
1607             src = medium.get('src')
1608             if not src or src in srcs:
1609                 continue
1610             srcs.append(src)
1611
1612             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1613             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1614             width = int_or_none(medium.get('width'))
1615             height = int_or_none(medium.get('height'))
1616             proto = medium.get('proto')
1617             ext = medium.get('ext')
1618             src_ext = determine_ext(src)
1619             streamer = medium.get('streamer') or base
1620
1621             if proto == 'rtmp' or streamer.startswith('rtmp'):
1622                 rtmp_count += 1
1623                 formats.append({
1624                     'url': streamer,
1625                     'play_path': src,
1626                     'ext': 'flv',
1627                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1628                     'tbr': bitrate,
1629                     'filesize': filesize,
1630                     'width': width,
1631                     'height': height,
1632                 })
1633                 if transform_rtmp_url:
1634                     streamer, src = transform_rtmp_url(streamer, src)
1635                     formats[-1].update({
1636                         'url': streamer,
1637                         'play_path': src,
1638                     })
1639                 continue
1640
1641             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1642             src_url = src_url.strip()
1643
1644             if proto == 'm3u8' or src_ext == 'm3u8':
1645                 m3u8_formats = self._extract_m3u8_formats(
1646                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1647                 if len(m3u8_formats) == 1:
1648                     m3u8_count += 1
1649                     m3u8_formats[0].update({
1650                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1651                         'tbr': bitrate,
1652                         'width': width,
1653                         'height': height,
1654                     })
1655                 formats.extend(m3u8_formats)
1656                 continue
1657
1658             if src_ext == 'f4m':
1659                 f4m_url = src_url
1660                 if not f4m_params:
1661                     f4m_params = {
1662                         'hdcore': '3.2.0',
1663                         'plugin': 'flowplayer-3.2.0.1',
1664                     }
1665                 f4m_url += '&' if '?' in f4m_url else '?'
1666                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1667                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1668                 continue
1669
1670             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1671                 http_count += 1
1672                 formats.append({
1673                     'url': src_url,
1674                     'ext': ext or src_ext or 'flv',
1675                     'format_id': 'http-%d' % (bitrate or http_count),
1676                     'tbr': bitrate,
1677                     'filesize': filesize,
1678                     'width': width,
1679                     'height': height,
1680                 })
1681                 continue
1682
1683         return formats
1684
1685     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1686         urls = []
1687         subtitles = {}
1688         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1689             src = textstream.get('src')
1690             if not src or src in urls:
1691                 continue
1692             urls.append(src)
1693             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1694             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1695             subtitles.setdefault(lang, []).append({
1696                 'url': src,
1697                 'ext': ext,
1698             })
1699         return subtitles
1700
1701     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1702         xspf = self._download_xml(
1703             playlist_url, playlist_id, 'Downloading xpsf playlist',
1704             'Unable to download xspf manifest', fatal=fatal)
1705         if xspf is False:
1706             return []
1707         return self._parse_xspf(xspf, playlist_id)
1708
1709     def _parse_xspf(self, playlist, playlist_id):
1710         NS_MAP = {
1711             'xspf': 'http://xspf.org/ns/0/',
1712             's1': 'http://static.streamone.nl/player/ns/0',
1713         }
1714
1715         entries = []
1716         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1717             title = xpath_text(
1718                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1719             description = xpath_text(
1720                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1721             thumbnail = xpath_text(
1722                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1723             duration = float_or_none(
1724                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1725
1726             formats = [{
1727                 'url': location.text,
1728                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1729                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1730                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1731             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1732             self._sort_formats(formats)
1733
1734             entries.append({
1735                 'id': playlist_id,
1736                 'title': title,
1737                 'description': description,
1738                 'thumbnail': thumbnail,
1739                 'duration': duration,
1740                 'formats': formats,
1741             })
1742         return entries
1743
1744     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1745         res = self._download_webpage_handle(
1746             mpd_url, video_id,
1747             note=note or 'Downloading MPD manifest',
1748             errnote=errnote or 'Failed to download MPD manifest',
1749             fatal=fatal)
1750         if res is False:
1751             return []
1752         mpd, urlh = res
1753         mpd_base_url = base_url(urlh.geturl())
1754
1755         return self._parse_mpd_formats(
1756             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1757             formats_dict=formats_dict, mpd_url=mpd_url)
1758
1759     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1760         """
1761         Parse formats from MPD manifest.
1762         References:
1763          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1764             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1765          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1766         """
1767         if mpd_doc.get('type') == 'dynamic':
1768             return []
1769
1770         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1771
1772         def _add_ns(path):
1773             return self._xpath_ns(path, namespace)
1774
1775         def is_drm_protected(element):
1776             return element.find(_add_ns('ContentProtection')) is not None
1777
1778         def extract_multisegment_info(element, ms_parent_info):
1779             ms_info = ms_parent_info.copy()
1780
1781             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1782             # common attributes and elements.  We will only extract relevant
1783             # for us.
1784             def extract_common(source):
1785                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1786                 if segment_timeline is not None:
1787                     s_e = segment_timeline.findall(_add_ns('S'))
1788                     if s_e:
1789                         ms_info['total_number'] = 0
1790                         ms_info['s'] = []
1791                         for s in s_e:
1792                             r = int(s.get('r', 0))
1793                             ms_info['total_number'] += 1 + r
1794                             ms_info['s'].append({
1795                                 't': int(s.get('t', 0)),
1796                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1797                                 'd': int(s.attrib['d']),
1798                                 'r': r,
1799                             })
1800                 start_number = source.get('startNumber')
1801                 if start_number:
1802                     ms_info['start_number'] = int(start_number)
1803                 timescale = source.get('timescale')
1804                 if timescale:
1805                     ms_info['timescale'] = int(timescale)
1806                 segment_duration = source.get('duration')
1807                 if segment_duration:
1808                     ms_info['segment_duration'] = float(segment_duration)
1809
1810             def extract_Initialization(source):
1811                 initialization = source.find(_add_ns('Initialization'))
1812                 if initialization is not None:
1813                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1814
1815             segment_list = element.find(_add_ns('SegmentList'))
1816             if segment_list is not None:
1817                 extract_common(segment_list)
1818                 extract_Initialization(segment_list)
1819                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1820                 if segment_urls_e:
1821                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1822             else:
1823                 segment_template = element.find(_add_ns('SegmentTemplate'))
1824                 if segment_template is not None:
1825                     extract_common(segment_template)
1826                     media = segment_template.get('media')
1827                     if media:
1828                         ms_info['media'] = media
1829                     initialization = segment_template.get('initialization')
1830                     if initialization:
1831                         ms_info['initialization'] = initialization
1832                     else:
1833                         extract_Initialization(segment_template)
1834             return ms_info
1835
1836         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1837         formats = []
1838         for period in mpd_doc.findall(_add_ns('Period')):
1839             period_duration = parse_duration(period.get('duration')) or mpd_duration
1840             period_ms_info = extract_multisegment_info(period, {
1841                 'start_number': 1,
1842                 'timescale': 1,
1843             })
1844             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1845                 if is_drm_protected(adaptation_set):
1846                     continue
1847                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1848                 for representation in adaptation_set.findall(_add_ns('Representation')):
1849                     if is_drm_protected(representation):
1850                         continue
1851                     representation_attrib = adaptation_set.attrib.copy()
1852                     representation_attrib.update(representation.attrib)
1853                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1854                     mime_type = representation_attrib['mimeType']
1855                     content_type = mime_type.split('/')[0]
1856                     if content_type == 'text':
1857                         # TODO implement WebVTT downloading
1858                         pass
1859                     elif content_type in ('video', 'audio'):
1860                         base_url = ''
1861                         for element in (representation, adaptation_set, period, mpd_doc):
1862                             base_url_e = element.find(_add_ns('BaseURL'))
1863                             if base_url_e is not None:
1864                                 base_url = base_url_e.text + base_url
1865                                 if re.match(r'^https?://', base_url):
1866                                     break
1867                         if mpd_base_url and not re.match(r'^https?://', base_url):
1868                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1869                                 mpd_base_url += '/'
1870                             base_url = mpd_base_url + base_url
1871                         representation_id = representation_attrib.get('id')
1872                         lang = representation_attrib.get('lang')
1873                         url_el = representation.find(_add_ns('BaseURL'))
1874                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1875                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1876                         f = {
1877                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1878                             'url': base_url,
1879                             'manifest_url': mpd_url,
1880                             'ext': mimetype2ext(mime_type),
1881                             'width': int_or_none(representation_attrib.get('width')),
1882                             'height': int_or_none(representation_attrib.get('height')),
1883                             'tbr': float_or_none(bandwidth, 1000),
1884                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1885                             'fps': int_or_none(representation_attrib.get('frameRate')),
1886                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1887                             'format_note': 'DASH %s' % content_type,
1888                             'filesize': filesize,
1889                         }
1890                         f.update(parse_codecs(representation_attrib.get('codecs')))
1891                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1892
1893                         def prepare_template(template_name, identifiers):
1894                             t = representation_ms_info[template_name]
1895                             t = t.replace('$RepresentationID$', representation_id)
1896                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1897                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1898                             t.replace('$$', '$')
1899                             return t
1900
1901                         # @initialization is a regular template like @media one
1902                         # so it should be handled just the same way (see
1903                         # https://github.com/rg3/youtube-dl/issues/11605)
1904                         if 'initialization' in representation_ms_info:
1905                             initialization_template = prepare_template(
1906                                 'initialization',
1907                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1908                                 # $Time$ shall not be included for @initialization thus
1909                                 # only $Bandwidth$ remains
1910                                 ('Bandwidth', ))
1911                             representation_ms_info['initialization_url'] = initialization_template % {
1912                                 'Bandwidth': bandwidth,
1913                             }
1914
1915                         def location_key(location):
1916                             return 'url' if re.match(r'^https?://', location) else 'path'
1917
1918                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1919
1920                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1921                             media_location_key = location_key(media_template)
1922
1923                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1924                             # can't be used at the same time
1925                             if '%(Number' in media_template and 's' not in representation_ms_info:
1926                                 segment_duration = None
1927                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1928                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1929                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1930                                 representation_ms_info['fragments'] = [{
1931                                     media_location_key: media_template % {
1932                                         'Number': segment_number,
1933                                         'Bandwidth': bandwidth,
1934                                     },
1935                                     'duration': segment_duration,
1936                                 } for segment_number in range(
1937                                     representation_ms_info['start_number'],
1938                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1939                             else:
1940                                 # $Number*$ or $Time$ in media template with S list available
1941                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1942                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1943                                 representation_ms_info['fragments'] = []
1944                                 segment_time = 0
1945                                 segment_d = None
1946                                 segment_number = representation_ms_info['start_number']
1947
1948                                 def add_segment_url():
1949                                     segment_url = media_template % {
1950                                         'Time': segment_time,
1951                                         'Bandwidth': bandwidth,
1952                                         'Number': segment_number,
1953                                     }
1954                                     representation_ms_info['fragments'].append({
1955                                         media_location_key: segment_url,
1956                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1957                                     })
1958
1959                                 for num, s in enumerate(representation_ms_info['s']):
1960                                     segment_time = s.get('t') or segment_time
1961                                     segment_d = s['d']
1962                                     add_segment_url()
1963                                     segment_number += 1
1964                                     for r in range(s.get('r', 0)):
1965                                         segment_time += segment_d
1966                                         add_segment_url()
1967                                         segment_number += 1
1968                                     segment_time += segment_d
1969                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1970                             # No media template
1971                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1972                             # or any YouTube dashsegments video
1973                             fragments = []
1974                             segment_index = 0
1975                             timescale = representation_ms_info['timescale']
1976                             for s in representation_ms_info['s']:
1977                                 duration = float_or_none(s['d'], timescale)
1978                                 for r in range(s.get('r', 0) + 1):
1979                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
1980                                     fragments.append({
1981                                         location_key(segment_uri): segment_uri,
1982                                         'duration': duration,
1983                                     })
1984                                     segment_index += 1
1985                             representation_ms_info['fragments'] = fragments
1986                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1987                         # No fragments key is present in this case.
1988                         if 'fragments' in representation_ms_info:
1989                             f.update({
1990                                 'fragment_base_url': base_url,
1991                                 'fragments': [],
1992                                 'protocol': 'http_dash_segments',
1993                             })
1994                             if 'initialization_url' in representation_ms_info:
1995                                 initialization_url = representation_ms_info['initialization_url']
1996                                 if not f.get('url'):
1997                                     f['url'] = initialization_url
1998                                 f['fragments'].append({location_key(initialization_url): initialization_url})
1999                             f['fragments'].extend(representation_ms_info['fragments'])
2000                         try:
2001                             existing_format = next(
2002                                 fo for fo in formats
2003                                 if fo['format_id'] == representation_id)
2004                         except StopIteration:
2005                             full_info = formats_dict.get(representation_id, {}).copy()
2006                             full_info.update(f)
2007                             formats.append(full_info)
2008                         else:
2009                             existing_format.update(f)
2010                     else:
2011                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2012         return formats
2013
2014     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2015         res = self._download_webpage_handle(
2016             ism_url, video_id,
2017             note=note or 'Downloading ISM manifest',
2018             errnote=errnote or 'Failed to download ISM manifest',
2019             fatal=fatal)
2020         if res is False:
2021             return []
2022         ism, urlh = res
2023
2024         return self._parse_ism_formats(
2025             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
2026
2027     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2028         """
2029         Parse formats from ISM manifest.
2030         References:
2031          1. [MS-SSTR]: Smooth Streaming Protocol,
2032             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2033         """
2034         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2035             return []
2036
2037         duration = int(ism_doc.attrib['Duration'])
2038         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2039
2040         formats = []
2041         for stream in ism_doc.findall('StreamIndex'):
2042             stream_type = stream.get('Type')
2043             if stream_type not in ('video', 'audio'):
2044                 continue
2045             url_pattern = stream.attrib['Url']
2046             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2047             stream_name = stream.get('Name')
2048             for track in stream.findall('QualityLevel'):
2049                 fourcc = track.get('FourCC')
2050                 # TODO: add support for WVC1 and WMAP
2051                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2052                     self.report_warning('%s is not a supported codec' % fourcc)
2053                     continue
2054                 tbr = int(track.attrib['Bitrate']) // 1000
2055                 # [1] does not mention Width and Height attributes. However,
2056                 # they're often present while MaxWidth and MaxHeight are
2057                 # missing, so should be used as fallbacks
2058                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2059                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2060                 sampling_rate = int_or_none(track.get('SamplingRate'))
2061
2062                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2063                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2064
2065                 fragments = []
2066                 fragment_ctx = {
2067                     'time': 0,
2068                 }
2069                 stream_fragments = stream.findall('c')
2070                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2071                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2072                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2073                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2074                     if not fragment_ctx['duration']:
2075                         try:
2076                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2077                         except IndexError:
2078                             next_fragment_time = duration
2079                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2080                     for _ in range(fragment_repeat):
2081                         fragments.append({
2082                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2083                             'duration': fragment_ctx['duration'] / stream_timescale,
2084                         })
2085                         fragment_ctx['time'] += fragment_ctx['duration']
2086
2087                 format_id = []
2088                 if ism_id:
2089                     format_id.append(ism_id)
2090                 if stream_name:
2091                     format_id.append(stream_name)
2092                 format_id.append(compat_str(tbr))
2093
2094                 formats.append({
2095                     'format_id': '-'.join(format_id),
2096                     'url': ism_url,
2097                     'manifest_url': ism_url,
2098                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2099                     'width': width,
2100                     'height': height,
2101                     'tbr': tbr,
2102                     'asr': sampling_rate,
2103                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2104                     'acodec': 'none' if stream_type == 'video' else fourcc,
2105                     'protocol': 'ism',
2106                     'fragments': fragments,
2107                     '_download_params': {
2108                         'duration': duration,
2109                         'timescale': stream_timescale,
2110                         'width': width or 0,
2111                         'height': height or 0,
2112                         'fourcc': fourcc,
2113                         'codec_private_data': track.get('CodecPrivateData'),
2114                         'sampling_rate': sampling_rate,
2115                         'channels': int_or_none(track.get('Channels', 2)),
2116                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2117                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2118                     },
2119                 })
2120         return formats
2121
2122     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2123         def absolute_url(video_url):
2124             return compat_urlparse.urljoin(base_url, video_url)
2125
2126         def parse_content_type(content_type):
2127             if not content_type:
2128                 return {}
2129             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2130             if ctr:
2131                 mimetype, codecs = ctr.groups()
2132                 f = parse_codecs(codecs)
2133                 f['ext'] = mimetype2ext(mimetype)
2134                 return f
2135             return {}
2136
2137         def _media_formats(src, cur_media_type, type_info={}):
2138             full_url = absolute_url(src)
2139             ext = type_info.get('ext') or determine_ext(full_url)
2140             if ext == 'm3u8':
2141                 is_plain_url = False
2142                 formats = self._extract_m3u8_formats(
2143                     full_url, video_id, ext='mp4',
2144                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2145                     preference=preference, fatal=False)
2146             elif ext == 'mpd':
2147                 is_plain_url = False
2148                 formats = self._extract_mpd_formats(
2149                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2150             else:
2151                 is_plain_url = True
2152                 formats = [{
2153                     'url': full_url,
2154                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2155                 }]
2156             return is_plain_url, formats
2157
2158         entries = []
2159         # amp-video and amp-audio are very similar to their HTML5 counterparts
2160         # so we wll include them right here (see
2161         # https://www.ampproject.org/docs/reference/components/amp-video)
2162         media_tags = [(media_tag, media_type, '')
2163                       for media_tag, media_type
2164                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2165         media_tags.extend(re.findall(
2166             # We only allow video|audio followed by a whitespace or '>'.
2167             # Allowing more characters may end up in significant slow down (see
2168             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2169             # http://www.porntrex.com/maps/videositemap.xml).
2170             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2171         for media_tag, media_type, media_content in media_tags:
2172             media_info = {
2173                 'formats': [],
2174                 'subtitles': {},
2175             }
2176             media_attributes = extract_attributes(media_tag)
2177             src = media_attributes.get('src')
2178             if src:
2179                 _, formats = _media_formats(src, media_type)
2180                 media_info['formats'].extend(formats)
2181             media_info['thumbnail'] = media_attributes.get('poster')
2182             if media_content:
2183                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2184                     source_attributes = extract_attributes(source_tag)
2185                     src = source_attributes.get('src')
2186                     if not src:
2187                         continue
2188                     f = parse_content_type(source_attributes.get('type'))
2189                     is_plain_url, formats = _media_formats(src, media_type, f)
2190                     if is_plain_url:
2191                         # res attribute is not standard but seen several times
2192                         # in the wild
2193                         f.update({
2194                             'height': int_or_none(source_attributes.get('res')),
2195                             'format_id': source_attributes.get('label'),
2196                         })
2197                         f.update(formats[0])
2198                         media_info['formats'].append(f)
2199                     else:
2200                         media_info['formats'].extend(formats)
2201                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2202                     track_attributes = extract_attributes(track_tag)
2203                     kind = track_attributes.get('kind')
2204                     if not kind or kind in ('subtitles', 'captions'):
2205                         src = track_attributes.get('src')
2206                         if not src:
2207                             continue
2208                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2209                         media_info['subtitles'].setdefault(lang, []).append({
2210                             'url': absolute_url(src),
2211                         })
2212             if media_info['formats'] or media_info['subtitles']:
2213                 entries.append(media_info)
2214         return entries
2215
2216     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2217         formats = []
2218         hdcore_sign = 'hdcore=3.7.0'
2219         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2220         hds_host = hosts.get('hds')
2221         if hds_host:
2222             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2223         if 'hdcore=' not in f4m_url:
2224             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2225         f4m_formats = self._extract_f4m_formats(
2226             f4m_url, video_id, f4m_id='hds', fatal=False)
2227         for entry in f4m_formats:
2228             entry.update({'extra_param_to_segment_url': hdcore_sign})
2229         formats.extend(f4m_formats)
2230         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2231         hls_host = hosts.get('hls')
2232         if hls_host:
2233             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2234         formats.extend(self._extract_m3u8_formats(
2235             m3u8_url, video_id, 'mp4', 'm3u8_native',
2236             m3u8_id='hls', fatal=False))
2237         return formats
2238
2239     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2240         query = compat_urlparse.urlparse(url).query
2241         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2242         url_base = self._search_regex(
2243             r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
2244         http_base_url = '%s:%s' % ('http', url_base)
2245         formats = []
2246
2247         def manifest_url(manifest):
2248             m_url = '%s/%s' % (http_base_url, manifest)
2249             if query:
2250                 m_url += '?%s' % query
2251             return m_url
2252
2253         if 'm3u8' not in skip_protocols:
2254             formats.extend(self._extract_m3u8_formats(
2255                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2256                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2257         if 'f4m' not in skip_protocols:
2258             formats.extend(self._extract_f4m_formats(
2259                 manifest_url('manifest.f4m'),
2260                 video_id, f4m_id='hds', fatal=False))
2261         if 'dash' not in skip_protocols:
2262             formats.extend(self._extract_mpd_formats(
2263                 manifest_url('manifest.mpd'),
2264                 video_id, mpd_id='dash', fatal=False))
2265         if re.search(r'(?:/smil:|\.smil)', url_base):
2266             if 'smil' not in skip_protocols:
2267                 rtmp_formats = self._extract_smil_formats(
2268                     manifest_url('jwplayer.smil'),
2269                     video_id, fatal=False)
2270                 for rtmp_format in rtmp_formats:
2271                     rtsp_format = rtmp_format.copy()
2272                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2273                     del rtsp_format['play_path']
2274                     del rtsp_format['ext']
2275                     rtsp_format.update({
2276                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2277                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2278                         'protocol': 'rtsp',
2279                     })
2280                     formats.extend([rtmp_format, rtsp_format])
2281         else:
2282             for protocol in ('rtmp', 'rtsp'):
2283                 if protocol not in skip_protocols:
2284                     formats.append({
2285                         'url': '%s:%s' % (protocol, url_base),
2286                         'format_id': protocol,
2287                         'protocol': protocol,
2288                     })
2289         return formats
2290
2291     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2292         mobj = re.search(
2293             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2294             webpage)
2295         if mobj:
2296             try:
2297                 jwplayer_data = self._parse_json(mobj.group('options'),
2298                                                  video_id=video_id,
2299                                                  transform_source=transform_source)
2300             except ExtractorError:
2301                 pass
2302             else:
2303                 if isinstance(jwplayer_data, dict):
2304                     return jwplayer_data
2305
2306     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2307         jwplayer_data = self._find_jwplayer_data(
2308             webpage, video_id, transform_source=js_to_json)
2309         return self._parse_jwplayer_data(
2310             jwplayer_data, video_id, *args, **kwargs)
2311
2312     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2313                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2314         # JWPlayer backward compatibility: flattened playlists
2315         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2316         if 'playlist' not in jwplayer_data:
2317             jwplayer_data = {'playlist': [jwplayer_data]}
2318
2319         entries = []
2320
2321         # JWPlayer backward compatibility: single playlist item
2322         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2323         if not isinstance(jwplayer_data['playlist'], list):
2324             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2325
2326         for video_data in jwplayer_data['playlist']:
2327             # JWPlayer backward compatibility: flattened sources
2328             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2329             if 'sources' not in video_data:
2330                 video_data['sources'] = [video_data]
2331
2332             this_video_id = video_id or video_data['mediaid']
2333
2334             formats = self._parse_jwplayer_formats(
2335                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2336                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2337
2338             subtitles = {}
2339             tracks = video_data.get('tracks')
2340             if tracks and isinstance(tracks, list):
2341                 for track in tracks:
2342                     if not isinstance(track, dict):
2343                         continue
2344                     if track.get('kind') != 'captions':
2345                         continue
2346                     track_url = urljoin(base_url, track.get('file'))
2347                     if not track_url:
2348                         continue
2349                     subtitles.setdefault(track.get('label') or 'en', []).append({
2350                         'url': self._proto_relative_url(track_url)
2351                     })
2352
2353             entry = {
2354                 'id': this_video_id,
2355                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2356                 'description': video_data.get('description'),
2357                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2358                 'timestamp': int_or_none(video_data.get('pubdate')),
2359                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2360                 'subtitles': subtitles,
2361             }
2362             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2363             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2364                 entry.update({
2365                     '_type': 'url_transparent',
2366                     'url': formats[0]['url'],
2367                 })
2368             else:
2369                 self._sort_formats(formats)
2370                 entry['formats'] = formats
2371             entries.append(entry)
2372         if len(entries) == 1:
2373             return entries[0]
2374         else:
2375             return self.playlist_result(entries)
2376
2377     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2378                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2379         urls = []
2380         formats = []
2381         for source in jwplayer_sources_data:
2382             if not isinstance(source, dict):
2383                 continue
2384             source_url = self._proto_relative_url(source.get('file'))
2385             if not source_url:
2386                 continue
2387             if base_url:
2388                 source_url = compat_urlparse.urljoin(base_url, source_url)
2389             if source_url in urls:
2390                 continue
2391             urls.append(source_url)
2392             source_type = source.get('type') or ''
2393             ext = mimetype2ext(source_type) or determine_ext(source_url)
2394             if source_type == 'hls' or ext == 'm3u8':
2395                 formats.extend(self._extract_m3u8_formats(
2396                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2397                     m3u8_id=m3u8_id, fatal=False))
2398             elif ext == 'mpd':
2399                 formats.extend(self._extract_mpd_formats(
2400                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2401             elif ext == 'smil':
2402                 formats.extend(self._extract_smil_formats(
2403                     source_url, video_id, fatal=False))
2404             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2405             elif source_type.startswith('audio') or ext in (
2406                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2407                 formats.append({
2408                     'url': source_url,
2409                     'vcodec': 'none',
2410                     'ext': ext,
2411                 })
2412             else:
2413                 height = int_or_none(source.get('height'))
2414                 if height is None:
2415                     # Often no height is provided but there is a label in
2416                     # format like "1080p", "720p SD", or 1080.
2417                     height = int_or_none(self._search_regex(
2418                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2419                         'height', default=None))
2420                 a_format = {
2421                     'url': source_url,
2422                     'width': int_or_none(source.get('width')),
2423                     'height': height,
2424                     'tbr': int_or_none(source.get('bitrate')),
2425                     'ext': ext,
2426                 }
2427                 if source_url.startswith('rtmp'):
2428                     a_format['ext'] = 'flv'
2429                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2430                     # of jwplayer.flash.swf
2431                     rtmp_url_parts = re.split(
2432                         r'((?:mp4|mp3|flv):)', source_url, 1)
2433                     if len(rtmp_url_parts) == 3:
2434                         rtmp_url, prefix, play_path = rtmp_url_parts
2435                         a_format.update({
2436                             'url': rtmp_url,
2437                             'play_path': prefix + play_path,
2438                         })
2439                     if rtmp_params:
2440                         a_format.update(rtmp_params)
2441                 formats.append(a_format)
2442         return formats
2443
2444     def _live_title(self, name):
2445         """ Generate the title for a live video """
2446         now = datetime.datetime.now()
2447         now_str = now.strftime('%Y-%m-%d %H:%M')
2448         return name + ' ' + now_str
2449
2450     def _int(self, v, name, fatal=False, **kwargs):
2451         res = int_or_none(v, **kwargs)
2452         if 'get_attr' in kwargs:
2453             print(getattr(v, kwargs['get_attr']))
2454         if res is None:
2455             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2456             if fatal:
2457                 raise ExtractorError(msg)
2458             else:
2459                 self._downloader.report_warning(msg)
2460         return res
2461
2462     def _float(self, v, name, fatal=False, **kwargs):
2463         res = float_or_none(v, **kwargs)
2464         if res is None:
2465             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2466             if fatal:
2467                 raise ExtractorError(msg)
2468             else:
2469                 self._downloader.report_warning(msg)
2470         return res
2471
2472     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2473                     path='/', secure=False, discard=False, rest={}, **kwargs):
2474         cookie = compat_cookiejar.Cookie(
2475             0, name, value, port, port is not None, domain, True,
2476             domain.startswith('.'), path, True, secure, expire_time,
2477             discard, None, None, rest)
2478         self._downloader.cookiejar.set_cookie(cookie)
2479
2480     def _get_cookies(self, url):
2481         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2482         req = sanitized_Request(url)
2483         self._downloader.cookiejar.add_cookie_header(req)
2484         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2485
2486     def get_testcases(self, include_onlymatching=False):
2487         t = getattr(self, '_TEST', None)
2488         if t:
2489             assert not hasattr(self, '_TESTS'), \
2490                 '%s has _TEST and _TESTS' % type(self).__name__
2491             tests = [t]
2492         else:
2493             tests = getattr(self, '_TESTS', [])
2494         for t in tests:
2495             if not include_onlymatching and t.get('only_matching', False):
2496                 continue
2497             t['name'] = type(self).__name__[:-len('IE')]
2498             yield t
2499
2500     def is_suitable(self, age_limit):
2501         """ Test whether the extractor is generally suitable for the given
2502         age limit (i.e. pornographic sites are not, all others usually are) """
2503
2504         any_restricted = False
2505         for tc in self.get_testcases(include_onlymatching=False):
2506             if tc.get('playlist', []):
2507                 tc = tc['playlist'][0]
2508             is_restricted = age_restricted(
2509                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2510             if not is_restricted:
2511                 return True
2512             any_restricted = any_restricted or is_restricted
2513         return not any_restricted
2514
2515     def extract_subtitles(self, *args, **kwargs):
2516         if (self._downloader.params.get('writesubtitles', False) or
2517                 self._downloader.params.get('listsubtitles')):
2518             return self._get_subtitles(*args, **kwargs)
2519         return {}
2520
2521     def _get_subtitles(self, *args, **kwargs):
2522         raise NotImplementedError('This method must be implemented by subclasses')
2523
2524     @staticmethod
2525     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2526         """ Merge subtitle items for one language. Items with duplicated URLs
2527         will be dropped. """
2528         list1_urls = set([item['url'] for item in subtitle_list1])
2529         ret = list(subtitle_list1)
2530         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2531         return ret
2532
2533     @classmethod
2534     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2535         """ Merge two subtitle dictionaries, language by language. """
2536         ret = dict(subtitle_dict1)
2537         for lang in subtitle_dict2:
2538             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2539         return ret
2540
2541     def extract_automatic_captions(self, *args, **kwargs):
2542         if (self._downloader.params.get('writeautomaticsub', False) or
2543                 self._downloader.params.get('listsubtitles')):
2544             return self._get_automatic_captions(*args, **kwargs)
2545         return {}
2546
2547     def _get_automatic_captions(self, *args, **kwargs):
2548         raise NotImplementedError('This method must be implemented by subclasses')
2549
2550     def mark_watched(self, *args, **kwargs):
2551         if (self._downloader.params.get('mark_watched', False) and
2552                 (self._get_login_info()[0] is not None or
2553                     self._downloader.params.get('cookiefile') is not None)):
2554             self._mark_watched(*args, **kwargs)
2555
2556     def _mark_watched(self, *args, **kwargs):
2557         raise NotImplementedError('This method must be implemented by subclasses')
2558
2559     def geo_verification_headers(self):
2560         headers = {}
2561         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2562         if geo_verification_proxy:
2563             headers['Ytdl-request-proxy'] = geo_verification_proxy
2564         return headers
2565
2566     def _generic_id(self, url):
2567         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2568
2569     def _generic_title(self, url):
2570         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2571
2572
2573 class SearchInfoExtractor(InfoExtractor):
2574     """
2575     Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
2577     Instances should define _SEARCH_KEY and _MAX_RESULTS.
2578     """
2579
2580     @classmethod
2581     def _make_valid_url(cls):
2582         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2583
2584     @classmethod
2585     def suitable(cls, url):
2586         return re.match(cls._make_valid_url(), url) is not None
2587
2588     def _real_extract(self, query):
2589         mobj = re.match(self._make_valid_url(), query)
2590         if mobj is None:
2591             raise ExtractorError('Invalid search query "%s"' % query)
2592
2593         prefix = mobj.group('prefix')
2594         query = mobj.group('query')
2595         if prefix == '':
2596             return self._get_n_results(query, 1)
2597         elif prefix == 'all':
2598             return self._get_n_results(query, self._MAX_RESULTS)
2599         else:
2600             n = int(prefix)
2601             if n <= 0:
2602                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
2603             elif n > self._MAX_RESULTS:
2604                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
2605                 n = self._MAX_RESULTS
2606             return self._get_n_results(query, n)
2607
2608     def _get_n_results(self, query, n):
2609         """Get a specified number of results for a query"""
2610         raise NotImplementedError('This method must be implemented by subclasses')
2611
2612     @property
2613     def SEARCH_KEY(self):
2614         return self._SEARCH_KEY