[extractor/common] Add support for float durations in _parse_mpd_formats (closes...
[youtube-dl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_http_client,
23     compat_os_name,
24     compat_str,
25     compat_urllib_error,
26     compat_urllib_parse_unquote,
27     compat_urllib_parse_urlencode,
28     compat_urllib_request,
29     compat_urlparse,
30 )
31 from ..downloader.f4m import remove_encrypted_media
32 from ..utils import (
33     NO_DEFAULT,
34     age_restricted,
35     base_url,
36     bug_reports_message,
37     clean_html,
38     compiled_regex_type,
39     determine_ext,
40     determine_protocol,
41     error_to_compat_str,
42     ExtractorError,
43     extract_attributes,
44     fix_xml_ampersands,
45     float_or_none,
46     GeoRestrictedError,
47     GeoUtils,
48     int_or_none,
49     js_to_json,
50     mimetype2ext,
51     orderedSet,
52     parse_codecs,
53     parse_duration,
54     parse_iso8601,
55     parse_m3u8_attributes,
56     RegexNotFoundError,
57     sanitized_Request,
58     sanitize_filename,
59     unescapeHTML,
60     unified_strdate,
61     unified_timestamp,
62     update_Request,
63     update_url_query,
64     urljoin,
65     url_basename,
66     xpath_element,
67     xpath_text,
68     xpath_with_ns,
69 )
70
71
72 class InfoExtractor(object):
73     """Information Extractor class.
74
75     Information extractors are the classes that, given a URL, extract
76     information about the video (or videos) the URL refers to. This
77     information includes the real video URL, the video title, author and
78     others. The information is stored in a dictionary which is then
79     passed to the YoutubeDL. The YoutubeDL processes this
80     information possibly downloading the video to the file system, among
81     other possible outcomes.
82
83     The type field determines the type of the result.
84     By far the most common value (and the default if _type is missing) is
85     "video", which indicates a single video.
86
87     For a video, the dictionaries must include the following fields:
88
89     id:             Video identifier.
90     title:          Video title, unescaped.
91
92     Additionally, it must contain either a formats entry or a url one:
93
94     formats:        A list of dictionaries for each format available, ordered
95                     from worst to best quality.
96
97                     Potential fields:
98                     * url        Mandatory. The URL of the video file
99                     * manifest_url
100                                  The URL of the manifest file in case of
101                                  fragmented media (DASH, hls, hds)
102                     * ext        Will be calculated from URL if missing
103                     * format     A human-readable description of the format
104                                  ("mp4 container with h264/opus").
105                                  Calculated from the format_id, width, height.
106                                  and format_note fields if missing.
107                     * format_id  A short description of the format
108                                  ("mp4_h264_opus" or "19").
109                                 Technically optional, but strongly recommended.
110                     * format_note Additional info about the format
111                                  ("3D" or "DASH video")
112                     * width      Width of the video, if known
113                     * height     Height of the video, if known
114                     * resolution Textual description of width and height
115                     * tbr        Average bitrate of audio and video in KBit/s
116                     * abr        Average audio bitrate in KBit/s
117                     * acodec     Name of the audio codec in use
118                     * asr        Audio sampling rate in Hertz
119                     * vbr        Average video bitrate in KBit/s
120                     * fps        Frame rate
121                     * vcodec     Name of the video codec in use
122                     * container  Name of the container format
123                     * filesize   The number of bytes, if known in advance
124                     * filesize_approx  An estimate for the number of bytes
125                     * player_url SWF Player URL (used for rtmpdump).
126                     * protocol   The protocol that will be used for the actual
127                                  download, lower-case.
128                                  "http", "https", "rtsp", "rtmp", "rtmpe",
129                                  "m3u8", "m3u8_native" or "http_dash_segments".
130                     * fragment_base_url
131                                  Base URL for fragments. Each fragment's path
132                                  value (if present) will be relative to
133                                  this URL.
134                     * fragments  A list of fragments of a fragmented media.
135                                  Each fragment entry must contain either an url
136                                  or a path. If an url is present it should be
137                                  considered by a client. Otherwise both path and
138                                  fragment_base_url must be present. Here is
139                                  the list of all potential fields:
140                                  * "url" - fragment's URL
141                                  * "path" - fragment's path relative to
142                                             fragment_base_url
143                                  * "duration" (optional, int or float)
144                                  * "filesize" (optional, int)
145                     * preference Order number of this format. If this field is
146                                  present and not None, the formats get sorted
147                                  by this field, regardless of all other values.
148                                  -1 for default (order by other properties),
149                                  -2 or smaller for less than default.
150                                  < -1000 to hide the format (if there is
151                                     another one which is strictly better)
152                     * language   Language code, e.g. "de" or "en-US".
153                     * language_preference  Is this in the language mentioned in
154                                  the URL?
155                                  10 if it's what the URL is about,
156                                  -1 for default (don't know),
157                                  -10 otherwise, other values reserved for now.
158                     * quality    Order number of the video quality of this
159                                  format, irrespective of the file format.
160                                  -1 for default (order by other properties),
161                                  -2 or smaller for less than default.
162                     * source_preference  Order number for this video source
163                                   (quality takes higher priority)
164                                  -1 for default (order by other properties),
165                                  -2 or smaller for less than default.
166                     * http_headers  A dictionary of additional HTTP headers
167                                  to add to the request.
168                     * stretched_ratio  If given and not 1, indicates that the
169                                  video's pixels are not square.
170                                  width : height ratio as float.
171                     * no_resume  The server does not support resuming the
172                                  (HTTP or RTMP) download. Boolean.
173
174     url:            Final video URL.
175     ext:            Video filename extension.
176     format:         The video format, defaults to ext (used for --get-format)
177     player_url:     SWF Player URL (used for rtmpdump).
178
179     The following fields are optional:
180
181     alt_title:      A secondary title of the video.
182     display_id      An alternative identifier for the video, not necessarily
183                     unique, but available before title. Typically, id is
184                     something like "4234987", title "Dancing naked mole rats",
185                     and display_id "dancing-naked-mole-rats"
186     thumbnails:     A list of dictionaries, with the following entries:
187                         * "id" (optional, string) - Thumbnail format ID
188                         * "url"
189                         * "preference" (optional, int) - quality of the image
190                         * "width" (optional, int)
191                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
193                                         deprecated)
194                         * "filesize" (optional, int)
195     thumbnail:      Full URL to a video thumbnail image.
196     description:    Full video description.
197     uploader:       Full name of the video uploader.
198     license:        License name the video is licensed under.
199     creator:        The creator of the video.
200     release_date:   The date (YYYYMMDD) when the video was released.
201     timestamp:      UNIX timestamp of the moment the video became available.
202     upload_date:    Video upload date (YYYYMMDD).
203                     If not explicitly set, calculated from timestamp.
204     uploader_id:    Nickname or id of the video uploader.
205     uploader_url:   Full URL to a personal webpage of the video uploader.
206     location:       Physical location where the video was filmed.
207     subtitles:      The available subtitles as a dictionary in the format
208                     {tag: subformats}. "tag" is usually a language code, and
209                     "subformats" is a list sorted from lower to higher
210                     preference, each element is a dictionary with the "ext"
211                     entry and one of:
212                         * "data": The subtitles file contents
213                         * "url": A URL pointing to the subtitles file
214                     "ext" will be calculated from URL if missing
215     automatic_captions: Like 'subtitles', used by the YoutubeIE for
216                     automatically generated captions
217     duration:       Length of the video in seconds, as an integer or float.
218     view_count:     How many users have watched the video on the platform.
219     like_count:     Number of positive ratings of the video
220     dislike_count:  Number of negative ratings of the video
221     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
223     comment_count:  Number of comments on the video
224     comments:       A list of comments, each with one or more of the following
225                     properties (all but one of text or html optional):
226                         * "author" - human-readable name of the comment author
227                         * "author_id" - user ID of the comment author
228                         * "id" - Comment ID
229                         * "html" - Comment as HTML
230                         * "text" - Plain text of the comment
231                         * "timestamp" - UNIX timestamp of comment
232                         * "parent" - ID of the comment this one is replying to.
233                                      Set to "root" to indicate that this is a
234                                      comment to the original video.
235     age_limit:      Age restriction for the video, as an integer (years)
236     webpage_url:    The URL to the video webpage, if given to youtube-dl it
237                     should allow to get the same result again. (It will be set
238                     by YoutubeDL if it's missing)
239     categories:     A list of categories that the video falls in, for example
240                     ["Sports", "Berlin"]
241     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
242     is_live:        True, False, or None (=unknown). Whether this video is a
243                     live stream that goes on instead of a fixed-length video.
244     start_time:     Time in seconds where the reproduction should start, as
245                     specified in the URL.
246     end_time:       Time in seconds where the reproduction should end, as
247                     specified in the URL.
248     chapters:       A list of dictionaries, with the following entries:
249                         * "start_time" - The start time of the chapter in seconds
250                         * "end_time" - The end time of the chapter in seconds
251                         * "title" (optional, string)
252
253     The following fields should only be used when the video belongs to some logical
254     chapter or section:
255
256     chapter:        Name or title of the chapter the video belongs to.
257     chapter_number: Number of the chapter the video belongs to, as an integer.
258     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
259
260     The following fields should only be used when the video is an episode of some
261     series, programme or podcast:
262
263     series:         Title of the series or programme the video episode belongs to.
264     season:         Title of the season the video episode belongs to.
265     season_number:  Number of the season the video episode belongs to, as an integer.
266     season_id:      Id of the season the video episode belongs to, as a unicode string.
267     episode:        Title of the video episode. Unlike mandatory video title field,
268                     this field should denote the exact title of the video episode
269                     without any kind of decoration.
270     episode_number: Number of the video episode within a season, as an integer.
271     episode_id:     Id of the video episode, as a unicode string.
272
273     The following fields should only be used when the media is a track or a part of
274     a music album:
275
276     track:          Title of the track.
277     track_number:   Number of the track within an album or a disc, as an integer.
278     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
279                     as a unicode string.
280     artist:         Artist(s) of the track.
281     genre:          Genre(s) of the track.
282     album:          Title of the album the track belongs to.
283     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
284     album_artist:   List of all artists appeared on the album (e.g.
285                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
286                     and compilations).
287     disc_number:    Number of the disc or other physical medium the track belongs to,
288                     as an integer.
289     release_year:   Year (YYYY) when the album was released.
290
291     Unless mentioned otherwise, the fields should be Unicode strings.
292
293     Unless mentioned otherwise, None is equivalent to absence of information.
294
295
296     _type "playlist" indicates multiple videos.
297     There must be a key "entries", which is a list, an iterable, or a PagedList
298     object, each element of which is a valid dictionary by this specification.
299
300     Additionally, playlists can have "title", "description" and "id" attributes
301     with the same semantics as videos (see above).
302
303
304     _type "multi_video" indicates that there are multiple videos that
305     form a single show, for examples multiple acts of an opera or TV episode.
306     It must have an entries key like a playlist and contain all the keys
307     required for a video at the same time.
308
309
310     _type "url" indicates that the video must be extracted from another
311     location, possibly by a different extractor. Its only required key is:
312     "url" - the next URL to extract.
313     The key "ie_key" can be set to the class name (minus the trailing "IE",
314     e.g. "Youtube") if the extractor class is known in advance.
315     Additionally, the dictionary may have any properties of the resolved entity
316     known in advance, for example "title" if the title of the referred video is
317     known ahead of time.
318
319
320     _type "url_transparent" entities have the same specification as "url", but
321     indicate that the given additional information is more precise than the one
322     associated with the resolved URL.
323     This is useful when a site employs a video service that hosts the video and
324     its technical metadata, but that video service does not embed a useful
325     title, description etc.
326
327
328     Subclasses of this one should re-define the _real_initialize() and
329     _real_extract() methods and define a _VALID_URL regexp.
330     Probably, they should also be added to the list of extractors.
331
332     _GEO_BYPASS attribute may be set to False in order to disable
333     geo restriction bypass mechanisms for a particular extractor.
334     Though it won't disable explicit geo restriction bypass based on
335     country code provided with geo_bypass_country. (experimental)
336
337     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
338     countries for this extractor. One of these countries will be used by
339     geo restriction bypass mechanism right away in order to bypass
340     geo restriction, of course, if the mechanism is not disabled. (experimental)
341
342     NB: both these geo attributes are experimental and may change in future
343     or be completely removed.
344
345     Finally, the _WORKING attribute should be set to False for broken IEs
346     in order to warn the users and skip the tests.
347     """
348
349     _ready = False
350     _downloader = None
351     _x_forwarded_for_ip = None
352     _GEO_BYPASS = True
353     _GEO_COUNTRIES = None
354     _WORKING = True
355
356     def __init__(self, downloader=None):
357         """Constructor. Receives an optional downloader."""
358         self._ready = False
359         self._x_forwarded_for_ip = None
360         self.set_downloader(downloader)
361
362     @classmethod
363     def suitable(cls, url):
364         """Receives a URL and returns True if suitable for this IE."""
365
366         # This does not use has/getattr intentionally - we want to know whether
367         # we have cached the regexp for *this* class, whereas getattr would also
368         # match the superclass
369         if '_VALID_URL_RE' not in cls.__dict__:
370             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
371         return cls._VALID_URL_RE.match(url) is not None
372
373     @classmethod
374     def _match_id(cls, url):
375         if '_VALID_URL_RE' not in cls.__dict__:
376             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
377         m = cls._VALID_URL_RE.match(url)
378         assert m
379         return compat_str(m.group('id'))
380
381     @classmethod
382     def working(cls):
383         """Getter method for _WORKING."""
384         return cls._WORKING
385
386     def initialize(self):
387         """Initializes an instance (authentication, etc)."""
388         self._initialize_geo_bypass(self._GEO_COUNTRIES)
389         if not self._ready:
390             self._real_initialize()
391             self._ready = True
392
393     def _initialize_geo_bypass(self, countries):
394         """
395         Initialize geo restriction bypass mechanism.
396
397         This method is used to initialize geo bypass mechanism based on faking
398         X-Forwarded-For HTTP header. A random country from provided country list
399         is selected and a random IP belonging to this country is generated. This
400         IP will be passed as X-Forwarded-For HTTP header in all subsequent
401         HTTP requests.
402
403         This method will be used for initial geo bypass mechanism initialization
404         during the instance initialization with _GEO_COUNTRIES.
405
406         You may also manually call it from extractor's code if geo countries
407         information is not available beforehand (e.g. obtained during
408         extraction) or due to some another reason.
409         """
410         if not self._x_forwarded_for_ip:
411             country_code = self._downloader.params.get('geo_bypass_country', None)
412             # If there is no explicit country for geo bypass specified and
413             # the extractor is known to be geo restricted let's fake IP
414             # as X-Forwarded-For right away.
415             if (not country_code and
416                     self._GEO_BYPASS and
417                     self._downloader.params.get('geo_bypass', True) and
418                     countries):
419                 country_code = random.choice(countries)
420             if country_code:
421                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
422                 if self._downloader.params.get('verbose', False):
423                     self._downloader.to_screen(
424                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
425                         % (self._x_forwarded_for_ip, country_code.upper()))
426
427     def extract(self, url):
428         """Extracts URL information and returns it in list of dicts."""
429         try:
430             for _ in range(2):
431                 try:
432                     self.initialize()
433                     ie_result = self._real_extract(url)
434                     if self._x_forwarded_for_ip:
435                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
436                     return ie_result
437                 except GeoRestrictedError as e:
438                     if self.__maybe_fake_ip_and_retry(e.countries):
439                         continue
440                     raise
441         except ExtractorError:
442             raise
443         except compat_http_client.IncompleteRead as e:
444             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
445         except (KeyError, StopIteration) as e:
446             raise ExtractorError('An extractor error has occurred.', cause=e)
447
448     def __maybe_fake_ip_and_retry(self, countries):
449         if (not self._downloader.params.get('geo_bypass_country', None) and
450                 self._GEO_BYPASS and
451                 self._downloader.params.get('geo_bypass', True) and
452                 not self._x_forwarded_for_ip and
453                 countries):
454             country_code = random.choice(countries)
455             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
456             if self._x_forwarded_for_ip:
457                 self.report_warning(
458                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
459                     % (self._x_forwarded_for_ip, country_code.upper()))
460                 return True
461         return False
462
463     def set_downloader(self, downloader):
464         """Sets the downloader for this IE."""
465         self._downloader = downloader
466
467     def _real_initialize(self):
468         """Real initialization process. Redefine in subclasses."""
469         pass
470
471     def _real_extract(self, url):
472         """Real extraction process. Redefine in subclasses."""
473         pass
474
475     @classmethod
476     def ie_key(cls):
477         """A string for getting the InfoExtractor with get_info_extractor"""
478         return compat_str(cls.__name__[:-2])
479
480     @property
481     def IE_NAME(self):
482         return compat_str(type(self).__name__[:-2])
483
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        """ Returns the response handle.

        note: progress message printed before the request; None selects the
        default webpage-download report, False suppresses output entirely.
        errnote: message used on failure; False makes errors silently
        return False regardless of fatal.
        fatal: if True network errors raise ExtractorError, otherwise a
        warning is reported and False is returned.
        data, headers, query: merged into the request before it is opened.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        if isinstance(url_or_request, compat_urllib_request.Request):
            # An existing Request object: fold the extra data/headers/query
            # into it.
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            # A plain URL string: only build a Request object when the extras
            # require one.
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                # Caller asked for fully silent failure.
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                # Pass the original traceback along for debugging.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
515
516     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
517         """ Returns a tuple (page content as string, URL handle) """
518         # Strip hashes from the URL (#1038)
519         if isinstance(url_or_request, (compat_str, str)):
520             url_or_request = url_or_request.partition('#')[0]
521
522         # Some sites check X-Forwarded-For HTTP header in order to figure out
523         # the origin of the client behind proxy. This allows bypassing geo
524         # restriction by faking this header's value to IP that belongs to some
525         # geo unrestricted country. We will do so once we encounter any
526         # geo restriction error.
527         if self._x_forwarded_for_ip:
528             if 'X-Forwarded-For' not in headers:
529                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
530
531         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
532         if urlh is False:
533             assert not fatal
534             return False
535         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
536         return (content, urlh)
537
538     @staticmethod
539     def _guess_encoding_from_content(content_type, webpage_bytes):
540         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
541         if m:
542             encoding = m.group(1)
543         else:
544             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
545                           webpage_bytes[:1024])
546             if m:
547                 encoding = m.group(1).decode('ascii')
548             elif webpage_bytes.startswith(b'\xff\xfe'):
549                 encoding = 'utf-16'
550             else:
551                 encoding = 'utf-8'
552
553         return encoding
554
555     def __check_blocked(self, content):
556         first_block = content[:512]
557         if ('<title>Access to this site is blocked</title>' in content and
558                 'Websense' in first_block):
559             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
560             blocked_iframe = self._html_search_regex(
561                 r'<iframe src="([^"]+)"', content,
562                 'Websense information URL', default=None)
563             if blocked_iframe:
564                 msg += ' Visit %s for more details' % blocked_iframe
565             raise ExtractorError(msg, expected=True)
566         if '<title>The URL you requested has been blocked</title>' in first_block:
567             msg = (
568                 'Access to this webpage has been blocked by Indian censorship. '
569                 'Use a VPN or proxy server (with --proxy) to route around it.')
570             block_msg = self._html_search_regex(
571                 r'</h1><p>(.*?)</p>',
572                 content, 'block message', default=None)
573             if block_msg:
574                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
575             raise ExtractorError(msg, expected=True)
576         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
577                 'blocklist.rkn.gov.ru' in content):
578             raise ExtractorError(
579                 'Access to this webpage has been blocked by decision of the Russian government. '
580                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
581                 expected=True)
582
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of an open response handle and decode it to text.

        Honours the dump_intermediate_pages and write_pages options, guesses
        the encoding from the Content-Type header / page content when none is
        given, and raises if the page is a known censorship placeholder.
        prefix: optional bytes prepended to the payload before decoding.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request is a plain URL string
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            # base64 keeps binary payloads console-safe
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the file name within filesystem limits while staying
                # unique via an md5 digest of the full name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name: fall back to utf-8 rather than failing.
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
627
628     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
629         """ Returns the data of the page as a string """
630         success = False
631         try_count = 0
632         while success is False:
633             try:
634                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
635                 success = True
636             except compat_http_client.IncompleteRead as e:
637                 try_count += 1
638                 if try_count >= tries:
639                     raise e
640                 self._sleep(timeout, video_id)
641         if res is False:
642             return res
643         else:
644             content, _ = res
645             return content
646
647     def _download_xml(self, url_or_request, video_id,
648                       note='Downloading XML', errnote='Unable to download XML',
649                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
650         """Return the xml as an xml.etree.ElementTree.Element"""
651         xml_string = self._download_webpage(
652             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
653         if xml_string is False:
654             return xml_string
655         if transform_source:
656             xml_string = transform_source(xml_string)
657         return compat_etree_fromstring(xml_string.encode('utf-8'))
658
659     def _download_json(self, url_or_request, video_id,
660                        note='Downloading JSON metadata',
661                        errnote='Unable to download JSON metadata',
662                        transform_source=None,
663                        fatal=True, encoding=None, data=None, headers={}, query={}):
664         json_string = self._download_webpage(
665             url_or_request, video_id, note, errnote, fatal=fatal,
666             encoding=encoding, data=data, headers=headers, query=query)
667         if (not fatal) and json_string is False:
668             return None
669         return self._parse_json(
670             json_string, video_id, transform_source=transform_source, fatal=fatal)
671
672     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
673         if transform_source:
674             json_string = transform_source(json_string)
675         try:
676             return json.loads(json_string)
677         except ValueError as ve:
678             errmsg = '%s: Failed to parse JSON ' % video_id
679             if fatal:
680                 raise ExtractorError(errmsg, cause=ve)
681             else:
682                 self.report_warning(errmsg + str(ve))
683
684     def report_warning(self, msg, video_id=None):
685         idstr = '' if video_id is None else '%s: ' % video_id
686         self._downloader.report_warning(
687             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
688
689     def to_screen(self, msg):
690         """Print msg to screen, prefixing it with '[ie_name]'"""
691         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
692
693     def report_extraction(self, id_or_name):
694         """Report information extraction."""
695         self.to_screen('%s: Extracting information' % id_or_name)
696
697     def report_download_webpage(self, video_id):
698         """Report webpage download."""
699         self.to_screen('%s: Downloading webpage' % video_id)
700
701     def report_age_confirmation(self):
702         """Report attempt to confirm age."""
703         self.to_screen('Confirming age')
704
705     def report_login(self):
706         """Report attempt to log in."""
707         self.to_screen('Logging in')
708
709     @staticmethod
710     def raise_login_required(msg='This video is only available for registered users'):
711         raise ExtractorError(
712             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
713             expected=True)
714
715     @staticmethod
716     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
717         raise GeoRestrictedError(msg, countries=countries)
718
719     # Methods for following #608
720     @staticmethod
721     def url_result(url, ie=None, video_id=None, video_title=None):
722         """Returns a URL that points to a page that should be processed"""
723         # TODO: ie should be the class used for getting the info
724         video_info = {'_type': 'url',
725                       'url': url,
726                       'ie_key': ie}
727         if video_id is not None:
728             video_info['id'] = video_id
729         if video_title is not None:
730             video_info['title'] = video_title
731         return video_info
732
733     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
734         urls = orderedSet(
735             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
736             for m in matches)
737         return self.playlist_result(
738             urls, playlist_id=playlist_id, playlist_title=playlist_title)
739
740     @staticmethod
741     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
742         """Returns a playlist"""
743         video_info = {'_type': 'playlist',
744                       'entries': entries}
745         if playlist_id:
746             video_info['id'] = playlist_id
747         if playlist_title:
748             video_info['title'] = playlist_title
749         if playlist_description:
750             video_info['description'] = playlist_description
751         return video_info
752
753     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
754         """
755         Perform a regex search on the given string, using a single or a list of
756         patterns returning the first matching group.
757         In case of failure return a default value or raise a WARNING or a
758         RegexNotFoundError, depending on fatal, specifying the field name.
759         """
760         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
761             mobj = re.search(pattern, string, flags)
762         else:
763             for p in pattern:
764                 mobj = re.search(p, string, flags)
765                 if mobj:
766                     break
767
768         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
769             _name = '\033[0;34m%s\033[0m' % name
770         else:
771             _name = name
772
773         if mobj:
774             if group is None:
775                 # return the first matching group
776                 return next(g for g in mobj.groups() if g is not None)
777             else:
778                 return mobj.group(group)
779         elif default is not NO_DEFAULT:
780             return default
781         elif fatal:
782             raise RegexNotFoundError('Unable to extract %s' % _name)
783         else:
784             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
785             return None
786
787     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
788         """
789         Like _search_regex, but strips HTML tags and unescapes entities.
790         """
791         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
792         if res:
793             return clean_html(res).strip()
794         else:
795             return res
796
797     def _get_netrc_login_info(self, netrc_machine=None):
798         username = None
799         password = None
800         netrc_machine = netrc_machine or self._NETRC_MACHINE
801
802         if self._downloader.params.get('usenetrc', False):
803             try:
804                 info = netrc.netrc().authenticators(netrc_machine)
805                 if info is not None:
806                     username = info[0]
807                     password = info[2]
808                 else:
809                     raise netrc.NetrcParseError(
810                         'No authenticators for %s' % netrc_machine)
811             except (IOError, netrc.NetrcParseError) as err:
812                 self._downloader.report_warning(
813                     'parsing .netrc: %s' % error_to_compat_str(err))
814
815         return username, password
816
817     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
818         """
819         Get the login info as (username, password)
820         First look for the manually specified credentials using username_option
821         and password_option as keys in params dictionary. If no such credentials
822         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
823         value.
824         If there's no info available, return (None, None)
825         """
826         if self._downloader is None:
827             return (None, None)
828
829         downloader_params = self._downloader.params
830
831         # Attempt to use provided username and password or .netrc data
832         if downloader_params.get(username_option) is not None:
833             username = downloader_params[username_option]
834             password = downloader_params[password_option]
835         else:
836             username, password = self._get_netrc_login_info(netrc_machine)
837
838         return username, password
839
840     def _get_tfa_info(self, note='two-factor verification code'):
841         """
842         Get the two-factor authentication info
843         TODO - asking the user will be required for sms/phone verify
844         currently just uses the command line option
845         If there's no info available, return None
846         """
847         if self._downloader is None:
848             return None
849         downloader_params = self._downloader.params
850
851         if downloader_params.get('twofactor') is not None:
852             return downloader_params['twofactor']
853
854         return compat_getpass('Type %s and press [Return]: ' % note)
855
856     # Helper functions for extracting OpenGraph info
857     @staticmethod
858     def _og_regexes(prop):
859         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
860         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
861                        % {'prop': re.escape(prop)})
862         template = r'<meta[^>]+?%s[^>]+?%s'
863         return [
864             template % (property_re, content_re),
865             template % (content_re, property_re),
866         ]
867
    @staticmethod
    def _meta_regex(prop):
        """Regex for a <meta> tag whose itemprop/name/property/id/http-equiv
        attribute equals *prop*; the value is captured in group 'content'."""
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
873
874     def _og_search_property(self, prop, html, name=None, **kargs):
875         if not isinstance(prop, (list, tuple)):
876             prop = [prop]
877         if name is None:
878             name = 'OpenGraph %s' % prop[0]
879         og_regexes = []
880         for p in prop:
881             og_regexes.extend(self._og_regexes(p))
882         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
883         if escaped is None:
884             return None
885         return unescapeHTML(escaped)
886
887     def _og_search_thumbnail(self, html, **kargs):
888         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
889
890     def _og_search_description(self, html, **kargs):
891         return self._og_search_property('description', html, fatal=False, **kargs)
892
893     def _og_search_title(self, html, **kargs):
894         return self._og_search_property('title', html, **kargs)
895
896     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
897         regexes = self._og_regexes('video') + self._og_regexes('video:url')
898         if secure:
899             regexes = self._og_regexes('video:secure_url') + regexes
900         return self._html_search_regex(regexes, html, name, **kargs)
901
902     def _og_search_url(self, html, **kargs):
903         return self._og_search_property('url', html, **kargs)
904
905     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
906         if not isinstance(name, (list, tuple)):
907             name = [name]
908         if display_name is None:
909             display_name = name[0]
910         return self._html_search_regex(
911             [self._meta_regex(n) for n in name],
912             html, display_name, fatal=fatal, group='content', **kwargs)
913
914     def _dc_search_uploader(self, html):
915         return self._html_search_meta('dc.creator', html, 'uploader')
916
917     def _rta_search(self, html):
918         # See http://www.rtalabel.org/index.php?content=howtofaq#single
919         if re.search(r'(?ix)<meta\s+name="rating"\s+'
920                      r'     content="RTA-5042-1996-1400-1577-RTA"',
921                      html):
922             return 18
923         return 0
924
925     def _media_rating_search(self, html):
926         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
927         rating = self._html_search_meta('rating', html)
928
929         if not rating:
930             return None
931
932         RATING_TABLE = {
933             'safe for kids': 0,
934             'general': 8,
935             '14 years': 14,
936             'mature': 17,
937             'restricted': 19,
938         }
939         return RATING_TABLE.get(rating.lower())
940
941     def _family_friendly_search(self, html):
942         # See http://schema.org/VideoObject
943         family_friendly = self._html_search_meta(
944             'isFamilyFriendly', html, default=None)
945
946         if not family_friendly:
947             return None
948
949         RATING_TABLE = {
950             '1': 0,
951             'true': 0,
952             '0': 18,
953             'false': 18,
954         }
955         return RATING_TABLE.get(family_friendly.lower())
956
957     def _twitter_search_player(self, html):
958         return self._html_search_meta('twitter:player', html,
959                                       'twitter card player')
960
961     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
962         json_ld = self._search_regex(
963             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
964             html, 'JSON-LD', group='json_ld', **kwargs)
965         default = kwargs.get('default', NO_DEFAULT)
966         if not json_ld:
967             return default if default is not NO_DEFAULT else {}
968         # JSON-LD may be malformed and thus `fatal` should be respected.
969         # At the same time `default` may be passed that assumes `fatal=False`
970         # for _search_regex. Let's simulate the same behavior here as well.
971         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
972         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
973
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Extract video metadata from JSON-LD data.

        json_ld may be a raw JSON string, a dict, or a list/tuple of dicts.
        Only entries with @context == 'http://schema.org' are considered.
        Returns an info dict containing only the non-None fields; {} when
        nothing usable is found.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize to a list so the loop below handles both shapes.
            json_ld = [json_ld]

        def extract_video_object(e):
            # Collect VideoObject fields; None values are dropped at the end.
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': e.get('contentUrl'),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })

        for e in json_ld:
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
                # When the caller pinned a type, anything else ends the scan.
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type in ('TVEpisode', 'Episode'):
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Article':
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Keep scanning: later entries may contribute more fields.
                    continue
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                # Only the first recognized schema.org entry is processed.
                break
        return dict((k, v) for k, v in info.items() if v is not None)
1032
1033     @staticmethod
1034     def _hidden_inputs(html):
1035         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1036         hidden_inputs = {}
1037         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1038             attrs = extract_attributes(input)
1039             if not input:
1040                 continue
1041             if attrs.get('type') not in ('hidden', 'submit'):
1042                 continue
1043             name = attrs.get('name') or attrs.get('id')
1044             value = attrs.get('value')
1045             if name and value is not None:
1046                 hidden_inputs[name] = value
1047         return hidden_inputs
1048
1049     def _form_hidden_inputs(self, form_id, html):
1050         form = self._search_regex(
1051             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1052             html, '%s form' % form_id, group='form')
1053         return self._hidden_inputs(form)
1054
    def _sort_formats(self, formats, field_preference=None):
        """Sort *formats* in place from worst to best quality.

        When field_preference (a list/tuple of format dict keys) is given, it
        fully determines the sort order; otherwise a built-in multi-key
        heuristic is used (preference, language, quality, bitrate, resolution,
        protocol, extension, ...). Raises ExtractorError when the list is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # Build the sort key for a single format dict.
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified ordering: missing values sort first
                # ('' for format_id, -1 for everything else).
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuple comparison: earlier entries dominate; missing numeric
            # fields sort as -1 (i.e. worst), missing format_id as ''.
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1130
1131     def _check_formats(self, formats, video_id):
1132         if formats:
1133             formats[:] = filter(
1134                 lambda f: self._is_valid_url(
1135                     f['url'], video_id,
1136                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1137                 formats)
1138
1139     @staticmethod
1140     def _remove_duplicate_formats(formats):
1141         format_urls = set()
1142         unique_formats = []
1143         for f in formats:
1144             if f['url'] not in format_urls:
1145                 format_urls.add(f['url'])
1146                 unique_formats.append(f)
1147         formats[:] = unique_formats
1148
1149     def _is_valid_url(self, url, video_id, item='video', headers={}):
1150         url = self._proto_relative_url(url, scheme='http:')
1151         # For now assume non HTTP(S) URLs always valid
1152         if not (url.startswith('http://') or url.startswith('https://')):
1153             return True
1154         try:
1155             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1156             return True
1157         except ExtractorError as e:
1158             if isinstance(e.cause, compat_urllib_error.URLError):
1159                 self.to_screen(
1160                     '%s: %s URL is invalid, skipping' % (video_id, item))
1161                 return False
1162             raise
1163
1164     def http_scheme(self):
1165         """ Either "http:" or "https:", depending on the user's preferences """
1166         return (
1167             'http:'
1168             if self._downloader.params.get('prefer_insecure', False)
1169             else 'https:')
1170
1171     def _proto_relative_url(self, url, scheme=None):
1172         if url is None:
1173             return url
1174         if url.startswith('//'):
1175             if scheme is None:
1176                 scheme = self.http_scheme()
1177             return scheme + url
1178         else:
1179             return url
1180
1181     def _sleep(self, timeout, video_id, msg_template=None):
1182         if msg_template is None:
1183             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1184         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1185         self.to_screen(msg)
1186         time.sleep(timeout)
1187
1188     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1189                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1190                              fatal=True, m3u8_id=None):
1191         manifest = self._download_xml(
1192             manifest_url, video_id, 'Downloading f4m manifest',
1193             'Unable to download f4m manifest',
1194             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1195             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1196             transform_source=transform_source,
1197             fatal=fatal)
1198
1199         if manifest is False:
1200             return []
1201
1202         return self._parse_f4m_formats(
1203             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1204             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1205
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m manifest Element into a list of format dicts.

        Handles both 1.0 and 2.0 manifest namespaces, skips DRM-protected and
        Akamai-challenge-protected renditions, and recurses into nested
        f4m/m3u8 manifests referenced by set-level manifests.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # fall back to the 2.0 namespace
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # Audio-only manifests carry no video codec.
        # NOTE(review): the 'base URL' display label below looks copy-pasted
        # from the baseURL lookup — harmless, it is only used in messages.
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Fall back to the node index when no bitrate is advertised.
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1305
1306     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1307         return {
1308             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1309             'url': m3u8_url,
1310             'ext': ext,
1311             'protocol': 'm3u8',
1312             'preference': preference - 100 if preference else -100,
1313             'resolution': 'multiple',
1314             'format_note': 'Quality selection URL',
1315         }
1316
1317     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1318                               entry_protocol='m3u8', preference=None,
1319                               m3u8_id=None, note=None, errnote=None,
1320                               fatal=True, live=False):
1321         res = self._download_webpage_handle(
1322             m3u8_url, video_id,
1323             note=note or 'Downloading m3u8 information',
1324             errnote=errnote or 'Failed to download m3u8 information',
1325             fatal=fatal)
1326
1327         if res is False:
1328             return []
1329
1330         m3u8_doc, urlh = res
1331         m3u8_url = urlh.geturl()
1332
1333         return self._parse_m3u8_formats(
1334             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1335             preference=preference, m3u8_id=m3u8_id, live=live)
1336
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse an HLS (m3u8) playlist document into a list of format dicts.

        m3u8_doc is the playlist text; m3u8_url is used both to resolve
        relative entry URLs and as each format's manifest_url. For live
        streams (live=True) the bandwidth-derived part of format_id is
        omitted since bandwidth may vary over time.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        formats = []

        # Resolve playlist-relative entry URLs against the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # Rendition groups keyed by GROUP-ID, populated by extract_media below.
        groups = {}
        # Attributes of the most recent #EXT-X-STREAM-INF tag; consumed by the
        # non-tag URI line that follows it.
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Register an #EXT-X-MEDIA rendition in its group and, when it is
            # an audio/video rendition with its own URI, also emit it as a
            # standalone format.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Non-tag line: the URI of the variant stream described by the
                # preceding #EXT-X-STREAM-INF tag.
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform embeds audio/video bitrates in
                # the URL itself; recover abr/vbr from there when present.
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                # Reset so a stray URI line without its own #EXT-X-STREAM-INF
                # does not reuse the previous variant's attributes.
                last_stream_inf = {}
        return formats
1492
1493     @staticmethod
1494     def _xpath_ns(path, namespace=None):
1495         if not namespace:
1496             return path
1497         out = []
1498         for c in path.split('/'):
1499             if not c or c == '.':
1500                 out.append(c)
1501             else:
1502                 out.append('{%s}%s' % (namespace, c))
1503         return '/'.join(out)
1504
1505     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1506         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1507
1508         if smil is False:
1509             assert not fatal
1510             return []
1511
1512         namespace = self._parse_smil_namespace(smil)
1513
1514         return self._parse_smil_formats(
1515             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1516
1517     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1518         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1519         if smil is False:
1520             return {}
1521         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1522
1523     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1524         return self._download_xml(
1525             smil_url, video_id, 'Downloading SMIL file',
1526             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1527
1528     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1529         namespace = self._parse_smil_namespace(smil)
1530
1531         formats = self._parse_smil_formats(
1532             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1533         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1534
1535         video_id = os.path.splitext(url_basename(smil_url))[0]
1536         title = None
1537         description = None
1538         upload_date = None
1539         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1540             name = meta.attrib.get('name')
1541             content = meta.attrib.get('content')
1542             if not name or not content:
1543                 continue
1544             if not title and name == 'title':
1545                 title = content
1546             elif not description and name in ('description', 'abstract'):
1547                 description = content
1548             elif not upload_date and name == 'date':
1549                 upload_date = unified_strdate(content)
1550
1551         thumbnails = [{
1552             'id': image.get('type'),
1553             'url': image.get('src'),
1554             'width': int_or_none(image.get('width')),
1555             'height': int_or_none(image.get('height')),
1556         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1557
1558         return {
1559             'id': video_id,
1560             'title': title or video_id,
1561             'description': description,
1562             'upload_date': upload_date,
1563             'thumbnails': thumbnails,
1564             'formats': formats,
1565             'subtitles': subtitles,
1566         }
1567
1568     def _parse_smil_namespace(self, smil):
1569         return self._search_regex(
1570             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1571
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract format dicts from the <video>/<audio> elements of a SMIL document.

        Depending on each medium's protocol/extension this emits an RTMP
        format, recurses into m3u8/f4m sub-manifests, or emits a plain HTTP
        format. transform_rtmp_url, when given, maps (streamer, play_path)
        to a rewritten pair for RTMP entries.
        """
        # Base URL for relative srcs: an explicit <meta base=...> (or
        # httpBase) wins over the manifest URL itself.
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []  # already-seen src values, used to skip duplicates
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            # system-bitrate is in bit/s; tbr is stored in KBit/s
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry m3u8 result usually lacks quality metadata;
                # fill it in from this medium's SMIL attributes.
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # NOTE: assigning the f4m_params local here makes these
                    # defaults persist for subsequent f4m media in this loop.
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            # NOTE(review): validity is checked against the original src, not
            # the resolved src_url — confirm this is intended.
            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        return formats
1665
1666     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1667         urls = []
1668         subtitles = {}
1669         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1670             src = textstream.get('src')
1671             if not src or src in urls:
1672                 continue
1673             urls.append(src)
1674             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1675             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1676             subtitles.setdefault(lang, []).append({
1677                 'url': src,
1678                 'ext': ext,
1679             })
1680         return subtitles
1681
1682     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1683         xspf = self._download_xml(
1684             playlist_url, playlist_id, 'Downloading xpsf playlist',
1685             'Unable to download xspf manifest', fatal=fatal)
1686         if xspf is False:
1687             return []
1688         return self._parse_xspf(xspf, playlist_id)
1689
1690     def _parse_xspf(self, playlist, playlist_id):
1691         NS_MAP = {
1692             'xspf': 'http://xspf.org/ns/0/',
1693             's1': 'http://static.streamone.nl/player/ns/0',
1694         }
1695
1696         entries = []
1697         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1698             title = xpath_text(
1699                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1700             description = xpath_text(
1701                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1702             thumbnail = xpath_text(
1703                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1704             duration = float_or_none(
1705                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1706
1707             formats = [{
1708                 'url': location.text,
1709                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1710                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1711                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1712             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1713             self._sort_formats(formats)
1714
1715             entries.append({
1716                 'id': playlist_id,
1717                 'title': title,
1718                 'description': description,
1719                 'thumbnail': thumbnail,
1720                 'duration': duration,
1721                 'formats': formats,
1722             })
1723         return entries
1724
1725     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1726         res = self._download_webpage_handle(
1727             mpd_url, video_id,
1728             note=note or 'Downloading MPD manifest',
1729             errnote=errnote or 'Failed to download MPD manifest',
1730             fatal=fatal)
1731         if res is False:
1732             return []
1733         mpd, urlh = res
1734         mpd_base_url = base_url(urlh.geturl())
1735
1736         return self._parse_mpd_formats(
1737             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1738             formats_dict=formats_dict, mpd_url=mpd_url)
1739
1740     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1741         """
1742         Parse formats from MPD manifest.
1743         References:
1744          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1745             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1746          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1747         """
1748         if mpd_doc.get('type') == 'dynamic':
1749             return []
1750
1751         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1752
1753         def _add_ns(path):
1754             return self._xpath_ns(path, namespace)
1755
1756         def is_drm_protected(element):
1757             return element.find(_add_ns('ContentProtection')) is not None
1758
1759         def extract_multisegment_info(element, ms_parent_info):
1760             ms_info = ms_parent_info.copy()
1761
1762             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1763             # common attributes and elements.  We will only extract relevant
1764             # for us.
1765             def extract_common(source):
1766                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1767                 if segment_timeline is not None:
1768                     s_e = segment_timeline.findall(_add_ns('S'))
1769                     if s_e:
1770                         ms_info['total_number'] = 0
1771                         ms_info['s'] = []
1772                         for s in s_e:
1773                             r = int(s.get('r', 0))
1774                             ms_info['total_number'] += 1 + r
1775                             ms_info['s'].append({
1776                                 't': int(s.get('t', 0)),
1777                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1778                                 'd': int(s.attrib['d']),
1779                                 'r': r,
1780                             })
1781                 start_number = source.get('startNumber')
1782                 if start_number:
1783                     ms_info['start_number'] = int(start_number)
1784                 timescale = source.get('timescale')
1785                 if timescale:
1786                     ms_info['timescale'] = int(timescale)
1787                 segment_duration = source.get('duration')
1788                 if segment_duration:
1789                     ms_info['segment_duration'] = float(segment_duration)
1790
1791             def extract_Initialization(source):
1792                 initialization = source.find(_add_ns('Initialization'))
1793                 if initialization is not None:
1794                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1795
1796             segment_list = element.find(_add_ns('SegmentList'))
1797             if segment_list is not None:
1798                 extract_common(segment_list)
1799                 extract_Initialization(segment_list)
1800                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1801                 if segment_urls_e:
1802                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1803             else:
1804                 segment_template = element.find(_add_ns('SegmentTemplate'))
1805                 if segment_template is not None:
1806                     extract_common(segment_template)
1807                     media = segment_template.get('media')
1808                     if media:
1809                         ms_info['media'] = media
1810                     initialization = segment_template.get('initialization')
1811                     if initialization:
1812                         ms_info['initialization'] = initialization
1813                     else:
1814                         extract_Initialization(segment_template)
1815             return ms_info
1816
1817         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1818         formats = []
1819         for period in mpd_doc.findall(_add_ns('Period')):
1820             period_duration = parse_duration(period.get('duration')) or mpd_duration
1821             period_ms_info = extract_multisegment_info(period, {
1822                 'start_number': 1,
1823                 'timescale': 1,
1824             })
1825             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1826                 if is_drm_protected(adaptation_set):
1827                     continue
1828                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1829                 for representation in adaptation_set.findall(_add_ns('Representation')):
1830                     if is_drm_protected(representation):
1831                         continue
1832                     representation_attrib = adaptation_set.attrib.copy()
1833                     representation_attrib.update(representation.attrib)
1834                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1835                     mime_type = representation_attrib['mimeType']
1836                     content_type = mime_type.split('/')[0]
1837                     if content_type == 'text':
1838                         # TODO implement WebVTT downloading
1839                         pass
1840                     elif content_type in ('video', 'audio'):
1841                         base_url = ''
1842                         for element in (representation, adaptation_set, period, mpd_doc):
1843                             base_url_e = element.find(_add_ns('BaseURL'))
1844                             if base_url_e is not None:
1845                                 base_url = base_url_e.text + base_url
1846                                 if re.match(r'^https?://', base_url):
1847                                     break
1848                         if mpd_base_url and not re.match(r'^https?://', base_url):
1849                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1850                                 mpd_base_url += '/'
1851                             base_url = mpd_base_url + base_url
1852                         representation_id = representation_attrib.get('id')
1853                         lang = representation_attrib.get('lang')
1854                         url_el = representation.find(_add_ns('BaseURL'))
1855                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1856                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1857                         f = {
1858                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1859                             'url': base_url,
1860                             'manifest_url': mpd_url,
1861                             'ext': mimetype2ext(mime_type),
1862                             'width': int_or_none(representation_attrib.get('width')),
1863                             'height': int_or_none(representation_attrib.get('height')),
1864                             'tbr': float_or_none(bandwidth, 1000),
1865                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1866                             'fps': int_or_none(representation_attrib.get('frameRate')),
1867                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1868                             'format_note': 'DASH %s' % content_type,
1869                             'filesize': filesize,
1870                         }
1871                         f.update(parse_codecs(representation_attrib.get('codecs')))
1872                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1873
1874                         def prepare_template(template_name, identifiers):
1875                             t = representation_ms_info[template_name]
1876                             t = t.replace('$RepresentationID$', representation_id)
1877                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1878                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1879                             t.replace('$$', '$')
1880                             return t
1881
1882                         # @initialization is a regular template like @media one
1883                         # so it should be handled just the same way (see
1884                         # https://github.com/rg3/youtube-dl/issues/11605)
1885                         if 'initialization' in representation_ms_info:
1886                             initialization_template = prepare_template(
1887                                 'initialization',
1888                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1889                                 # $Time$ shall not be included for @initialization thus
1890                                 # only $Bandwidth$ remains
1891                                 ('Bandwidth', ))
1892                             representation_ms_info['initialization_url'] = initialization_template % {
1893                                 'Bandwidth': bandwidth,
1894                             }
1895
1896                         def location_key(location):
1897                             return 'url' if re.match(r'^https?://', location) else 'path'
1898
1899                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1900
1901                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1902                             media_location_key = location_key(media_template)
1903
1904                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1905                             # can't be used at the same time
1906                             if '%(Number' in media_template and 's' not in representation_ms_info:
1907                                 segment_duration = None
1908                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1909                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1910                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1911                                 representation_ms_info['fragments'] = [{
1912                                     media_location_key: media_template % {
1913                                         'Number': segment_number,
1914                                         'Bandwidth': bandwidth,
1915                                     },
1916                                     'duration': segment_duration,
1917                                 } for segment_number in range(
1918                                     representation_ms_info['start_number'],
1919                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1920                             else:
1921                                 # $Number*$ or $Time$ in media template with S list available
1922                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1923                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1924                                 representation_ms_info['fragments'] = []
1925                                 segment_time = 0
1926                                 segment_d = None
1927                                 segment_number = representation_ms_info['start_number']
1928
1929                                 def add_segment_url():
1930                                     segment_url = media_template % {
1931                                         'Time': segment_time,
1932                                         'Bandwidth': bandwidth,
1933                                         'Number': segment_number,
1934                                     }
1935                                     representation_ms_info['fragments'].append({
1936                                         media_location_key: segment_url,
1937                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1938                                     })
1939
1940                                 for num, s in enumerate(representation_ms_info['s']):
1941                                     segment_time = s.get('t') or segment_time
1942                                     segment_d = s['d']
1943                                     add_segment_url()
1944                                     segment_number += 1
1945                                     for r in range(s.get('r', 0)):
1946                                         segment_time += segment_d
1947                                         add_segment_url()
1948                                         segment_number += 1
1949                                     segment_time += segment_d
1950                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1951                             # No media template
1952                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1953                             # or any YouTube dashsegments video
1954                             fragments = []
1955                             segment_index = 0
1956                             timescale = representation_ms_info['timescale']
1957                             for s in representation_ms_info['s']:
1958                                 duration = float_or_none(s['d'], timescale)
1959                                 for r in range(s.get('r', 0) + 1):
1960                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
1961                                     fragments.append({
1962                                         location_key(segment_uri): segment_uri,
1963                                         'duration': duration,
1964                                     })
1965                                     segment_index += 1
1966                             representation_ms_info['fragments'] = fragments
1967                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1968                         # No fragments key is present in this case.
1969                         if 'fragments' in representation_ms_info:
1970                             f.update({
1971                                 'fragment_base_url': base_url,
1972                                 'fragments': [],
1973                                 'protocol': 'http_dash_segments',
1974                             })
1975                             if 'initialization_url' in representation_ms_info:
1976                                 initialization_url = representation_ms_info['initialization_url']
1977                                 if not f.get('url'):
1978                                     f['url'] = initialization_url
1979                                 f['fragments'].append({location_key(initialization_url): initialization_url})
1980                             f['fragments'].extend(representation_ms_info['fragments'])
1981                         try:
1982                             existing_format = next(
1983                                 fo for fo in formats
1984                                 if fo['format_id'] == representation_id)
1985                         except StopIteration:
1986                             full_info = formats_dict.get(representation_id, {}).copy()
1987                             full_info.update(f)
1988                             formats.append(full_info)
1989                         else:
1990                             existing_format.update(f)
1991                     else:
1992                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1993         return formats
1994
1995     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1996         res = self._download_webpage_handle(
1997             ism_url, video_id,
1998             note=note or 'Downloading ISM manifest',
1999             errnote=errnote or 'Failed to download ISM manifest',
2000             fatal=fatal)
2001         if res is False:
2002             return []
2003         ism, urlh = res
2004
2005         return self._parse_ism_formats(
2006             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
2007
2008     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2009         """
2010         Parse formats from ISM manifest.
2011         References:
2012          1. [MS-SSTR]: Smooth Streaming Protocol,
2013             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2014         """
2015         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2016             return []
2017
2018         duration = int(ism_doc.attrib['Duration'])
2019         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2020
2021         formats = []
2022         for stream in ism_doc.findall('StreamIndex'):
2023             stream_type = stream.get('Type')
2024             if stream_type not in ('video', 'audio'):
2025                 continue
2026             url_pattern = stream.attrib['Url']
2027             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2028             stream_name = stream.get('Name')
2029             for track in stream.findall('QualityLevel'):
2030                 fourcc = track.get('FourCC')
2031                 # TODO: add support for WVC1 and WMAP
2032                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2033                     self.report_warning('%s is not a supported codec' % fourcc)
2034                     continue
2035                 tbr = int(track.attrib['Bitrate']) // 1000
2036                 # [1] does not mention Width and Height attributes. However,
2037                 # they're often present while MaxWidth and MaxHeight are
2038                 # missing, so should be used as fallbacks
2039                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2040                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2041                 sampling_rate = int_or_none(track.get('SamplingRate'))
2042
2043                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2044                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2045
2046                 fragments = []
2047                 fragment_ctx = {
2048                     'time': 0,
2049                 }
2050                 stream_fragments = stream.findall('c')
2051                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2052                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2053                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2054                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2055                     if not fragment_ctx['duration']:
2056                         try:
2057                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2058                         except IndexError:
2059                             next_fragment_time = duration
2060                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2061                     for _ in range(fragment_repeat):
2062                         fragments.append({
2063                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2064                             'duration': fragment_ctx['duration'] / stream_timescale,
2065                         })
2066                         fragment_ctx['time'] += fragment_ctx['duration']
2067
2068                 format_id = []
2069                 if ism_id:
2070                     format_id.append(ism_id)
2071                 if stream_name:
2072                     format_id.append(stream_name)
2073                 format_id.append(compat_str(tbr))
2074
2075                 formats.append({
2076                     'format_id': '-'.join(format_id),
2077                     'url': ism_url,
2078                     'manifest_url': ism_url,
2079                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2080                     'width': width,
2081                     'height': height,
2082                     'tbr': tbr,
2083                     'asr': sampling_rate,
2084                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2085                     'acodec': 'none' if stream_type == 'video' else fourcc,
2086                     'protocol': 'ism',
2087                     'fragments': fragments,
2088                     '_download_params': {
2089                         'duration': duration,
2090                         'timescale': stream_timescale,
2091                         'width': width or 0,
2092                         'height': height or 0,
2093                         'fourcc': fourcc,
2094                         'codec_private_data': track.get('CodecPrivateData'),
2095                         'sampling_rate': sampling_rate,
2096                         'channels': int_or_none(track.get('Channels', 2)),
2097                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2098                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2099                     },
2100                 })
2101         return formats
2102
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags (and their
        AMP equivalents) found in webpage.

        Returns a list of dicts, each with 'formats', 'subtitles' and
        optionally 'thumbnail' keys. Entries with neither formats nor
        subtitles are dropped.
        """
        def absolute_url(video_url):
            # Resolve src attributes relative to the page URL
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Derive ext/codec format fields from a MIME type such as
            # 'video/mp4; codecs="avc1.42E01E"'
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats). HLS/DASH sources expand into
            # multiple formats; anything else is a single direct URL.
            # NOTE(review): the mutable default type_info is only read, never
            # mutated, so the shared-default pitfall does not bite here.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                # Direct src attribute on the <video>/<audio> tag itself
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            if media_content:
                # Nested <source> tags may carry additional renditions
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # Merge MIME-derived fields into the single format
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # Nested <track> tags carry subtitles/captions
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2190
2191     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2192         formats = []
2193         hdcore_sign = 'hdcore=3.7.0'
2194         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2195         hds_host = hosts.get('hds')
2196         if hds_host:
2197             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2198         if 'hdcore=' not in f4m_url:
2199             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2200         f4m_formats = self._extract_f4m_formats(
2201             f4m_url, video_id, f4m_id='hds', fatal=False)
2202         for entry in f4m_formats:
2203             entry.update({'extra_param_to_segment_url': hdcore_sign})
2204         formats.extend(f4m_formats)
2205         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2206         hls_host = hosts.get('hls')
2207         if hls_host:
2208             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2209         formats.extend(self._extract_m3u8_formats(
2210             m3u8_url, video_id, 'mp4', 'm3u8_native',
2211             m3u8_id='hls', fatal=False))
2212         return formats
2213
2214     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2215         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2216         url_base = self._search_regex(
2217             r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
2218         http_base_url = '%s:%s' % ('http', url_base)
2219         formats = []
2220         if 'm3u8' not in skip_protocols:
2221             formats.extend(self._extract_m3u8_formats(
2222                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2223                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2224         if 'f4m' not in skip_protocols:
2225             formats.extend(self._extract_f4m_formats(
2226                 http_base_url + '/manifest.f4m',
2227                 video_id, f4m_id='hds', fatal=False))
2228         if 'dash' not in skip_protocols:
2229             formats.extend(self._extract_mpd_formats(
2230                 http_base_url + '/manifest.mpd',
2231                 video_id, mpd_id='dash', fatal=False))
2232         if re.search(r'(?:/smil:|\.smil)', url_base):
2233             if 'smil' not in skip_protocols:
2234                 rtmp_formats = self._extract_smil_formats(
2235                     http_base_url + '/jwplayer.smil',
2236                     video_id, fatal=False)
2237                 for rtmp_format in rtmp_formats:
2238                     rtsp_format = rtmp_format.copy()
2239                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2240                     del rtsp_format['play_path']
2241                     del rtsp_format['ext']
2242                     rtsp_format.update({
2243                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2244                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2245                         'protocol': 'rtsp',
2246                     })
2247                     formats.extend([rtmp_format, rtsp_format])
2248         else:
2249             for protocol in ('rtmp', 'rtsp'):
2250                 if protocol not in skip_protocols:
2251                     formats.append({
2252                         'url': '%s:%s' % (protocol, url_base),
2253                         'format_id': protocol,
2254                         'protocol': protocol,
2255                     })
2256         return formats
2257
2258     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2259         mobj = re.search(
2260             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2261             webpage)
2262         if mobj:
2263             try:
2264                 jwplayer_data = self._parse_json(mobj.group('options'),
2265                                                  video_id=video_id,
2266                                                  transform_source=transform_source)
2267             except ExtractorError:
2268                 pass
2269             else:
2270                 if isinstance(jwplayer_data, dict):
2271                     return jwplayer_data
2272
2273     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2274         jwplayer_data = self._find_jwplayer_data(
2275             webpage, video_id, transform_source=js_to_json)
2276         return self._parse_jwplayer_data(
2277             jwplayer_data, video_id, *args, **kwargs)
2278
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer setup dict into an info dict (single entry)
        or a playlist result (multiple entries).

        Handles the several historical shapes of JWPlayer configuration:
        flattened playlists, single non-list playlist items and flattened
        sources.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
            self._sort_formats(formats)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    # Only caption tracks are treated as subtitles
                    if track.get('kind') != 'captions':
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    # Fall back to 'en' when no label is given
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entries.append({
                'id': this_video_id,
                # With require_title a missing title raises KeyError
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'formats': formats,
            })
        # A single entry collapses into a plain info dict
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2335
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Build a formats list from a JWPlayer 'sources' array.

        Dispatches each source by MIME type / extension to the appropriate
        manifest extractor (HLS, DASH, SMIL) or builds a direct-URL format.
        Duplicate source URLs are skipped.
        """
        urls = []
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = self._proto_relative_url(source.get('file'))
            if not source_url:
                continue
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
            # De-duplicate: the same URL may appear under several sources
            if source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2402
2403     def _live_title(self, name):
2404         """ Generate the title for a live video """
2405         now = datetime.datetime.now()
2406         now_str = now.strftime('%Y-%m-%d %H:%M')
2407         return name + ' ' + now_str
2408
2409     def _int(self, v, name, fatal=False, **kwargs):
2410         res = int_or_none(v, **kwargs)
2411         if 'get_attr' in kwargs:
2412             print(getattr(v, kwargs['get_attr']))
2413         if res is None:
2414             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2415             if fatal:
2416                 raise ExtractorError(msg)
2417             else:
2418                 self._downloader.report_warning(msg)
2419         return res
2420
2421     def _float(self, v, name, fatal=False, **kwargs):
2422         res = float_or_none(v, **kwargs)
2423         if res is None:
2424             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2425             if fatal:
2426                 raise ExtractorError(msg)
2427             else:
2428                 self._downloader.report_warning(msg)
2429         return res
2430
2431     def _set_cookie(self, domain, name, value, expire_time=None):
2432         cookie = compat_cookiejar.Cookie(
2433             0, name, value, None, None, domain, None,
2434             None, '/', True, False, expire_time, '', None, None, None)
2435         self._downloader.cookiejar.set_cookie(cookie)
2436
2437     def _get_cookies(self, url):
2438         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2439         req = sanitized_Request(url)
2440         self._downloader.cookiejar.add_cookie_header(req)
2441         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2442
2443     def get_testcases(self, include_onlymatching=False):
2444         t = getattr(self, '_TEST', None)
2445         if t:
2446             assert not hasattr(self, '_TESTS'), \
2447                 '%s has _TEST and _TESTS' % type(self).__name__
2448             tests = [t]
2449         else:
2450             tests = getattr(self, '_TESTS', [])
2451         for t in tests:
2452             if not include_onlymatching and t.get('only_matching', False):
2453                 continue
2454             t['name'] = type(self).__name__[:-len('IE')]
2455             yield t
2456
2457     def is_suitable(self, age_limit):
2458         """ Test whether the extractor is generally suitable for the given
2459         age limit (i.e. pornographic sites are not, all others usually are) """
2460
2461         any_restricted = False
2462         for tc in self.get_testcases(include_onlymatching=False):
2463             if tc.get('playlist', []):
2464                 tc = tc['playlist'][0]
2465             is_restricted = age_restricted(
2466                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2467             if not is_restricted:
2468                 return True
2469             any_restricted = any_restricted or is_restricted
2470         return not any_restricted
2471
2472     def extract_subtitles(self, *args, **kwargs):
2473         if (self._downloader.params.get('writesubtitles', False) or
2474                 self._downloader.params.get('listsubtitles')):
2475             return self._get_subtitles(*args, **kwargs)
2476         return {}
2477
2478     def _get_subtitles(self, *args, **kwargs):
2479         raise NotImplementedError('This method must be implemented by subclasses')
2480
2481     @staticmethod
2482     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2483         """ Merge subtitle items for one language. Items with duplicated URLs
2484         will be dropped. """
2485         list1_urls = set([item['url'] for item in subtitle_list1])
2486         ret = list(subtitle_list1)
2487         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2488         return ret
2489
2490     @classmethod
2491     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2492         """ Merge two subtitle dictionaries, language by language. """
2493         ret = dict(subtitle_dict1)
2494         for lang in subtitle_dict2:
2495             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2496         return ret
2497
2498     def extract_automatic_captions(self, *args, **kwargs):
2499         if (self._downloader.params.get('writeautomaticsub', False) or
2500                 self._downloader.params.get('listsubtitles')):
2501             return self._get_automatic_captions(*args, **kwargs)
2502         return {}
2503
    def _get_automatic_captions(self, *args, **kwargs):
        """Stub: extractors that support automatic captions override this;
        called by extract_automatic_captions when enabled."""
        raise NotImplementedError('This method must be implemented by subclasses')
2506
2507     def mark_watched(self, *args, **kwargs):
2508         if (self._downloader.params.get('mark_watched', False) and
2509                 (self._get_login_info()[0] is not None or
2510                     self._downloader.params.get('cookiefile') is not None)):
2511             self._mark_watched(*args, **kwargs)
2512
    def _mark_watched(self, *args, **kwargs):
        """Stub: extractors that can mark videos watched override this;
        called by mark_watched when the option and credentials allow it."""
        raise NotImplementedError('This method must be implemented by subclasses')
2515
2516     def geo_verification_headers(self):
2517         headers = {}
2518         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2519         if geo_verification_proxy:
2520             headers['Ytdl-request-proxy'] = geo_verification_proxy
2521         return headers
2522
2523     def _generic_id(self, url):
2524         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2525
2526     def _generic_title(self, url):
2527         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2528
2529
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (1 result), a positive integer, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """True when url matches this extractor's search-query format."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search URL and fetch the requested number of results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')
        if not prefix:
            # No count given: a single result
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        # Public accessor for the class-level search key
        return self._SEARCH_KEY