_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30     compat_xml_parse_error,
  31 )
  32 from ..downloader.f4m import (
  33     get_base_url,
  34     remove_encrypted_media,
  35 )
  36 from ..utils import (
  37     NO_DEFAULT,
  38     age_restricted,
  39     base_url,
  40     bug_reports_message,
  41     clean_html,
  42     compiled_regex_type,
  43     determine_ext,
  44     determine_protocol,
  45     error_to_compat_str,
  46     ExtractorError,
  47     extract_attributes,
  48     fix_xml_ampersands,
  49     float_or_none,
  50     GeoRestrictedError,
  51     GeoUtils,
  52     int_or_none,
  53     js_to_json,
  54     mimetype2ext,
  55     orderedSet,
  56     parse_codecs,
  57     parse_duration,
  58     parse_iso8601,
  59     parse_m3u8_attributes,
  60     RegexNotFoundError,
  61     sanitized_Request,
  62     sanitize_filename,
  63     unescapeHTML,
  64     unified_strdate,
  65     unified_timestamp,
  66     update_Request,
  67     update_url_query,
  68     urljoin,
  69     url_basename,
  70     xpath_element,
  71     xpath_text,
  72     xpath_with_ns,
  73 )
  74
  75
  76 class InfoExtractor(object):
  77     """Information Extractor class.
  78
  79     Information extractors are the classes that, given a URL, extract
  80     information about the video (or videos) the URL refers to. This
  81     information includes the real video URL, the video title, author and
  82     others. The information is stored in a dictionary which is then
  83     passed to the YoutubeDL. The YoutubeDL processes this
  84     information possibly downloading the video to the file system, among
  85     other possible outcomes.
  86
  87     The type field determines the type of the result.
  88     By far the most common value (and the default if _type is missing) is
  89     "video", which indicates a single video.
  90
  91     For a video, the dictionaries must include the following fields:
  92
  93     id:             Video identifier.
  94     title:          Video title, unescaped.
  95
  96     Additionally, it must contain either a formats entry or a url one:
  97
  98     formats:        A list of dictionaries for each format available, ordered
  99                     from worst to best quality.
 100
 101                     Potential fields:
 102                     * url        Mandatory. The URL of the video file
 103                     * manifest_url
 104                                  The URL of the manifest file in case of
 105                                  fragmented media (DASH, hls, hds)
 106                     * ext        Will be calculated from URL if missing
 107                     * format     A human-readable description of the format
 108                                  ("mp4 container with h264/opus").
 109                                  Calculated from the format_id, width, height.
 110                                  and format_note fields if missing.
 111                     * format_id  A short description of the format
 112                                  ("mp4_h264_opus" or "19").
 113                                 Technically optional, but strongly recommended.
 114                     * format_note Additional info about the format
 115                                  ("3D" or "DASH video")
 116                     * width      Width of the video, if known
 117                     * height     Height of the video, if known
 118                     * resolution Textual description of width and height
 119                     * tbr        Average bitrate of audio and video in KBit/s
 120                     * abr        Average audio bitrate in KBit/s
 121                     * acodec     Name of the audio codec in use
 122                     * asr        Audio sampling rate in Hertz
 123                     * vbr        Average video bitrate in KBit/s
 124                     * fps        Frame rate
 125                     * vcodec     Name of the video codec in use
 126                     * container  Name of the container format
 127                     * filesize   The number of bytes, if known in advance
 128                     * filesize_approx  An estimate for the number of bytes
 129                     * player_url SWF Player URL (used for rtmpdump).
 130                     * protocol   The protocol that will be used for the actual
 131                                  download, lower-case.
 132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 133                                  "m3u8", "m3u8_native" or "http_dash_segments".
 134                     * fragment_base_url
 135                                  Base URL for fragments. Each fragment's path
 136                                  value (if present) will be relative to
 137                                  this URL.
 138                     * fragments  A list of fragments of a fragmented media.
 139                                  Each fragment entry must contain either an url
 140                                  or a path. If an url is present it should be
 141                                  considered by a client. Otherwise both path and
 142                                  fragment_base_url must be present. Here is
 143                                  the list of all potential fields:
 144                                  * "url" - fragment's URL
 145                                  * "path" - fragment's path relative to
 146                                             fragment_base_url
 147                                  * "duration" (optional, int or float)
 148                                  * "filesize" (optional, int)
 149                     * preference Order number of this format. If this field is
 150                                  present and not None, the formats get sorted
 151                                  by this field, regardless of all other values.
 152                                  -1 for default (order by other properties),
 153                                  -2 or smaller for less than default.
 154                                  < -1000 to hide the format (if there is
 155                                     another one which is strictly better)
 156                     * language   Language code, e.g. "de" or "en-US".
 157                     * language_preference  Is this in the language mentioned in
 158                                  the URL?
 159                                  10 if it's what the URL is about,
 160                                  -1 for default (don't know),
 161                                  -10 otherwise, other values reserved for now.
 162                     * quality    Order number of the video quality of this
 163                                  format, irrespective of the file format.
 164                                  -1 for default (order by other properties),
 165                                  -2 or smaller for less than default.
 166                     * source_preference  Order number for this video source
 167                                   (quality takes higher priority)
 168                                  -1 for default (order by other properties),
 169                                  -2 or smaller for less than default.
 170                     * http_headers  A dictionary of additional HTTP headers
 171                                  to add to the request.
 172                     * stretched_ratio  If given and not 1, indicates that the
 173                                  video's pixels are not square.
 174                                  width : height ratio as float.
 175                     * no_resume  The server does not support resuming the
 176                                  (HTTP or RTMP) download. Boolean.
 177
 178     url:            Final video URL.
 179     ext:            Video filename extension.
 180     format:         The video format, defaults to ext (used for --get-format)
 181     player_url:     SWF Player URL (used for rtmpdump).
 182
 183     The following fields are optional:
 184
 185     alt_title:      A secondary title of the video.
 186     display_id      An alternative identifier for the video, not necessarily
 187                     unique, but available before title. Typically, id is
 188                     something like "4234987", title "Dancing naked mole rats",
 189                     and display_id "dancing-naked-mole-rats"
 190     thumbnails:     A list of dictionaries, with the following entries:
 191                         * "id" (optional, string) - Thumbnail format ID
 192                         * "url"
 193                         * "preference" (optional, int) - quality of the image
 194                         * "width" (optional, int)
 195                         * "height" (optional, int)
  196                         * "resolution" (optional, string "{width}x{height}",
 197                                         deprecated)
 198                         * "filesize" (optional, int)
 199     thumbnail:      Full URL to a video thumbnail image.
 200     description:    Full video description.
 201     uploader:       Full name of the video uploader.
 202     license:        License name the video is licensed under.
 203     creator:        The creator of the video.
 204     release_date:   The date (YYYYMMDD) when the video was released.
 205     timestamp:      UNIX timestamp of the moment the video became available.
 206     upload_date:    Video upload date (YYYYMMDD).
 207                     If not explicitly set, calculated from timestamp.
 208     uploader_id:    Nickname or id of the video uploader.
 209     uploader_url:   Full URL to a personal webpage of the video uploader.
 210     location:       Physical location where the video was filmed.
 211     subtitles:      The available subtitles as a dictionary in the format
 212                     {tag: subformats}. "tag" is usually a language code, and
 213                     "subformats" is a list sorted from lower to higher
 214                     preference, each element is a dictionary with the "ext"
 215                     entry and one of:
 216                         * "data": The subtitles file contents
 217                         * "url": A URL pointing to the subtitles file
 218                     "ext" will be calculated from URL if missing
 219     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 220                     automatically generated captions
 221     duration:       Length of the video in seconds, as an integer or float.
 222     view_count:     How many users have watched the video on the platform.
 223     like_count:     Number of positive ratings of the video
 224     dislike_count:  Number of negative ratings of the video
 225     repost_count:   Number of reposts of the video
  226     average_rating: Average rating given by users, the scale used depends on the webpage
 227     comment_count:  Number of comments on the video
 228     comments:       A list of comments, each with one or more of the following
 229                     properties (all but one of text or html optional):
 230                         * "author" - human-readable name of the comment author
 231                         * "author_id" - user ID of the comment author
 232                         * "id" - Comment ID
 233                         * "html" - Comment as HTML
 234                         * "text" - Plain text of the comment
 235                         * "timestamp" - UNIX timestamp of comment
 236                         * "parent" - ID of the comment this one is replying to.
 237                                      Set to "root" to indicate that this is a
 238                                      comment to the original video.
 239     age_limit:      Age restriction for the video, as an integer (years)
 240     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 241                     should allow to get the same result again. (It will be set
 242                     by YoutubeDL if it's missing)
 243     categories:     A list of categories that the video falls in, for example
 244                     ["Sports", "Berlin"]
 245     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 246     is_live:        True, False, or None (=unknown). Whether this video is a
 247                     live stream that goes on instead of a fixed-length video.
 248     start_time:     Time in seconds where the reproduction should start, as
 249                     specified in the URL.
 250     end_time:       Time in seconds where the reproduction should end, as
 251                     specified in the URL.
 252     chapters:       A list of dictionaries, with the following entries:
 253                         * "start_time" - The start time of the chapter in seconds
 254                         * "end_time" - The end time of the chapter in seconds
 255                         * "title" (optional, string)
 256
 257     The following fields should only be used when the video belongs to some logical
 258     chapter or section:
 259
 260     chapter:        Name or title of the chapter the video belongs to.
 261     chapter_number: Number of the chapter the video belongs to, as an integer.
 262     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 263
 264     The following fields should only be used when the video is an episode of some
 265     series, programme or podcast:
 266
 267     series:         Title of the series or programme the video episode belongs to.
 268     season:         Title of the season the video episode belongs to.
 269     season_number:  Number of the season the video episode belongs to, as an integer.
 270     season_id:      Id of the season the video episode belongs to, as a unicode string.
 271     episode:        Title of the video episode. Unlike mandatory video title field,
 272                     this field should denote the exact title of the video episode
 273                     without any kind of decoration.
 274     episode_number: Number of the video episode within a season, as an integer.
 275     episode_id:     Id of the video episode, as a unicode string.
 276
 277     The following fields should only be used when the media is a track or a part of
 278     a music album:
 279
 280     track:          Title of the track.
 281     track_number:   Number of the track within an album or a disc, as an integer.
 282     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 283                     as a unicode string.
 284     artist:         Artist(s) of the track.
 285     genre:          Genre(s) of the track.
 286     album:          Title of the album the track belongs to.
 287     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 288     album_artist:   List of all artists appeared on the album (e.g.
 289                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 290                     and compilations).
 291     disc_number:    Number of the disc or other physical medium the track belongs to,
 292                     as an integer.
 293     release_year:   Year (YYYY) when the album was released.
 294
 295     Unless mentioned otherwise, the fields should be Unicode strings.
 296
 297     Unless mentioned otherwise, None is equivalent to absence of information.
 298
 299
 300     _type "playlist" indicates multiple videos.
 301     There must be a key "entries", which is a list, an iterable, or a PagedList
 302     object, each element of which is a valid dictionary by this specification.
 303
 304     Additionally, playlists can have "title", "description" and "id" attributes
 305     with the same semantics as videos (see above).
 306
 307
 308     _type "multi_video" indicates that there are multiple videos that
  309     form a single show, for example multiple acts of an opera or TV episode.
 310     It must have an entries key like a playlist and contain all the keys
 311     required for a video at the same time.
 312
 313
 314     _type "url" indicates that the video must be extracted from another
 315     location, possibly by a different extractor. Its only required key is:
 316     "url" - the next URL to extract.
 317     The key "ie_key" can be set to the class name (minus the trailing "IE",
 318     e.g. "Youtube") if the extractor class is known in advance.
 319     Additionally, the dictionary may have any properties of the resolved entity
 320     known in advance, for example "title" if the title of the referred video is
 321     known ahead of time.
 322
 323
 324     _type "url_transparent" entities have the same specification as "url", but
 325     indicate that the given additional information is more precise than the one
 326     associated with the resolved URL.
 327     This is useful when a site employs a video service that hosts the video and
 328     its technical metadata, but that video service does not embed a useful
 329     title, description etc.
 330
 331
 332     Subclasses of this one should re-define the _real_initialize() and
 333     _real_extract() methods and define a _VALID_URL regexp.
 334     Probably, they should also be added to the list of extractors.
 335
 336     _GEO_BYPASS attribute may be set to False in order to disable
 337     geo restriction bypass mechanisms for a particular extractor.
 338     Though it won't disable explicit geo restriction bypass based on
 339     country code provided with geo_bypass_country. (experimental)
 340
 341     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 342     countries for this extractor. One of these countries will be used by
 343     geo restriction bypass mechanism right away in order to bypass
 344     geo restriction, of course, if the mechanism is not disabled. (experimental)
 345
 346     NB: both these geo attributes are experimental and may change in future
 347     or be completely removed.
 348
 349     Finally, the _WORKING attribute should be set to False for broken IEs
 350     in order to warn the users and skip the tests.
 351     """
 352
    # True once initialize() has run _real_initialize() for this instance
    _ready = False
    # Downloader object this IE is attached to, assigned by set_downloader()
    _downloader = None
    # Fake IP sent as X-Forwarded-For HTTP header once geo bypass has picked one
    _x_forwarded_for_ip = None
    # Set to False in a subclass to disable the geo restriction bypass
    _GEO_BYPASS = True
    # List of presumably geo unrestricted countries for this extractor, if any
    _GEO_COUNTRIES = None
    # Set to False for broken IEs
    _WORKING = True
 359
 360     def __init__(self, downloader=None):
 361         """Constructor. Receives an optional downloader."""
 362         self._ready = False
 363         self._x_forwarded_for_ip = None
 364         self.set_downloader(downloader)
 365
 366     @classmethod
 367     def suitable(cls, url):
 368         """Receives a URL and returns True if suitable for this IE."""
 369
 370         # This does not use has/getattr intentionally - we want to know whether
 371         # we have cached the regexp for *this* class, whereas getattr would also
 372         # match the superclass
 373         if '_VALID_URL_RE' not in cls.__dict__:
 374             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 375         return cls._VALID_URL_RE.match(url) is not None
 376
 377     @classmethod
 378     def _match_id(cls, url):
 379         if '_VALID_URL_RE' not in cls.__dict__:
 380             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 381         m = cls._VALID_URL_RE.match(url)
 382         assert m
 383         return compat_str(m.group('id'))
 384
 385     @classmethod
 386     def working(cls):
 387         """Getter method for _WORKING."""
 388         return cls._WORKING
 389
 390     def initialize(self):
 391         """Initializes an instance (authentication, etc)."""
 392         self._initialize_geo_bypass(self._GEO_COUNTRIES)
 393         if not self._ready:
 394             self._real_initialize()
 395             self._ready = True
 396
 397     def _initialize_geo_bypass(self, countries):
 398         """
 399         Initialize geo restriction bypass mechanism.
 400
 401         This method is used to initialize geo bypass mechanism based on faking
 402         X-Forwarded-For HTTP header. A random country from provided country list
 403         is selected and a random IP belonging to this country is generated. This
 404         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 405         HTTP requests.
 406
 407         This method will be used for initial geo bypass mechanism initialization
 408         during the instance initialization with _GEO_COUNTRIES.
 409
 410         You may also manually call it from extractor's code if geo countries
 411         information is not available beforehand (e.g. obtained during
 412         extraction) or due to some another reason.
 413         """
 414         if not self._x_forwarded_for_ip:
 415             country_code = self._downloader.params.get('geo_bypass_country', None)
 416             # If there is no explicit country for geo bypass specified and
 417             # the extractor is known to be geo restricted let's fake IP
 418             # as X-Forwarded-For right away.
 419             if (not country_code and
 420                     self._GEO_BYPASS and
 421                     self._downloader.params.get('geo_bypass', True) and
 422                     countries):
 423                 country_code = random.choice(countries)
 424             if country_code:
 425                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 426                 if self._downloader.params.get('verbose', False):
 427                     self._downloader.to_screen(
 428                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 429                         % (self._x_forwarded_for_ip, country_code.upper()))
 430
 431     def extract(self, url):
 432         """Extracts URL information and returns it in list of dicts."""
 433         try:
 434             for _ in range(2):
 435                 try:
 436                     self.initialize()
 437                     ie_result = self._real_extract(url)
 438                     if self._x_forwarded_for_ip:
 439                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 440                     return ie_result
 441                 except GeoRestrictedError as e:
 442                     if self.__maybe_fake_ip_and_retry(e.countries):
 443                         continue
 444                     raise
 445         except ExtractorError:
 446             raise
 447         except compat_http_client.IncompleteRead as e:
 448             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 449         except (KeyError, StopIteration) as e:
 450             raise ExtractorError('An extractor error has occurred.', cause=e)
 451
 452     def __maybe_fake_ip_and_retry(self, countries):
 453         if (not self._downloader.params.get('geo_bypass_country', None) and
 454                 self._GEO_BYPASS and
 455                 self._downloader.params.get('geo_bypass', True) and
 456                 not self._x_forwarded_for_ip and
 457                 countries):
 458             country_code = random.choice(countries)
 459             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 460             if self._x_forwarded_for_ip:
 461                 self.report_warning(
 462                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 463                     % (self._x_forwarded_for_ip, country_code.upper()))
 464                 return True
 465         return False
 466
 467     def set_downloader(self, downloader):
 468         """Sets the downloader for this IE."""
 469         self._downloader = downloader
 470
 471     def _real_initialize(self):
 472         """Real initialization process. Redefine in subclasses."""
 473         pass
 474
 475     def _real_extract(self, url):
 476         """Real extraction process. Redefine in subclasses."""
 477         pass
 478
 479     @classmethod
 480     def ie_key(cls):
 481         """A string for getting the InfoExtractor with get_info_extractor"""
 482         return compat_str(cls.__name__[:-2])
 483
 484     @property
 485     def IE_NAME(self):
 486         return compat_str(type(self).__name__[:-2])
 487
 488     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 489         """ Returns the response handle """
 490         if note is None:
 491             self.report_download_webpage(video_id)
 492         elif note is not False:
 493             if video_id is None:
 494                 self.to_screen('%s' % (note,))
 495             else:
 496                 self.to_screen('%s: %s' % (video_id, note))
 497         if isinstance(url_or_request, compat_urllib_request.Request):
 498             url_or_request = update_Request(
 499                 url_or_request, data=data, headers=headers, query=query)
 500         else:
 501             if query:
 502                 url_or_request = update_url_query(url_or_request, query)
 503             if data is not None or headers:
 504                 url_or_request = sanitized_Request(url_or_request, data, headers)
 505         try:
 506             return self._downloader.urlopen(url_or_request)
 507         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 508             if errnote is False:
 509                 return False
 510             if errnote is None:
 511                 errnote = 'Unable to download webpage'
 512
 513             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 514             if fatal:
 515                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 516             else:
 517                 self._downloader.report_warning(errmsg)
 518                 return False
 519
 520     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 521         """ Returns a tuple (page content as string, URL handle) """
 522         # Strip hashes from the URL (#1038)
 523         if isinstance(url_or_request, (compat_str, str)):
 524             url_or_request = url_or_request.partition('#')[0]
 525
 526         # Some sites check X-Forwarded-For HTTP header in order to figure out
 527         # the origin of the client behind proxy. This allows bypassing geo
 528         # restriction by faking this header's value to IP that belongs to some
 529         # geo unrestricted country. We will do so once we encounter any
 530         # geo restriction error.
 531         if self._x_forwarded_for_ip:
 532             if 'X-Forwarded-For' not in headers:
 533                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 534
 535         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 536         if urlh is False:
 537             assert not fatal
 538             return False
 539         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 540         return (content, urlh)
 541
 542     @staticmethod
 543     def _guess_encoding_from_content(content_type, webpage_bytes):
 544         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 545         if m:
 546             encoding = m.group(1)
 547         else:
 548             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 549                           webpage_bytes[:1024])
 550             if m:
 551                 encoding = m.group(1).decode('ascii')
 552             elif webpage_bytes.startswith(b'\xff\xfe'):
 553                 encoding = 'utf-16'
 554             else:
 555                 encoding = 'utf-8'
 556
 557         return encoding
 558
 559     def __check_blocked(self, content):
 560         first_block = content[:512]
 561         if ('<title>Access to this site is blocked</title>' in content and
 562                 'Websense' in first_block):
 563             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 564             blocked_iframe = self._html_search_regex(
 565                 r'<iframe src="([^"]+)"', content,
 566                 'Websense information URL', default=None)
 567             if blocked_iframe:
 568                 msg += ' Visit %s for more details' % blocked_iframe
 569             raise ExtractorError(msg, expected=True)
 570         if '<title>The URL you requested has been blocked</title>' in first_block:
 571             msg = (
 572                 'Access to this webpage has been blocked by Indian censorship. '
 573                 'Use a VPN or proxy server (with --proxy) to route around it.')
 574             block_msg = self._html_search_regex(
 575                 r'</h1><p>(.*?)</p>',
 576                 content, 'block message', default=None)
 577             if block_msg:
 578                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 579             raise ExtractorError(msg, expected=True)
 580         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 581                 'blocklist.rkn.gov.ru' in content):
 582             raise ExtractorError(
 583                 'Access to this webpage has been blocked by decision of the Russian government. '
 584                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 585                 expected=True)
 586
 587     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 588         content_type = urlh.headers.get('Content-Type', '')
 589         webpage_bytes = urlh.read()
 590         if prefix is not None:
 591             webpage_bytes = prefix + webpage_bytes
 592         if not encoding:
 593             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 594         if self._downloader.params.get('dump_intermediate_pages', False):
 595             self.to_screen('Dumping request to ' + urlh.geturl())
 596             dump = base64.b64encode(webpage_bytes).decode('ascii')
 597             self._downloader.to_screen(dump)
 598         if self._downloader.params.get('write_pages', False):
 599             basen = '%s_%s' % (video_id, urlh.geturl())
 600             if len(basen) > 240:
 601                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 602                 basen = basen[:240 - len(h)] + h
 603             raw_filename = basen + '.dump'
 604             filename = sanitize_filename(raw_filename, restricted=True)
 605             self.to_screen('Saving request to ' + filename)
 606             # Working around MAX_PATH limitation on Windows (see
 607             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 608             if compat_os_name == 'nt':
 609                 absfilepath = os.path.abspath(filename)
 610                 if len(absfilepath) > 259:
 611                     filename = '\\\\?\\' + absfilepath
 612             with open(filename, 'wb') as outf:
 613                 outf.write(webpage_bytes)
 614
 615         try:
 616             content = webpage_bytes.decode(encoding, 'replace')
 617         except LookupError:
 618             content = webpage_bytes.decode('utf-8', 'replace')
 619
 620         self.__check_blocked(content)
 621
 622         return content
 623
 624     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 625         """ Returns the data of the page as a string """
 626         success = False
 627         try_count = 0
 628         while success is False:
 629             try:
 630                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 631                 success = True
 632             except compat_http_client.IncompleteRead as e:
 633                 try_count += 1
 634                 if try_count >= tries:
 635                     raise e
 636                 self._sleep(timeout, video_id)
 637         if res is False:
 638             return res
 639         else:
 640             content, _ = res
 641             return content
 642
 643     def _download_xml(self, url_or_request, video_id,
 644                       note='Downloading XML', errnote='Unable to download XML',
 645                       transform_source=None, fatal=True, encoding=None,
 646                       data=None, headers={}, query={}):
 647         """Return the xml as an xml.etree.ElementTree.Element"""
 648         xml_string = self._download_webpage(
 649             url_or_request, video_id, note, errnote, fatal=fatal,
 650             encoding=encoding, data=data, headers=headers, query=query)
 651         if xml_string is False:
 652             return xml_string
 653         return self._parse_xml(
 654             xml_string, video_id, transform_source=transform_source,
 655             fatal=fatal)
 656
 657     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 658         if transform_source:
 659             xml_string = transform_source(xml_string)
 660         try:
 661             return compat_etree_fromstring(xml_string.encode('utf-8'))
 662         except compat_xml_parse_error as ve:
 663             errmsg = '%s: Failed to parse XML ' % video_id
 664             if fatal:
 665                 raise ExtractorError(errmsg, cause=ve)
 666             else:
 667                 self.report_warning(errmsg + str(ve))
 668
 669     def _download_json(self, url_or_request, video_id,
 670                        note='Downloading JSON metadata',
 671                        errnote='Unable to download JSON metadata',
 672                        transform_source=None,
 673                        fatal=True, encoding=None, data=None, headers={}, query={}):
 674         json_string = self._download_webpage(
 675             url_or_request, video_id, note, errnote, fatal=fatal,
 676             encoding=encoding, data=data, headers=headers, query=query)
 677         if (not fatal) and json_string is False:
 678             return None
 679         return self._parse_json(
 680             json_string, video_id, transform_source=transform_source, fatal=fatal)
 681
 682     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 683         if transform_source:
 684             json_string = transform_source(json_string)
 685         try:
 686             return json.loads(json_string)
 687         except ValueError as ve:
 688             errmsg = '%s: Failed to parse JSON ' % video_id
 689             if fatal:
 690                 raise ExtractorError(errmsg, cause=ve)
 691             else:
 692                 self.report_warning(errmsg + str(ve))
 693
 694     def report_warning(self, msg, video_id=None):
 695         idstr = '' if video_id is None else '%s: ' % video_id
 696         self._downloader.report_warning(
 697             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 698
 699     def to_screen(self, msg):
 700         """Print msg to screen, prefixing it with '[ie_name]'"""
 701         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 702
 703     def report_extraction(self, id_or_name):
 704         """Report information extraction."""
 705         self.to_screen('%s: Extracting information' % id_or_name)
 706
 707     def report_download_webpage(self, video_id):
 708         """Report webpage download."""
 709         self.to_screen('%s: Downloading webpage' % video_id)
 710
 711     def report_age_confirmation(self):
 712         """Report attempt to confirm age."""
 713         self.to_screen('Confirming age')
 714
 715     def report_login(self):
 716         """Report attempt to log in."""
 717         self.to_screen('Logging in')
 718
 719     @staticmethod
 720     def raise_login_required(msg='This video is only available for registered users'):
 721         raise ExtractorError(
 722             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 723             expected=True)
 724
 725     @staticmethod
 726     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 727         raise GeoRestrictedError(msg, countries=countries)
 728
 729     # Methods for following #608
 730     @staticmethod
 731     def url_result(url, ie=None, video_id=None, video_title=None):
 732         """Returns a URL that points to a page that should be processed"""
 733         # TODO: ie should be the class used for getting the info
 734         video_info = {'_type': 'url',
 735                       'url': url,
 736                       'ie_key': ie}
 737         if video_id is not None:
 738             video_info['id'] = video_id
 739         if video_title is not None:
 740             video_info['title'] = video_title
 741         return video_info
 742
 743     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 744         urls = orderedSet(
 745             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 746             for m in matches)
 747         return self.playlist_result(
 748             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 749
 750     @staticmethod
 751     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 752         """Returns a playlist"""
 753         video_info = {'_type': 'playlist',
 754                       'entries': entries}
 755         if playlist_id:
 756             video_info['id'] = playlist_id
 757         if playlist_title:
 758             video_info['title'] = playlist_title
 759         if playlist_description:
 760             video_info['description'] = playlist_description
 761         return video_info
 762
 763     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 764         """
 765         Perform a regex search on the given string, using a single or a list of
 766         patterns returning the first matching group.
 767         In case of failure return a default value or raise a WARNING or a
 768         RegexNotFoundError, depending on fatal, specifying the field name.
 769         """
 770         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 771             mobj = re.search(pattern, string, flags)
 772         else:
 773             for p in pattern:
 774                 mobj = re.search(p, string, flags)
 775                 if mobj:
 776                     break
 777
 778         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 779             _name = '\033[0;34m%s\033[0m' % name
 780         else:
 781             _name = name
 782
 783         if mobj:
 784             if group is None:
 785                 # return the first matching group
 786                 return next(g for g in mobj.groups() if g is not None)
 787             else:
 788                 return mobj.group(group)
 789         elif default is not NO_DEFAULT:
 790             return default
 791         elif fatal:
 792             raise RegexNotFoundError('Unable to extract %s' % _name)
 793         else:
 794             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 795             return None
 796
 797     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 798         """
 799         Like _search_regex, but strips HTML tags and unescapes entities.
 800         """
 801         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 802         if res:
 803             return clean_html(res).strip()
 804         else:
 805             return res
 806
 807     def _get_netrc_login_info(self, netrc_machine=None):
 808         username = None
 809         password = None
 810         netrc_machine = netrc_machine or self._NETRC_MACHINE
 811
 812         if self._downloader.params.get('usenetrc', False):
 813             try:
 814                 info = netrc.netrc().authenticators(netrc_machine)
 815                 if info is not None:
 816                     username = info[0]
 817                     password = info[2]
 818                 else:
 819                     raise netrc.NetrcParseError(
 820                         'No authenticators for %s' % netrc_machine)
 821             except (IOError, netrc.NetrcParseError) as err:
 822                 self._downloader.report_warning(
 823                     'parsing .netrc: %s' % error_to_compat_str(err))
 824
 825         return username, password
 826
 827     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 828         """
 829         Get the login info as (username, password)
 830         First look for the manually specified credentials using username_option
 831         and password_option as keys in params dictionary. If no such credentials
 832         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 833         value.
 834         If there's no info available, return (None, None)
 835         """
 836         if self._downloader is None:
 837             return (None, None)
 838
 839         downloader_params = self._downloader.params
 840
 841         # Attempt to use provided username and password or .netrc data
 842         if downloader_params.get(username_option) is not None:
 843             username = downloader_params[username_option]
 844             password = downloader_params[password_option]
 845         else:
 846             username, password = self._get_netrc_login_info(netrc_machine)
 847
 848         return username, password
 849
 850     def _get_tfa_info(self, note='two-factor verification code'):
 851         """
 852         Get the two-factor authentication info
 853         TODO - asking the user will be required for sms/phone verify
 854         currently just uses the command line option
 855         If there's no info available, return None
 856         """
 857         if self._downloader is None:
 858             return None
 859         downloader_params = self._downloader.params
 860
 861         if downloader_params.get('twofactor') is not None:
 862             return downloader_params['twofactor']
 863
 864         return compat_getpass('Type %s and press [Return]: ' % note)
 865
 866     # Helper functions for extracting OpenGraph info
 867     @staticmethod
 868     def _og_regexes(prop):
 869         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 870         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 871                        % {'prop': re.escape(prop)})
 872         template = r'<meta[^>]+?%s[^>]+?%s'
 873         return [
 874             template % (property_re, content_re),
 875             template % (content_re, property_re),
 876         ]
 877
 878     @staticmethod
 879     def _meta_regex(prop):
 880         return r'''(?isx)<meta
 881                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 882                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 883
 884     def _og_search_property(self, prop, html, name=None, **kargs):
 885         if not isinstance(prop, (list, tuple)):
 886             prop = [prop]
 887         if name is None:
 888             name = 'OpenGraph %s' % prop[0]
 889         og_regexes = []
 890         for p in prop:
 891             og_regexes.extend(self._og_regexes(p))
 892         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 893         if escaped is None:
 894             return None
 895         return unescapeHTML(escaped)
 896
 897     def _og_search_thumbnail(self, html, **kargs):
 898         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 899
 900     def _og_search_description(self, html, **kargs):
 901         return self._og_search_property('description', html, fatal=False, **kargs)
 902
 903     def _og_search_title(self, html, **kargs):
 904         return self._og_search_property('title', html, **kargs)
 905
 906     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 907         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 908         if secure:
 909             regexes = self._og_regexes('video:secure_url') + regexes
 910         return self._html_search_regex(regexes, html, name, **kargs)
 911
 912     def _og_search_url(self, html, **kargs):
 913         return self._og_search_property('url', html, **kargs)
 914
 915     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 916         if not isinstance(name, (list, tuple)):
 917             name = [name]
 918         if display_name is None:
 919             display_name = name[0]
 920         return self._html_search_regex(
 921             [self._meta_regex(n) for n in name],
 922             html, display_name, fatal=fatal, group='content', **kwargs)
 923
 924     def _dc_search_uploader(self, html):
 925         return self._html_search_meta('dc.creator', html, 'uploader')
 926
 927     def _rta_search(self, html):
 928         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 929         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 930                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 931                      html):
 932             return 18
 933         return 0
 934
 935     def _media_rating_search(self, html):
 936         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 937         rating = self._html_search_meta('rating', html)
 938
 939         if not rating:
 940             return None
 941
 942         RATING_TABLE = {
 943             'safe for kids': 0,
 944             'general': 8,
 945             '14 years': 14,
 946             'mature': 17,
 947             'restricted': 19,
 948         }
 949         return RATING_TABLE.get(rating.lower())
 950
 951     def _family_friendly_search(self, html):
 952         # See http://schema.org/VideoObject
 953         family_friendly = self._html_search_meta(
 954             'isFamilyFriendly', html, default=None)
 955
 956         if not family_friendly:
 957             return None
 958
 959         RATING_TABLE = {
 960             '1': 0,
 961             'true': 0,
 962             '0': 18,
 963             'false': 18,
 964         }
 965         return RATING_TABLE.get(family_friendly.lower())
 966
 967     def _twitter_search_player(self, html):
 968         return self._html_search_meta('twitter:player', html,
 969                                       'twitter card player')
 970
 971     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 972         json_ld = self._search_regex(
 973             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 974             html, 'JSON-LD', group='json_ld', **kwargs)
 975         default = kwargs.get('default', NO_DEFAULT)
 976         if not json_ld:
 977             return default if default is not NO_DEFAULT else {}
 978         # JSON-LD may be malformed and thus `fatal` should be respected.
 979         # At the same time `default` may be passed that assumes `fatal=False`
 980         # for _search_regex. Let's simulate the same behavior here as well.
 981         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 982         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 983
 984     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 985         if isinstance(json_ld, compat_str):
 986             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 987         if not json_ld:
 988             return {}
 989         info = {}
 990         if not isinstance(json_ld, (list, tuple, dict)):
 991             return info
 992         if isinstance(json_ld, dict):
 993             json_ld = [json_ld]
 994
 995         def extract_video_object(e):
 996             assert e['@type'] == 'VideoObject'
 997             info.update({
 998                 'url': e.get('contentUrl'),
 999                 'title': unescapeHTML(e.get('name')),
1000                 'description': unescapeHTML(e.get('description')),
1001                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1002                 'duration': parse_duration(e.get('duration')),
1003                 'timestamp': unified_timestamp(e.get('uploadDate')),
1004                 'filesize': float_or_none(e.get('contentSize')),
1005                 'tbr': int_or_none(e.get('bitrate')),
1006                 'width': int_or_none(e.get('width')),
1007                 'height': int_or_none(e.get('height')),
1008                 'view_count': int_or_none(e.get('interactionCount')),
1009             })
1010
1011         for e in json_ld:
1012             if e.get('@context') == 'http://schema.org':
1013                 item_type = e.get('@type')
1014                 if expected_type is not None and expected_type != item_type:
1015                     return info
1016                 if item_type in ('TVEpisode', 'Episode'):
1017                     info.update({
1018                         'episode': unescapeHTML(e.get('name')),
1019                         'episode_number': int_or_none(e.get('episodeNumber')),
1020                         'description': unescapeHTML(e.get('description')),
1021                     })
1022                     part_of_season = e.get('partOfSeason')
1023                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1024                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1025                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1026                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1027                         info['series'] = unescapeHTML(part_of_series.get('name'))
1028                 elif item_type == 'Article':
1029                     info.update({
1030                         'timestamp': parse_iso8601(e.get('datePublished')),
1031                         'title': unescapeHTML(e.get('headline')),
1032                         'description': unescapeHTML(e.get('articleBody')),
1033                     })
1034                 elif item_type == 'VideoObject':
1035                     extract_video_object(e)
1036                     continue
1037                 video = e.get('video')
1038                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1039                     extract_video_object(video)
1040                 break
1041         return dict((k, v) for k, v in info.items() if v is not None)
1042
1043     @staticmethod
1044     def _hidden_inputs(html):
1045         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1046         hidden_inputs = {}
1047         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1048             attrs = extract_attributes(input)
1049             if not input:
1050                 continue
1051             if attrs.get('type') not in ('hidden', 'submit'):
1052                 continue
1053             name = attrs.get('name') or attrs.get('id')
1054             value = attrs.get('value')
1055             if name and value is not None:
1056                 hidden_inputs[name] = value
1057         return hidden_inputs
1058
1059     def _form_hidden_inputs(self, form_id, html):
1060         form = self._search_regex(
1061             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1062             html, '%s form' % form_id, group='form')
1063         return self._hidden_inputs(form)
1064
1065     def _sort_formats(self, formats, field_preference=None):
1066         if not formats:
1067             raise ExtractorError('No video formats found')
1068
1069         for f in formats:
1070             # Automatically determine tbr when missing based on abr and vbr (improves
1071             # formats sorting in some cases)
1072             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1073                 f['tbr'] = f['abr'] + f['vbr']
1074
1075         def _formats_key(f):
1076             # TODO remove the following workaround
1077             from ..utils import determine_ext
1078             if not f.get('ext') and 'url' in f:
1079                 f['ext'] = determine_ext(f['url'])
1080
1081             if isinstance(field_preference, (list, tuple)):
1082                 return tuple(
1083                     f.get(field)
1084                     if f.get(field) is not None
1085                     else ('' if field == 'format_id' else -1)
1086                     for field in field_preference)
1087
1088             preference = f.get('preference')
1089             if preference is None:
1090                 preference = 0
1091                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1092                     preference -= 0.5
1093
1094             protocol = f.get('protocol') or determine_protocol(f)
1095             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1096
1097             if f.get('vcodec') == 'none':  # audio only
1098                 preference -= 50
1099                 if self._downloader.params.get('prefer_free_formats'):
1100                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1101                 else:
1102                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1103                 ext_preference = 0
1104                 try:
1105                     audio_ext_preference = ORDER.index(f['ext'])
1106                 except ValueError:
1107                     audio_ext_preference = -1
1108             else:
1109                 if f.get('acodec') == 'none':  # video only
1110                     preference -= 40
1111                 if self._downloader.params.get('prefer_free_formats'):
1112                     ORDER = ['flv', 'mp4', 'webm']
1113                 else:
1114                     ORDER = ['webm', 'flv', 'mp4']
1115                 try:
1116                     ext_preference = ORDER.index(f['ext'])
1117                 except ValueError:
1118                     ext_preference = -1
1119                 audio_ext_preference = 0
1120
1121             return (
1122                 preference,
1123                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1124                 f.get('quality') if f.get('quality') is not None else -1,
1125                 f.get('tbr') if f.get('tbr') is not None else -1,
1126                 f.get('filesize') if f.get('filesize') is not None else -1,
1127                 f.get('vbr') if f.get('vbr') is not None else -1,
1128                 f.get('height') if f.get('height') is not None else -1,
1129                 f.get('width') if f.get('width') is not None else -1,
1130                 proto_preference,
1131                 ext_preference,
1132                 f.get('abr') if f.get('abr') is not None else -1,
1133                 audio_ext_preference,
1134                 f.get('fps') if f.get('fps') is not None else -1,
1135                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1136                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1137                 f.get('format_id') if f.get('format_id') is not None else '',
1138             )
1139         formats.sort(key=_formats_key)
1140
1141     def _check_formats(self, formats, video_id):
1142         if formats:
1143             formats[:] = filter(
1144                 lambda f: self._is_valid_url(
1145                     f['url'], video_id,
1146                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1147                 formats)
1148
1149     @staticmethod
1150     def _remove_duplicate_formats(formats):
1151         format_urls = set()
1152         unique_formats = []
1153         for f in formats:
1154             if f['url'] not in format_urls:
1155                 format_urls.add(f['url'])
1156                 unique_formats.append(f)
1157         formats[:] = unique_formats
1158
1159     def _is_valid_url(self, url, video_id, item='video', headers={}):
1160         url = self._proto_relative_url(url, scheme='http:')
1161         # For now assume non HTTP(S) URLs always valid
1162         if not (url.startswith('http://') or url.startswith('https://')):
1163             return True
1164         try:
1165             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1166             return True
1167         except ExtractorError as e:
1168             if isinstance(e.cause, compat_urllib_error.URLError):
1169                 self.to_screen(
1170                     '%s: %s URL is invalid, skipping' % (video_id, item))
1171                 return False
1172             raise
1173
1174     def http_scheme(self):
1175         """ Either "http:" or "https:", depending on the user's preferences """
1176         return (
1177             'http:'
1178             if self._downloader.params.get('prefer_insecure', False)
1179             else 'https:')
1180
1181     def _proto_relative_url(self, url, scheme=None):
1182         if url is None:
1183             return url
1184         if url.startswith('//'):
1185             if scheme is None:
1186                 scheme = self.http_scheme()
1187             return scheme + url
1188         else:
1189             return url
1190
1191     def _sleep(self, timeout, video_id, msg_template=None):
1192         if msg_template is None:
1193             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1194         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1195         self.to_screen(msg)
1196         time.sleep(timeout)
1197
1198     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1199                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1200                              fatal=True, m3u8_id=None):
1201         manifest = self._download_xml(
1202             manifest_url, video_id, 'Downloading f4m manifest',
1203             'Unable to download f4m manifest',
1204             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1205             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1206             transform_source=transform_source,
1207             fatal=fatal)
1208
1209         if manifest is False:
1210             return []
1211
1212         return self._parse_f4m_formats(
1213             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1214             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1215
1216     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1217                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1218                            fatal=True, m3u8_id=None):
1219         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1220         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1221         if akamai_pv is not None and ';' in akamai_pv.text:
1222             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1223             if playerVerificationChallenge.strip() != '':
1224                 return []
1225
1226         formats = []
1227         manifest_version = '1.0'
1228         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1229         if not media_nodes:
1230             manifest_version = '2.0'
1231             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1232         # Remove unsupported DRM protected media from final formats
1233         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1234         media_nodes = remove_encrypted_media(media_nodes)
1235         if not media_nodes:
1236             return formats
1237
1238         manifest_base_url = get_base_url(manifest)
1239
1240         bootstrap_info = xpath_element(
1241             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1242             'bootstrap info', default=None)
1243
1244         vcodec = None
1245         mime_type = xpath_text(
1246             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1247             'base URL', default=None)
1248         if mime_type and mime_type.startswith('audio/'):
1249             vcodec = 'none'
1250
1251         for i, media_el in enumerate(media_nodes):
1252             tbr = int_or_none(media_el.attrib.get('bitrate'))
1253             width = int_or_none(media_el.attrib.get('width'))
1254             height = int_or_none(media_el.attrib.get('height'))
1255             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1256             # If <bootstrapInfo> is present, the specified f4m is a
1257             # stream-level manifest, and only set-level manifests may refer to
1258             # external resources.  See section 11.4 and section 4 of F4M spec
1259             if bootstrap_info is None:
1260                 media_url = None
1261                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1262                 if manifest_version == '2.0':
1263                     media_url = media_el.attrib.get('href')
1264                 if media_url is None:
1265                     media_url = media_el.attrib.get('url')
1266                 if not media_url:
1267                     continue
1268                 manifest_url = (
1269                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1270                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1271                 # If media_url is itself a f4m manifest do the recursive extraction
1272                 # since bitrates in parent manifest (this one) and media_url manifest
1273                 # may differ leading to inability to resolve the format by requested
1274                 # bitrate in f4m downloader
1275                 ext = determine_ext(manifest_url)
1276                 if ext == 'f4m':
1277                     f4m_formats = self._extract_f4m_formats(
1278                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1279                         transform_source=transform_source, fatal=fatal)
1280                     # Sometimes stream-level manifest contains single media entry that
1281                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1282                     # At the same time parent's media entry in set-level manifest may
1283                     # contain it. We will copy it from parent in such cases.
1284                     if len(f4m_formats) == 1:
1285                         f = f4m_formats[0]
1286                         f.update({
1287                             'tbr': f.get('tbr') or tbr,
1288                             'width': f.get('width') or width,
1289                             'height': f.get('height') or height,
1290                             'format_id': f.get('format_id') if not tbr else format_id,
1291                             'vcodec': vcodec,
1292                         })
1293                     formats.extend(f4m_formats)
1294                     continue
1295                 elif ext == 'm3u8':
1296                     formats.extend(self._extract_m3u8_formats(
1297                         manifest_url, video_id, 'mp4', preference=preference,
1298                         m3u8_id=m3u8_id, fatal=fatal))
1299                     continue
1300             formats.append({
1301                 'format_id': format_id,
1302                 'url': manifest_url,
1303                 'manifest_url': manifest_url,
1304                 'ext': 'flv' if bootstrap_info is not None else None,
1305                 'protocol': 'f4m',
1306                 'tbr': tbr,
1307                 'width': width,
1308                 'height': height,
1309                 'vcodec': vcodec,
1310                 'preference': preference,
1311             })
1312         return formats
1313
1314     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1315         return {
1316             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1317             'url': m3u8_url,
1318             'ext': ext,
1319             'protocol': 'm3u8',
1320             'preference': preference - 100 if preference else -100,
1321             'resolution': 'multiple',
1322             'format_note': 'Quality selection URL',
1323         }
1324
1325     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1326                               entry_protocol='m3u8', preference=None,
1327                               m3u8_id=None, note=None, errnote=None,
1328                               fatal=True, live=False):
1329         res = self._download_webpage_handle(
1330             m3u8_url, video_id,
1331             note=note or 'Downloading m3u8 information',
1332             errnote=errnote or 'Failed to download m3u8 information',
1333             fatal=fatal)
1334
1335         if res is False:
1336             return []
1337
1338         m3u8_doc, urlh = res
1339         m3u8_url = urlh.geturl()
1340
1341         return self._parse_m3u8_formats(
1342             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1343             preference=preference, m3u8_id=m3u8_id, live=live)
1344
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """
        Parse formats from the text of an m3u8 (HLS) playlist.

        m3u8_doc is the playlist text and m3u8_url the URL it came from;
        non-http(s) URIs found in the playlist are resolved against it.

        Returns a list of format dicts:
         * an empty list if the playlist carries an Adobe Flash Access
           (#EXT-X-FAXS-CM) tag or an Apple FairPlay (skd://) session key
         * a single format pointing at m3u8_url itself if this is a media
           playlist (contains #EXT-X-TARGETDURATION)
         * otherwise one format per VIDEO/AUDIO #EXT-X-MEDIA rendition
           that has a URI, plus one format per variant stream URI line,
           described by the #EXT-X-STREAM-INF tag preceding it

        ext, entry_protocol and preference are copied into every format.
        m3u8_id, when given, prefixes every format_id.  With live set the
        stream name/bitrate is not appended to variant format_ids.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Absolute http(s) URLs are kept as is, anything else is resolved
        # against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without quality renditions.
        # Fortunately, a master playlist can be easily distinguished from a media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in a master playlist thus we can
        # clearly detect a media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # Rendition groups seen so far: GROUP-ID -> list of EXT-X-MEDIA
        # attribute dicts, in order of appearance
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag that has not been
        # consumed by a URI line yet
        last_stream_inf = {}

        # Record an EXT-X-MEDIA rendition in its group and, for VIDEO/AUDIO
        # renditions that have their own URI, add a format for it
        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        # Pick a human readable name for the variant stream described by
        # last_stream_inf (returns None when nothing suitable is available)
        def build_stream_name():
            # Although the specification does not mention a NAME attribute
            # for the EXT-X-STREAM-INF tag it may still sometimes be present
            # (see [1] or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from the corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is treated as a variant stream URI
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform: audio (and optionally video)
                # bitrates are taken from the stream URL itself
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains an EXT-X-STREAM-INF tag which references an AUDIO
                # rendition group but does not have CODECS and, despite
                # referencing an AUDIO rendition group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                # The tag has been consumed by this URI line
                last_stream_inf = {}
        return formats
1503
1504     @staticmethod
1505     def _xpath_ns(path, namespace=None):
1506         if not namespace:
1507             return path
1508         out = []
1509         for c in path.split('/'):
1510             if not c or c == '.':
1511                 out.append(c)
1512             else:
1513                 out.append('{%s}%s' % (namespace, c))
1514         return '/'.join(out)
1515
1516     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1517         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1518
1519         if smil is False:
1520             assert not fatal
1521             return []
1522
1523         namespace = self._parse_smil_namespace(smil)
1524
1525         return self._parse_smil_formats(
1526             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1527
1528     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1529         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1530         if smil is False:
1531             return {}
1532         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1533
1534     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1535         return self._download_xml(
1536             smil_url, video_id, 'Downloading SMIL file',
1537             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1538
1539     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1540         namespace = self._parse_smil_namespace(smil)
1541
1542         formats = self._parse_smil_formats(
1543             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1544         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1545
1546         video_id = os.path.splitext(url_basename(smil_url))[0]
1547         title = None
1548         description = None
1549         upload_date = None
1550         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1551             name = meta.attrib.get('name')
1552             content = meta.attrib.get('content')
1553             if not name or not content:
1554                 continue
1555             if not title and name == 'title':
1556                 title = content
1557             elif not description and name in ('description', 'abstract'):
1558                 description = content
1559             elif not upload_date and name == 'date':
1560                 upload_date = unified_strdate(content)
1561
1562         thumbnails = [{
1563             'id': image.get('type'),
1564             'url': image.get('src'),
1565             'width': int_or_none(image.get('width')),
1566             'height': int_or_none(image.get('height')),
1567         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1568
1569         return {
1570             'id': video_id,
1571             'title': title or video_id,
1572             'description': description,
1573             'upload_date': upload_date,
1574             'thumbnails': thumbnails,
1575             'formats': formats,
1576             'subtitles': subtitles,
1577         }
1578
1579     def _parse_smil_namespace(self, smil):
1580         return self._search_regex(
1581             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1582
1583     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1584         base = smil_url
1585         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1586             b = meta.get('base') or meta.get('httpBase')
1587             if b:
1588                 base = b
1589                 break
1590
1591         formats = []
1592         rtmp_count = 0
1593         http_count = 0
1594         m3u8_count = 0
1595
1596         srcs = []
1597         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1598         for medium in media:
1599             src = medium.get('src')
1600             if not src or src in srcs:
1601                 continue
1602             srcs.append(src)
1603
1604             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1605             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1606             width = int_or_none(medium.get('width'))
1607             height = int_or_none(medium.get('height'))
1608             proto = medium.get('proto')
1609             ext = medium.get('ext')
1610             src_ext = determine_ext(src)
1611             streamer = medium.get('streamer') or base
1612
1613             if proto == 'rtmp' or streamer.startswith('rtmp'):
1614                 rtmp_count += 1
1615                 formats.append({
1616                     'url': streamer,
1617                     'play_path': src,
1618                     'ext': 'flv',
1619                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1620                     'tbr': bitrate,
1621                     'filesize': filesize,
1622                     'width': width,
1623                     'height': height,
1624                 })
1625                 if transform_rtmp_url:
1626                     streamer, src = transform_rtmp_url(streamer, src)
1627                     formats[-1].update({
1628                         'url': streamer,
1629                         'play_path': src,
1630                     })
1631                 continue
1632
1633             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1634             src_url = src_url.strip()
1635
1636             if proto == 'm3u8' or src_ext == 'm3u8':
1637                 m3u8_formats = self._extract_m3u8_formats(
1638                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1639                 if len(m3u8_formats) == 1:
1640                     m3u8_count += 1
1641                     m3u8_formats[0].update({
1642                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1643                         'tbr': bitrate,
1644                         'width': width,
1645                         'height': height,
1646                     })
1647                 formats.extend(m3u8_formats)
1648                 continue
1649
1650             if src_ext == 'f4m':
1651                 f4m_url = src_url
1652                 if not f4m_params:
1653                     f4m_params = {
1654                         'hdcore': '3.2.0',
1655                         'plugin': 'flowplayer-3.2.0.1',
1656                     }
1657                 f4m_url += '&' if '?' in f4m_url else '?'
1658                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1659                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1660                 continue
1661
1662             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1663                 http_count += 1
1664                 formats.append({
1665                     'url': src_url,
1666                     'ext': ext or src_ext or 'flv',
1667                     'format_id': 'http-%d' % (bitrate or http_count),
1668                     'tbr': bitrate,
1669                     'filesize': filesize,
1670                     'width': width,
1671                     'height': height,
1672                 })
1673                 continue
1674
1675         return formats
1676
1677     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1678         urls = []
1679         subtitles = {}
1680         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1681             src = textstream.get('src')
1682             if not src or src in urls:
1683                 continue
1684             urls.append(src)
1685             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1686             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1687             subtitles.setdefault(lang, []).append({
1688                 'url': src,
1689                 'ext': ext,
1690             })
1691         return subtitles
1692
1693     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1694         xspf = self._download_xml(
1695             playlist_url, playlist_id, 'Downloading xpsf playlist',
1696             'Unable to download xspf manifest', fatal=fatal)
1697         if xspf is False:
1698             return []
1699         return self._parse_xspf(xspf, playlist_id)
1700
1701     def _parse_xspf(self, playlist, playlist_id):
1702         NS_MAP = {
1703             'xspf': 'http://xspf.org/ns/0/',
1704             's1': 'http://static.streamone.nl/player/ns/0',
1705         }
1706
1707         entries = []
1708         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1709             title = xpath_text(
1710                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1711             description = xpath_text(
1712                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1713             thumbnail = xpath_text(
1714                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1715             duration = float_or_none(
1716                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1717
1718             formats = [{
1719                 'url': location.text,
1720                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1721                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1722                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1723             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1724             self._sort_formats(formats)
1725
1726             entries.append({
1727                 'id': playlist_id,
1728                 'title': title,
1729                 'description': description,
1730                 'thumbnail': thumbnail,
1731                 'duration': duration,
1732                 'formats': formats,
1733             })
1734         return entries
1735
1736     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1737         res = self._download_webpage_handle(
1738             mpd_url, video_id,
1739             note=note or 'Downloading MPD manifest',
1740             errnote=errnote or 'Failed to download MPD manifest',
1741             fatal=fatal)
1742         if res is False:
1743             return []
1744         mpd, urlh = res
1745         mpd_base_url = base_url(urlh.geturl())
1746
1747         return self._parse_mpd_formats(
1748             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1749             formats_dict=formats_dict, mpd_url=mpd_url)
1750
1751     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1752         """
1753         Parse formats from MPD manifest.
1754         References:
1755          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1756             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1757          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1758         """
1759         if mpd_doc.get('type') == 'dynamic':
1760             return []
1761
1762         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1763
1764         def _add_ns(path):
1765             return self._xpath_ns(path, namespace)
1766
1767         def is_drm_protected(element):
1768             return element.find(_add_ns('ContentProtection')) is not None
1769
1770         def extract_multisegment_info(element, ms_parent_info):
1771             ms_info = ms_parent_info.copy()
1772
1773             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1774             # common attributes and elements.  We will only extract relevant
1775             # for us.
1776             def extract_common(source):
1777                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1778                 if segment_timeline is not None:
1779                     s_e = segment_timeline.findall(_add_ns('S'))
1780                     if s_e:
1781                         ms_info['total_number'] = 0
1782                         ms_info['s'] = []
1783                         for s in s_e:
1784                             r = int(s.get('r', 0))
1785                             ms_info['total_number'] += 1 + r
1786                             ms_info['s'].append({
1787                                 't': int(s.get('t', 0)),
1788                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1789                                 'd': int(s.attrib['d']),
1790                                 'r': r,
1791                             })
1792                 start_number = source.get('startNumber')
1793                 if start_number:
1794                     ms_info['start_number'] = int(start_number)
1795                 timescale = source.get('timescale')
1796                 if timescale:
1797                     ms_info['timescale'] = int(timescale)
1798                 segment_duration = source.get('duration')
1799                 if segment_duration:
1800                     ms_info['segment_duration'] = float(segment_duration)
1801
1802             def extract_Initialization(source):
1803                 initialization = source.find(_add_ns('Initialization'))
1804                 if initialization is not None:
1805                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1806
1807             segment_list = element.find(_add_ns('SegmentList'))
1808             if segment_list is not None:
1809                 extract_common(segment_list)
1810                 extract_Initialization(segment_list)
1811                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1812                 if segment_urls_e:
1813                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1814             else:
1815                 segment_template = element.find(_add_ns('SegmentTemplate'))
1816                 if segment_template is not None:
1817                     extract_common(segment_template)
1818                     media = segment_template.get('media')
1819                     if media:
1820                         ms_info['media'] = media
1821                     initialization = segment_template.get('initialization')
1822                     if initialization:
1823                         ms_info['initialization'] = initialization
1824                     else:
1825                         extract_Initialization(segment_template)
1826             return ms_info
1827
1828         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1829         formats = []
1830         for period in mpd_doc.findall(_add_ns('Period')):
1831             period_duration = parse_duration(period.get('duration')) or mpd_duration
1832             period_ms_info = extract_multisegment_info(period, {
1833                 'start_number': 1,
1834                 'timescale': 1,
1835             })
1836             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1837                 if is_drm_protected(adaptation_set):
1838                     continue
1839                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1840                 for representation in adaptation_set.findall(_add_ns('Representation')):
1841                     if is_drm_protected(representation):
1842                         continue
1843                     representation_attrib = adaptation_set.attrib.copy()
1844                     representation_attrib.update(representation.attrib)
1845                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1846                     mime_type = representation_attrib['mimeType']
1847                     content_type = mime_type.split('/')[0]
1848                     if content_type == 'text':
1849                         # TODO implement WebVTT downloading
1850                         pass
1851                     elif content_type in ('video', 'audio'):
1852                         base_url = ''
1853                         for element in (representation, adaptation_set, period, mpd_doc):
1854                             base_url_e = element.find(_add_ns('BaseURL'))
1855                             if base_url_e is not None:
1856                                 base_url = base_url_e.text + base_url
1857                                 if re.match(r'^https?://', base_url):
1858                                     break
1859                         if mpd_base_url and not re.match(r'^https?://', base_url):
1860                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1861                                 mpd_base_url += '/'
1862                             base_url = mpd_base_url + base_url
1863                         representation_id = representation_attrib.get('id')
1864                         lang = representation_attrib.get('lang')
1865                         url_el = representation.find(_add_ns('BaseURL'))
1866                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1867                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1868                         f = {
1869                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1870                             'url': base_url,
1871                             'manifest_url': mpd_url,
1872                             'ext': mimetype2ext(mime_type),
1873                             'width': int_or_none(representation_attrib.get('width')),
1874                             'height': int_or_none(representation_attrib.get('height')),
1875                             'tbr': float_or_none(bandwidth, 1000),
1876                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1877                             'fps': int_or_none(representation_attrib.get('frameRate')),
1878                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1879                             'format_note': 'DASH %s' % content_type,
1880                             'filesize': filesize,
1881                         }
1882                         f.update(parse_codecs(representation_attrib.get('codecs')))
1883                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1884
1885                         def prepare_template(template_name, identifiers):
1886                             t = representation_ms_info[template_name]
1887                             t = t.replace('$RepresentationID$', representation_id)
1888                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1889                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1890                             t.replace('$$', '$')
1891                             return t
1892
1893                         # @initialization is a regular template like @media one
1894                         # so it should be handled just the same way (see
1895                         # https://github.com/rg3/youtube-dl/issues/11605)
1896                         if 'initialization' in representation_ms_info:
1897                             initialization_template = prepare_template(
1898                                 'initialization',
1899                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1900                                 # $Time$ shall not be included for @initialization thus
1901                                 # only $Bandwidth$ remains
1902                                 ('Bandwidth', ))
1903                             representation_ms_info['initialization_url'] = initialization_template % {
1904                                 'Bandwidth': bandwidth,
1905                             }
1906
1907                         def location_key(location):
1908                             return 'url' if re.match(r'^https?://', location) else 'path'
1909
1910                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1911
1912                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1913                             media_location_key = location_key(media_template)
1914
1915                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1916                             # can't be used at the same time
1917                             if '%(Number' in media_template and 's' not in representation_ms_info:
1918                                 segment_duration = None
1919                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1920                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1921                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1922                                 representation_ms_info['fragments'] = [{
1923                                     media_location_key: media_template % {
1924                                         'Number': segment_number,
1925                                         'Bandwidth': bandwidth,
1926                                     },
1927                                     'duration': segment_duration,
1928                                 } for segment_number in range(
1929                                     representation_ms_info['start_number'],
1930                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1931                             else:
1932                                 # $Number*$ or $Time$ in media template with S list available
1933                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1934                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1935                                 representation_ms_info['fragments'] = []
1936                                 segment_time = 0
1937                                 segment_d = None
1938                                 segment_number = representation_ms_info['start_number']
1939
1940                                 def add_segment_url():
1941                                     segment_url = media_template % {
1942                                         'Time': segment_time,
1943                                         'Bandwidth': bandwidth,
1944                                         'Number': segment_number,
1945                                     }
1946                                     representation_ms_info['fragments'].append({
1947                                         media_location_key: segment_url,
1948                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1949                                     })
1950
1951                                 for num, s in enumerate(representation_ms_info['s']):
1952                                     segment_time = s.get('t') or segment_time
1953                                     segment_d = s['d']
1954                                     add_segment_url()
1955                                     segment_number += 1
1956                                     for r in range(s.get('r', 0)):
1957                                         segment_time += segment_d
1958                                         add_segment_url()
1959                                         segment_number += 1
1960                                     segment_time += segment_d
1961                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1962                             # No media template
1963                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1964                             # or any YouTube dashsegments video
1965                             fragments = []
1966                             segment_index = 0
1967                             timescale = representation_ms_info['timescale']
1968                             for s in representation_ms_info['s']:
1969                                 duration = float_or_none(s['d'], timescale)
1970                                 for r in range(s.get('r', 0) + 1):
1971                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
1972                                     fragments.append({
1973                                         location_key(segment_uri): segment_uri,
1974                                         'duration': duration,
1975                                     })
1976                                     segment_index += 1
1977                             representation_ms_info['fragments'] = fragments
1978                         elif 'segment_urls' in representation_ms_info:
1979                             # Segment URLs with no SegmentTimeline
1980                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
1981                             fragments = []
1982                             segment_duration = float_or_none(
1983                                 representation_ms_info['segment_duration'],
1984                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
1985                             for segment_url in representation_ms_info['segment_urls']:
1986                                 fragment = {
1987                                     location_key(segment_url): segment_url,
1988                                 }
1989                                 if segment_duration:
1990                                     fragment['duration'] = segment_duration
1991                                 fragments.append(fragment)
1992                             representation_ms_info['fragments'] = fragments
1993                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1994                         # No fragments key is present in this case.
1995                         if 'fragments' in representation_ms_info:
1996                             f.update({
1997                                 'fragment_base_url': base_url,
1998                                 'fragments': [],
1999                                 'protocol': 'http_dash_segments',
2000                             })
2001                             if 'initialization_url' in representation_ms_info:
2002                                 initialization_url = representation_ms_info['initialization_url']
2003                                 if not f.get('url'):
2004                                     f['url'] = initialization_url
2005                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2006                             f['fragments'].extend(representation_ms_info['fragments'])
2007                         try:
2008                             existing_format = next(
2009                                 fo for fo in formats
2010                                 if fo['format_id'] == representation_id)
2011                         except StopIteration:
2012                             full_info = formats_dict.get(representation_id, {}).copy()
2013                             full_info.update(f)
2014                             formats.append(full_info)
2015                         else:
2016                             existing_format.update(f)
2017                     else:
2018                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2019         return formats
2020
2021     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2022         res = self._download_webpage_handle(
2023             ism_url, video_id,
2024             note=note or 'Downloading ISM manifest',
2025             errnote=errnote or 'Failed to download ISM manifest',
2026             fatal=fatal)
2027         if res is False:
2028             return []
2029         ism, urlh = res
2030
2031         return self._parse_ism_formats(
2032             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
2033
2034     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2035         """
2036         Parse formats from ISM manifest.
2037         References:
2038          1. [MS-SSTR]: Smooth Streaming Protocol,
2039             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2040         """
2041         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2042             return []
2043
2044         duration = int(ism_doc.attrib['Duration'])
2045         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2046
2047         formats = []
2048         for stream in ism_doc.findall('StreamIndex'):
2049             stream_type = stream.get('Type')
2050             if stream_type not in ('video', 'audio'):
2051                 continue
2052             url_pattern = stream.attrib['Url']
2053             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2054             stream_name = stream.get('Name')
2055             for track in stream.findall('QualityLevel'):
2056                 fourcc = track.get('FourCC')
2057                 # TODO: add support for WVC1 and WMAP
2058                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2059                     self.report_warning('%s is not a supported codec' % fourcc)
2060                     continue
2061                 tbr = int(track.attrib['Bitrate']) // 1000
2062                 # [1] does not mention Width and Height attributes. However,
2063                 # they're often present while MaxWidth and MaxHeight are
2064                 # missing, so should be used as fallbacks
2065                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2066                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2067                 sampling_rate = int_or_none(track.get('SamplingRate'))
2068
2069                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2070                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2071
2072                 fragments = []
2073                 fragment_ctx = {
2074                     'time': 0,
2075                 }
2076                 stream_fragments = stream.findall('c')
2077                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2078                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2079                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2080                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2081                     if not fragment_ctx['duration']:
2082                         try:
2083                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2084                         except IndexError:
2085                             next_fragment_time = duration
2086                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2087                     for _ in range(fragment_repeat):
2088                         fragments.append({
2089                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2090                             'duration': fragment_ctx['duration'] / stream_timescale,
2091                         })
2092                         fragment_ctx['time'] += fragment_ctx['duration']
2093
2094                 format_id = []
2095                 if ism_id:
2096                     format_id.append(ism_id)
2097                 if stream_name:
2098                     format_id.append(stream_name)
2099                 format_id.append(compat_str(tbr))
2100
2101                 formats.append({
2102                     'format_id': '-'.join(format_id),
2103                     'url': ism_url,
2104                     'manifest_url': ism_url,
2105                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2106                     'width': width,
2107                     'height': height,
2108                     'tbr': tbr,
2109                     'asr': sampling_rate,
2110                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2111                     'acodec': 'none' if stream_type == 'video' else fourcc,
2112                     'protocol': 'ism',
2113                     'fragments': fragments,
2114                     '_download_params': {
2115                         'duration': duration,
2116                         'timescale': stream_timescale,
2117                         'width': width or 0,
2118                         'height': height or 0,
2119                         'fourcc': fourcc,
2120                         'codec_private_data': track.get('CodecPrivateData'),
2121                         'sampling_rate': sampling_rate,
2122                         'channels': int_or_none(track.get('Channels', 2)),
2123                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2124                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2125                     },
2126                 })
2127         return formats
2128
2129     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2130         def absolute_url(video_url):
2131             return compat_urlparse.urljoin(base_url, video_url)
2132
2133         def parse_content_type(content_type):
2134             if not content_type:
2135                 return {}
2136             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2137             if ctr:
2138                 mimetype, codecs = ctr.groups()
2139                 f = parse_codecs(codecs)
2140                 f['ext'] = mimetype2ext(mimetype)
2141                 return f
2142             return {}
2143
2144         def _media_formats(src, cur_media_type, type_info={}):
2145             full_url = absolute_url(src)
2146             ext = type_info.get('ext') or determine_ext(full_url)
2147             if ext == 'm3u8':
2148                 is_plain_url = False
2149                 formats = self._extract_m3u8_formats(
2150                     full_url, video_id, ext='mp4',
2151                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2152                     preference=preference, fatal=False)
2153             elif ext == 'mpd':
2154                 is_plain_url = False
2155                 formats = self._extract_mpd_formats(
2156                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2157             else:
2158                 is_plain_url = True
2159                 formats = [{
2160                     'url': full_url,
2161                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2162                 }]
2163             return is_plain_url, formats
2164
2165         entries = []
2166         # amp-video and amp-audio are very similar to their HTML5 counterparts
2167         # so we wll include them right here (see
2168         # https://www.ampproject.org/docs/reference/components/amp-video)
2169         media_tags = [(media_tag, media_type, '')
2170                       for media_tag, media_type
2171                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2172         media_tags.extend(re.findall(
2173             # We only allow video|audio followed by a whitespace or '>'.
2174             # Allowing more characters may end up in significant slow down (see
2175             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2176             # http://www.porntrex.com/maps/videositemap.xml).
2177             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2178         for media_tag, media_type, media_content in media_tags:
2179             media_info = {
2180                 'formats': [],
2181                 'subtitles': {},
2182             }
2183             media_attributes = extract_attributes(media_tag)
2184             src = media_attributes.get('src')
2185             if src:
2186                 _, formats = _media_formats(src, media_type)
2187                 media_info['formats'].extend(formats)
2188             media_info['thumbnail'] = media_attributes.get('poster')
2189             if media_content:
2190                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2191                     source_attributes = extract_attributes(source_tag)
2192                     src = source_attributes.get('src')
2193                     if not src:
2194                         continue
2195                     f = parse_content_type(source_attributes.get('type'))
2196                     is_plain_url, formats = _media_formats(src, media_type, f)
2197                     if is_plain_url:
2198                         # res attribute is not standard but seen several times
2199                         # in the wild
2200                         f.update({
2201                             'height': int_or_none(source_attributes.get('res')),
2202                             'format_id': source_attributes.get('label'),
2203                         })
2204                         f.update(formats[0])
2205                         media_info['formats'].append(f)
2206                     else:
2207                         media_info['formats'].extend(formats)
2208                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2209                     track_attributes = extract_attributes(track_tag)
2210                     kind = track_attributes.get('kind')
2211                     if not kind or kind in ('subtitles', 'captions'):
2212                         src = track_attributes.get('src')
2213                         if not src:
2214                             continue
2215                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2216                         media_info['subtitles'].setdefault(lang, []).append({
2217                             'url': absolute_url(src),
2218                         })
2219             if media_info['formats'] or media_info['subtitles']:
2220                 entries.append(media_info)
2221         return entries
2222
2223     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2224         formats = []
2225         hdcore_sign = 'hdcore=3.7.0'
2226         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2227         hds_host = hosts.get('hds')
2228         if hds_host:
2229             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2230         if 'hdcore=' not in f4m_url:
2231             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2232         f4m_formats = self._extract_f4m_formats(
2233             f4m_url, video_id, f4m_id='hds', fatal=False)
2234         for entry in f4m_formats:
2235             entry.update({'extra_param_to_segment_url': hdcore_sign})
2236         formats.extend(f4m_formats)
2237         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2238         hls_host = hosts.get('hls')
2239         if hls_host:
2240             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2241         formats.extend(self._extract_m3u8_formats(
2242             m3u8_url, video_id, 'mp4', 'm3u8_native',
2243             m3u8_id='hls', fatal=False))
2244         return formats
2245
2246     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2247         query = compat_urlparse.urlparse(url).query
2248         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2249         url_base = self._search_regex(
2250             r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
2251         http_base_url = '%s:%s' % ('http', url_base)
2252         formats = []
2253
2254         def manifest_url(manifest):
2255             m_url = '%s/%s' % (http_base_url, manifest)
2256             if query:
2257                 m_url += '?%s' % query
2258             return m_url
2259
2260         if 'm3u8' not in skip_protocols:
2261             formats.extend(self._extract_m3u8_formats(
2262                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2263                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2264         if 'f4m' not in skip_protocols:
2265             formats.extend(self._extract_f4m_formats(
2266                 manifest_url('manifest.f4m'),
2267                 video_id, f4m_id='hds', fatal=False))
2268         if 'dash' not in skip_protocols:
2269             formats.extend(self._extract_mpd_formats(
2270                 manifest_url('manifest.mpd'),
2271                 video_id, mpd_id='dash', fatal=False))
2272         if re.search(r'(?:/smil:|\.smil)', url_base):
2273             if 'smil' not in skip_protocols:
2274                 rtmp_formats = self._extract_smil_formats(
2275                     manifest_url('jwplayer.smil'),
2276                     video_id, fatal=False)
2277                 for rtmp_format in rtmp_formats:
2278                     rtsp_format = rtmp_format.copy()
2279                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2280                     del rtsp_format['play_path']
2281                     del rtsp_format['ext']
2282                     rtsp_format.update({
2283                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2284                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2285                         'protocol': 'rtsp',
2286                     })
2287                     formats.extend([rtmp_format, rtsp_format])
2288         else:
2289             for protocol in ('rtmp', 'rtsp'):
2290                 if protocol not in skip_protocols:
2291                     formats.append({
2292                         'url': '%s:%s' % (protocol, url_base),
2293                         'format_id': protocol,
2294                         'protocol': protocol,
2295                     })
2296         return formats
2297
2298     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2299         mobj = re.search(
2300             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2301             webpage)
2302         if mobj:
2303             try:
2304                 jwplayer_data = self._parse_json(mobj.group('options'),
2305                                                  video_id=video_id,
2306                                                  transform_source=transform_source)
2307             except ExtractorError:
2308                 pass
2309             else:
2310                 if isinstance(jwplayer_data, dict):
2311                     return jwplayer_data
2312
2313     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2314         jwplayer_data = self._find_jwplayer_data(
2315             webpage, video_id, transform_source=js_to_json)
2316         return self._parse_jwplayer_data(
2317             jwplayer_data, video_id, *args, **kwargs)
2318
2319     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2320                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2321         # JWPlayer backward compatibility: flattened playlists
2322         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2323         if 'playlist' not in jwplayer_data:
2324             jwplayer_data = {'playlist': [jwplayer_data]}
2325
2326         entries = []
2327
2328         # JWPlayer backward compatibility: single playlist item
2329         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2330         if not isinstance(jwplayer_data['playlist'], list):
2331             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2332
2333         for video_data in jwplayer_data['playlist']:
2334             # JWPlayer backward compatibility: flattened sources
2335             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2336             if 'sources' not in video_data:
2337                 video_data['sources'] = [video_data]
2338
2339             this_video_id = video_id or video_data['mediaid']
2340
2341             formats = self._parse_jwplayer_formats(
2342                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2343                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2344
2345             subtitles = {}
2346             tracks = video_data.get('tracks')
2347             if tracks and isinstance(tracks, list):
2348                 for track in tracks:
2349                     if not isinstance(track, dict):
2350                         continue
2351                     if track.get('kind') != 'captions':
2352                         continue
2353                     track_url = urljoin(base_url, track.get('file'))
2354                     if not track_url:
2355                         continue
2356                     subtitles.setdefault(track.get('label') or 'en', []).append({
2357                         'url': self._proto_relative_url(track_url)
2358                     })
2359
2360             entry = {
2361                 'id': this_video_id,
2362                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2363                 'description': video_data.get('description'),
2364                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2365                 'timestamp': int_or_none(video_data.get('pubdate')),
2366                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2367                 'subtitles': subtitles,
2368             }
2369             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2370             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2371                 entry.update({
2372                     '_type': 'url_transparent',
2373                     'url': formats[0]['url'],
2374                 })
2375             else:
2376                 self._sort_formats(formats)
2377                 entry['formats'] = formats
2378             entries.append(entry)
2379         if len(entries) == 1:
2380             return entries[0]
2381         else:
2382             return self.playlist_result(entries)
2383
2384     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2385                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2386         urls = []
2387         formats = []
2388         for source in jwplayer_sources_data:
2389             if not isinstance(source, dict):
2390                 continue
2391             source_url = self._proto_relative_url(source.get('file'))
2392             if not source_url:
2393                 continue
2394             if base_url:
2395                 source_url = compat_urlparse.urljoin(base_url, source_url)
2396             if source_url in urls:
2397                 continue
2398             urls.append(source_url)
2399             source_type = source.get('type') or ''
2400             ext = mimetype2ext(source_type) or determine_ext(source_url)
2401             if source_type == 'hls' or ext == 'm3u8':
2402                 formats.extend(self._extract_m3u8_formats(
2403                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2404                     m3u8_id=m3u8_id, fatal=False))
2405             elif ext == 'mpd':
2406                 formats.extend(self._extract_mpd_formats(
2407                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2408             elif ext == 'smil':
2409                 formats.extend(self._extract_smil_formats(
2410                     source_url, video_id, fatal=False))
2411             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2412             elif source_type.startswith('audio') or ext in (
2413                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2414                 formats.append({
2415                     'url': source_url,
2416                     'vcodec': 'none',
2417                     'ext': ext,
2418                 })
2419             else:
2420                 height = int_or_none(source.get('height'))
2421                 if height is None:
2422                     # Often no height is provided but there is a label in
2423                     # format like "1080p", "720p SD", or 1080.
2424                     height = int_or_none(self._search_regex(
2425                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2426                         'height', default=None))
2427                 a_format = {
2428                     'url': source_url,
2429                     'width': int_or_none(source.get('width')),
2430                     'height': height,
2431                     'tbr': int_or_none(source.get('bitrate')),
2432                     'ext': ext,
2433                 }
2434                 if source_url.startswith('rtmp'):
2435                     a_format['ext'] = 'flv'
2436                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2437                     # of jwplayer.flash.swf
2438                     rtmp_url_parts = re.split(
2439                         r'((?:mp4|mp3|flv):)', source_url, 1)
2440                     if len(rtmp_url_parts) == 3:
2441                         rtmp_url, prefix, play_path = rtmp_url_parts
2442                         a_format.update({
2443                             'url': rtmp_url,
2444                             'play_path': prefix + play_path,
2445                         })
2446                     if rtmp_params:
2447                         a_format.update(rtmp_params)
2448                 formats.append(a_format)
2449         return formats
2450
2451     def _live_title(self, name):
2452         """ Generate the title for a live video """
2453         now = datetime.datetime.now()
2454         now_str = now.strftime('%Y-%m-%d %H:%M')
2455         return name + ' ' + now_str
2456
2457     def _int(self, v, name, fatal=False, **kwargs):
2458         res = int_or_none(v, **kwargs)
2459         if 'get_attr' in kwargs:
2460             print(getattr(v, kwargs['get_attr']))
2461         if res is None:
2462             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2463             if fatal:
2464                 raise ExtractorError(msg)
2465             else:
2466                 self._downloader.report_warning(msg)
2467         return res
2468
2469     def _float(self, v, name, fatal=False, **kwargs):
2470         res = float_or_none(v, **kwargs)
2471         if res is None:
2472             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2473             if fatal:
2474                 raise ExtractorError(msg)
2475             else:
2476                 self._downloader.report_warning(msg)
2477         return res
2478
2479     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2480                     path='/', secure=False, discard=False, rest={}, **kwargs):
2481         cookie = compat_cookiejar.Cookie(
2482             0, name, value, port, port is not None, domain, True,
2483             domain.startswith('.'), path, True, secure, expire_time,
2484             discard, None, None, rest)
2485         self._downloader.cookiejar.set_cookie(cookie)
2486
2487     def _get_cookies(self, url):
2488         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2489         req = sanitized_Request(url)
2490         self._downloader.cookiejar.add_cookie_header(req)
2491         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2492
2493     def get_testcases(self, include_onlymatching=False):
2494         t = getattr(self, '_TEST', None)
2495         if t:
2496             assert not hasattr(self, '_TESTS'), \
2497                 '%s has _TEST and _TESTS' % type(self).__name__
2498             tests = [t]
2499         else:
2500             tests = getattr(self, '_TESTS', [])
2501         for t in tests:
2502             if not include_onlymatching and t.get('only_matching', False):
2503                 continue
2504             t['name'] = type(self).__name__[:-len('IE')]
2505             yield t
2506
2507     def is_suitable(self, age_limit):
2508         """ Test whether the extractor is generally suitable for the given
2509         age limit (i.e. pornographic sites are not, all others usually are) """
2510
2511         any_restricted = False
2512         for tc in self.get_testcases(include_onlymatching=False):
2513             if tc.get('playlist', []):
2514                 tc = tc['playlist'][0]
2515             is_restricted = age_restricted(
2516                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2517             if not is_restricted:
2518                 return True
2519             any_restricted = any_restricted or is_restricted
2520         return not any_restricted
2521
2522     def extract_subtitles(self, *args, **kwargs):
2523         if (self._downloader.params.get('writesubtitles', False) or
2524                 self._downloader.params.get('listsubtitles')):
2525             return self._get_subtitles(*args, **kwargs)
2526         return {}
2527
2528     def _get_subtitles(self, *args, **kwargs):
2529         raise NotImplementedError('This method must be implemented by subclasses')
2530
2531     @staticmethod
2532     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2533         """ Merge subtitle items for one language. Items with duplicated URLs
2534         will be dropped. """
2535         list1_urls = set([item['url'] for item in subtitle_list1])
2536         ret = list(subtitle_list1)
2537         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2538         return ret
2539
2540     @classmethod
2541     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2542         """ Merge two subtitle dictionaries, language by language. """
2543         ret = dict(subtitle_dict1)
2544         for lang in subtitle_dict2:
2545             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2546         return ret
2547
2548     def extract_automatic_captions(self, *args, **kwargs):
2549         if (self._downloader.params.get('writeautomaticsub', False) or
2550                 self._downloader.params.get('listsubtitles')):
2551             return self._get_automatic_captions(*args, **kwargs)
2552         return {}
2553
2554     def _get_automatic_captions(self, *args, **kwargs):
2555         raise NotImplementedError('This method must be implemented by subclasses')
2556
2557     def mark_watched(self, *args, **kwargs):
2558         if (self._downloader.params.get('mark_watched', False) and
2559                 (self._get_login_info()[0] is not None or
2560                     self._downloader.params.get('cookiefile') is not None)):
2561             self._mark_watched(*args, **kwargs)
2562
2563     def _mark_watched(self, *args, **kwargs):
2564         raise NotImplementedError('This method must be implemented by subclasses')
2565
2566     def geo_verification_headers(self):
2567         headers = {}
2568         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2569         if geo_verification_proxy:
2570             headers['Ytdl-request-proxy'] = geo_verification_proxy
2571         return headers
2572
2573     def _generic_id(self, url):
2574         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2575
2576     def _generic_title(self, url):
2577         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2578
2579
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    where an empty prefix asks for one result, 'all' for _MAX_RESULTS
    results and a number for that many results.
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """ Return the regex pattern for this extractor's search queries;
        it captures the groups 'prefix' and 'query' """
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """ Tell whether url matches this extractor's search query pattern """
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """
        Parse a search query and delegate to _get_n_results.

        Raises ExtractorError when the query does not match the pattern
        or the requested number is not positive. Requests above
        _MAX_RESULTS are reduced to _MAX_RESULTS after a warning.
        """
        matched = re.match(self._make_valid_url(), query)
        if matched is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = matched.group('prefix', 'query')
        if prefix == '':
            # No count given: fetch a single result.
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (requested, query))
        if requested > self._MAX_RESULTS:
            self._downloader.report_warning(
                '%s returns max %i results (you requested %i)'
                % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(query, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """ Read-only access to _SEARCH_KEY """
        return self._SEARCH_KEY