_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30     compat_xml_parse_error,
  31 )
  32 from ..downloader.f4m import (
  33     get_base_url,
  34     remove_encrypted_media,
  35 )
  36 from ..utils import (
  37     NO_DEFAULT,
  38     age_restricted,
  39     base_url,
  40     bug_reports_message,
  41     clean_html,
  42     compiled_regex_type,
  43     determine_ext,
  44     determine_protocol,
  45     error_to_compat_str,
  46     ExtractorError,
  47     extract_attributes,
  48     fix_xml_ampersands,
  49     float_or_none,
  50     GeoRestrictedError,
  51     GeoUtils,
  52     int_or_none,
  53     js_to_json,
  54     mimetype2ext,
  55     orderedSet,
  56     parse_codecs,
  57     parse_duration,
  58     parse_iso8601,
  59     parse_m3u8_attributes,
  60     RegexNotFoundError,
  61     sanitized_Request,
  62     sanitize_filename,
  63     unescapeHTML,
  64     unified_strdate,
  65     unified_timestamp,
  66     update_Request,
  67     update_url_query,
  68     urljoin,
  69     url_basename,
  70     xpath_element,
  71     xpath_text,
  72     xpath_with_ns,
  73 )
  74
  75
  76 class InfoExtractor(object):
  77     """Information Extractor class.
  78
  79     Information extractors are the classes that, given a URL, extract
  80     information about the video (or videos) the URL refers to. This
  81     information includes the real video URL, the video title, author and
  82     others. The information is stored in a dictionary which is then
  83     passed to the YoutubeDL. The YoutubeDL processes this
  84     information possibly downloading the video to the file system, among
  85     other possible outcomes.
  86
  87     The type field determines the type of the result.
  88     By far the most common value (and the default if _type is missing) is
  89     "video", which indicates a single video.
  90
  91     For a video, the dictionaries must include the following fields:
  92
  93     id:             Video identifier.
  94     title:          Video title, unescaped.
  95
  96     Additionally, it must contain either a formats entry or a url one:
  97
  98     formats:        A list of dictionaries for each format available, ordered
  99                     from worst to best quality.
 100
 101                     Potential fields:
 102                     * url        Mandatory. The URL of the video file
 103                     * manifest_url
 104                                  The URL of the manifest file in case of
 105                                  fragmented media (DASH, hls, hds)
 106                     * ext        Will be calculated from URL if missing
 107                     * format     A human-readable description of the format
 108                                  ("mp4 container with h264/opus").
 109                                  Calculated from the format_id, width, height,
 110                                  and format_note fields if missing.
 111                     * format_id  A short description of the format
 112                                  ("mp4_h264_opus" or "19").
 113                                 Technically optional, but strongly recommended.
 114                     * format_note Additional info about the format
 115                                  ("3D" or "DASH video")
 116                     * width      Width of the video, if known
 117                     * height     Height of the video, if known
 118                     * resolution Textual description of width and height
 119                     * tbr        Average bitrate of audio and video in KBit/s
 120                     * abr        Average audio bitrate in KBit/s
 121                     * acodec     Name of the audio codec in use
 122                     * asr        Audio sampling rate in Hertz
 123                     * vbr        Average video bitrate in KBit/s
 124                     * fps        Frame rate
 125                     * vcodec     Name of the video codec in use
 126                     * container  Name of the container format
 127                     * filesize   The number of bytes, if known in advance
 128                     * filesize_approx  An estimate for the number of bytes
 129                     * player_url SWF Player URL (used for rtmpdump).
 130                     * protocol   The protocol that will be used for the actual
 131                                  download, lower-case.
 132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 133                                  "m3u8", "m3u8_native" or "http_dash_segments".
 134                     * fragment_base_url
 135                                  Base URL for fragments. Each fragment's path
 136                                  value (if present) will be relative to
 137                                  this URL.
 138                     * fragments  A list of fragments of a fragmented media.
 139                                  Each fragment entry must contain either an url
 140                                  or a path. If an url is present it should be
 141                                  considered by a client. Otherwise both path and
 142                                  fragment_base_url must be present. Here is
 143                                  the list of all potential fields:
 144                                  * "url" - fragment's URL
 145                                  * "path" - fragment's path relative to
 146                                             fragment_base_url
 147                                  * "duration" (optional, int or float)
 148                                  * "filesize" (optional, int)
 149                     * preference Order number of this format. If this field is
 150                                  present and not None, the formats get sorted
 151                                  by this field, regardless of all other values.
 152                                  -1 for default (order by other properties),
 153                                  -2 or smaller for less than default.
 154                                  < -1000 to hide the format (if there is
 155                                     another one which is strictly better)
 156                     * language   Language code, e.g. "de" or "en-US".
 157                     * language_preference  Is this in the language mentioned in
 158                                  the URL?
 159                                  10 if it's what the URL is about,
 160                                  -1 for default (don't know),
 161                                  -10 otherwise, other values reserved for now.
 162                     * quality    Order number of the video quality of this
 163                                  format, irrespective of the file format.
 164                                  -1 for default (order by other properties),
 165                                  -2 or smaller for less than default.
 166                     * source_preference  Order number for this video source
 167                                   (quality takes higher priority)
 168                                  -1 for default (order by other properties),
 169                                  -2 or smaller for less than default.
 170                     * http_headers  A dictionary of additional HTTP headers
 171                                  to add to the request.
 172                     * stretched_ratio  If given and not 1, indicates that the
 173                                  video's pixels are not square.
 174                                  width : height ratio as float.
 175                     * no_resume  The server does not support resuming the
 176                                  (HTTP or RTMP) download. Boolean.
 177
 178     url:            Final video URL.
 179     ext:            Video filename extension.
 180     format:         The video format, defaults to ext (used for --get-format)
 181     player_url:     SWF Player URL (used for rtmpdump).
 182
 183     The following fields are optional:
 184
 185     alt_title:      A secondary title of the video.
 186     display_id      An alternative identifier for the video, not necessarily
 187                     unique, but available before title. Typically, id is
 188                     something like "4234987", title "Dancing naked mole rats",
 189                     and display_id "dancing-naked-mole-rats"
 190     thumbnails:     A list of dictionaries, with the following entries:
 191                         * "id" (optional, string) - Thumbnail format ID
 192                         * "url"
 193                         * "preference" (optional, int) - quality of the image
 194                         * "width" (optional, int)
 195                         * "height" (optional, int)
 196                         * "resolution" (optional, string "{width}x{height}",
 197                                         deprecated)
 198                         * "filesize" (optional, int)
 199     thumbnail:      Full URL to a video thumbnail image.
 200     description:    Full video description.
 201     uploader:       Full name of the video uploader.
 202     license:        License name the video is licensed under.
 203     creator:        The creator of the video.
 204     release_date:   The date (YYYYMMDD) when the video was released.
 205     timestamp:      UNIX timestamp of the moment the video became available.
 206     upload_date:    Video upload date (YYYYMMDD).
 207                     If not explicitly set, calculated from timestamp.
 208     uploader_id:    Nickname or id of the video uploader.
 209     uploader_url:   Full URL to a personal webpage of the video uploader.
 210     location:       Physical location where the video was filmed.
 211     subtitles:      The available subtitles as a dictionary in the format
 212                     {tag: subformats}. "tag" is usually a language code, and
 213                     "subformats" is a list sorted from lower to higher
 214                     preference, each element is a dictionary with the "ext"
 215                     entry and one of:
 216                         * "data": The subtitles file contents
 217                         * "url": A URL pointing to the subtitles file
 218                     "ext" will be calculated from URL if missing
 219     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 220                     automatically generated captions
 221     duration:       Length of the video in seconds, as an integer or float.
 222     view_count:     How many users have watched the video on the platform.
 223     like_count:     Number of positive ratings of the video
 224     dislike_count:  Number of negative ratings of the video
 225     repost_count:   Number of reposts of the video
 226     average_rating: Average rating given by users, the scale used depends on the webpage
 227     comment_count:  Number of comments on the video
 228     comments:       A list of comments, each with one or more of the following
 229                     properties (all but one of text or html optional):
 230                         * "author" - human-readable name of the comment author
 231                         * "author_id" - user ID of the comment author
 232                         * "id" - Comment ID
 233                         * "html" - Comment as HTML
 234                         * "text" - Plain text of the comment
 235                         * "timestamp" - UNIX timestamp of comment
 236                         * "parent" - ID of the comment this one is replying to.
 237                                      Set to "root" to indicate that this is a
 238                                      comment to the original video.
 239     age_limit:      Age restriction for the video, as an integer (years)
 240     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 241                     should allow to get the same result again. (It will be set
 242                     by YoutubeDL if it's missing)
 243     categories:     A list of categories that the video falls in, for example
 244                     ["Sports", "Berlin"]
 245     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 246     is_live:        True, False, or None (=unknown). Whether this video is a
 247                     live stream that goes on instead of a fixed-length video.
 248     start_time:     Time in seconds where the reproduction should start, as
 249                     specified in the URL.
 250     end_time:       Time in seconds where the reproduction should end, as
 251                     specified in the URL.
 252     chapters:       A list of dictionaries, with the following entries:
 253                         * "start_time" - The start time of the chapter in seconds
 254                         * "end_time" - The end time of the chapter in seconds
 255                         * "title" (optional, string)
 256
 257     The following fields should only be used when the video belongs to some logical
 258     chapter or section:
 259
 260     chapter:        Name or title of the chapter the video belongs to.
 261     chapter_number: Number of the chapter the video belongs to, as an integer.
 262     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 263
 264     The following fields should only be used when the video is an episode of some
 265     series, programme or podcast:
 266
 267     series:         Title of the series or programme the video episode belongs to.
 268     season:         Title of the season the video episode belongs to.
 269     season_number:  Number of the season the video episode belongs to, as an integer.
 270     season_id:      Id of the season the video episode belongs to, as a unicode string.
 271     episode:        Title of the video episode. Unlike mandatory video title field,
 272                     this field should denote the exact title of the video episode
 273                     without any kind of decoration.
 274     episode_number: Number of the video episode within a season, as an integer.
 275     episode_id:     Id of the video episode, as a unicode string.
 276
 277     The following fields should only be used when the media is a track or a part of
 278     a music album:
 279
 280     track:          Title of the track.
 281     track_number:   Number of the track within an album or a disc, as an integer.
 282     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 283                     as a unicode string.
 284     artist:         Artist(s) of the track.
 285     genre:          Genre(s) of the track.
 286     album:          Title of the album the track belongs to.
 287     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 288     album_artist:   List of all artists appeared on the album (e.g.
 289                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 290                     and compilations).
 291     disc_number:    Number of the disc or other physical medium the track belongs to,
 292                     as an integer.
 293     release_year:   Year (YYYY) when the album was released.
 294
 295     Unless mentioned otherwise, the fields should be Unicode strings.
 296
 297     Unless mentioned otherwise, None is equivalent to absence of information.
 298
 299
 300     _type "playlist" indicates multiple videos.
 301     There must be a key "entries", which is a list, an iterable, or a PagedList
 302     object, each element of which is a valid dictionary by this specification.
 303
 304     Additionally, playlists can have "id", "title", "description", "uploader",
 305     "uploader_id", "uploader_url" attributes with the same semantics as videos
 306     (see above).
 307
 308
 309     _type "multi_video" indicates that there are multiple videos that
 310     form a single show, for example multiple acts of an opera or TV episode.
 311     It must have an entries key like a playlist and contain all the keys
 312     required for a video at the same time.
 313
 314
 315     _type "url" indicates that the video must be extracted from another
 316     location, possibly by a different extractor. Its only required key is:
 317     "url" - the next URL to extract.
 318     The key "ie_key" can be set to the class name (minus the trailing "IE",
 319     e.g. "Youtube") if the extractor class is known in advance.
 320     Additionally, the dictionary may have any properties of the resolved entity
 321     known in advance, for example "title" if the title of the referred video is
 322     known ahead of time.
 323
 324
 325     _type "url_transparent" entities have the same specification as "url", but
 326     indicate that the given additional information is more precise than the one
 327     associated with the resolved URL.
 328     This is useful when a site employs a video service that hosts the video and
 329     its technical metadata, but that video service does not embed a useful
 330     title, description etc.
 331
 332
 333     Subclasses of this one should re-define the _real_initialize() and
 334     _real_extract() methods and define a _VALID_URL regexp.
 335     Probably, they should also be added to the list of extractors.
 336
 337     _GEO_BYPASS attribute may be set to False in order to disable
 338     geo restriction bypass mechanisms for a particular extractor.
 339     Though it won't disable explicit geo restriction bypass based on
 340     country code provided with geo_bypass_country. (experimental)
 341
 342     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 343     countries for this extractor. One of these countries will be used by
 344     geo restriction bypass mechanism right away in order to bypass
 345     geo restriction, of course, if the mechanism is not disabled. (experimental)
 346
 347     NB: both these geo attributes are experimental and may change in future
 348     or be completely removed.
 349
 350     Finally, the _WORKING attribute should be set to False for broken IEs
 351     in order to warn the users and skip the tests.
 352     """
 353
 354     _ready = False
 355     _downloader = None
 356     _x_forwarded_for_ip = None
 357     _GEO_BYPASS = True
 358     _GEO_COUNTRIES = None
 359     _WORKING = True
 360
 361     def __init__(self, downloader=None):
 362         """Constructor. Receives an optional downloader."""
 363         self._ready = False
 364         self._x_forwarded_for_ip = None
 365         self.set_downloader(downloader)
 366
 367     @classmethod
 368     def suitable(cls, url):
 369         """Receives a URL and returns True if suitable for this IE."""
 370
 371         # This does not use has/getattr intentionally - we want to know whether
 372         # we have cached the regexp for *this* class, whereas getattr would also
 373         # match the superclass
 374         if '_VALID_URL_RE' not in cls.__dict__:
 375             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 376         return cls._VALID_URL_RE.match(url) is not None
 377
 378     @classmethod
 379     def _match_id(cls, url):
 380         if '_VALID_URL_RE' not in cls.__dict__:
 381             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 382         m = cls._VALID_URL_RE.match(url)
 383         assert m
 384         return compat_str(m.group('id'))
 385
 386     @classmethod
 387     def working(cls):
 388         """Getter method for _WORKING."""
 389         return cls._WORKING
 390
 391     def initialize(self):
 392         """Initializes an instance (authentication, etc)."""
 393         self._initialize_geo_bypass(self._GEO_COUNTRIES)
 394         if not self._ready:
 395             self._real_initialize()
 396             self._ready = True
 397
 398     def _initialize_geo_bypass(self, countries):
 399         """
 400         Initialize geo restriction bypass mechanism.
 401
 402         This method is used to initialize geo bypass mechanism based on faking
 403         X-Forwarded-For HTTP header. A random country from provided country list
 404         is selected and a random IP belonging to this country is generated. This
 405         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 406         HTTP requests.
 407
 408         This method will be used for initial geo bypass mechanism initialization
 409         during the instance initialization with _GEO_COUNTRIES.
 410
 411         You may also manually call it from extractor's code if geo countries
 412         information is not available beforehand (e.g. obtained during
 413         extraction) or due to some another reason.
 414         """
 415         if not self._x_forwarded_for_ip:
 416             country_code = self._downloader.params.get('geo_bypass_country', None)
 417             # If there is no explicit country for geo bypass specified and
 418             # the extractor is known to be geo restricted let's fake IP
 419             # as X-Forwarded-For right away.
 420             if (not country_code and
 421                     self._GEO_BYPASS and
 422                     self._downloader.params.get('geo_bypass', True) and
 423                     countries):
 424                 country_code = random.choice(countries)
 425             if country_code:
 426                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 427                 if self._downloader.params.get('verbose', False):
 428                     self._downloader.to_screen(
 429                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 430                         % (self._x_forwarded_for_ip, country_code.upper()))
 431
 432     def extract(self, url):
 433         """Extracts URL information and returns it in list of dicts."""
 434         try:
 435             for _ in range(2):
 436                 try:
 437                     self.initialize()
 438                     ie_result = self._real_extract(url)
 439                     if self._x_forwarded_for_ip:
 440                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 441                     return ie_result
 442                 except GeoRestrictedError as e:
 443                     if self.__maybe_fake_ip_and_retry(e.countries):
 444                         continue
 445                     raise
 446         except ExtractorError:
 447             raise
 448         except compat_http_client.IncompleteRead as e:
 449             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 450         except (KeyError, StopIteration) as e:
 451             raise ExtractorError('An extractor error has occurred.', cause=e)
 452
 453     def __maybe_fake_ip_and_retry(self, countries):
 454         if (not self._downloader.params.get('geo_bypass_country', None) and
 455                 self._GEO_BYPASS and
 456                 self._downloader.params.get('geo_bypass', True) and
 457                 not self._x_forwarded_for_ip and
 458                 countries):
 459             country_code = random.choice(countries)
 460             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 461             if self._x_forwarded_for_ip:
 462                 self.report_warning(
 463                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 464                     % (self._x_forwarded_for_ip, country_code.upper()))
 465                 return True
 466         return False
 467
 468     def set_downloader(self, downloader):
 469         """Sets the downloader for this IE."""
 470         self._downloader = downloader
 471
 472     def _real_initialize(self):
 473         """Real initialization process. Redefine in subclasses."""
 474         pass
 475
 476     def _real_extract(self, url):
 477         """Real extraction process. Redefine in subclasses."""
 478         pass
 479
 480     @classmethod
 481     def ie_key(cls):
 482         """A string for getting the InfoExtractor with get_info_extractor"""
 483         return compat_str(cls.__name__[:-2])
 484
 485     @property
 486     def IE_NAME(self):
 487         return compat_str(type(self).__name__[:-2])
 488
 489     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 490         """ Returns the response handle """
 491         if note is None:
 492             self.report_download_webpage(video_id)
 493         elif note is not False:
 494             if video_id is None:
 495                 self.to_screen('%s' % (note,))
 496             else:
 497                 self.to_screen('%s: %s' % (video_id, note))
 498         if isinstance(url_or_request, compat_urllib_request.Request):
 499             url_or_request = update_Request(
 500                 url_or_request, data=data, headers=headers, query=query)
 501         else:
 502             if query:
 503                 url_or_request = update_url_query(url_or_request, query)
 504             if data is not None or headers:
 505                 url_or_request = sanitized_Request(url_or_request, data, headers)
 506         try:
 507             return self._downloader.urlopen(url_or_request)
 508         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 509             if errnote is False:
 510                 return False
 511             if errnote is None:
 512                 errnote = 'Unable to download webpage'
 513
 514             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 515             if fatal:
 516                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 517             else:
 518                 self._downloader.report_warning(errmsg)
 519                 return False
 520
 521     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 522         """ Returns a tuple (page content as string, URL handle) """
 523         # Strip hashes from the URL (#1038)
 524         if isinstance(url_or_request, (compat_str, str)):
 525             url_or_request = url_or_request.partition('#')[0]
 526
 527         # Some sites check X-Forwarded-For HTTP header in order to figure out
 528         # the origin of the client behind proxy. This allows bypassing geo
 529         # restriction by faking this header's value to IP that belongs to some
 530         # geo unrestricted country. We will do so once we encounter any
 531         # geo restriction error.
 532         if self._x_forwarded_for_ip:
 533             if 'X-Forwarded-For' not in headers:
 534                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 535
 536         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 537         if urlh is False:
 538             assert not fatal
 539             return False
 540         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 541         return (content, urlh)
 542
 543     @staticmethod
 544     def _guess_encoding_from_content(content_type, webpage_bytes):
 545         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 546         if m:
 547             encoding = m.group(1)
 548         else:
 549             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 550                           webpage_bytes[:1024])
 551             if m:
 552                 encoding = m.group(1).decode('ascii')
 553             elif webpage_bytes.startswith(b'\xff\xfe'):
 554                 encoding = 'utf-16'
 555             else:
 556                 encoding = 'utf-8'
 557
 558         return encoding
 559
 560     def __check_blocked(self, content):
 561         first_block = content[:512]
 562         if ('<title>Access to this site is blocked</title>' in content and
 563                 'Websense' in first_block):
 564             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 565             blocked_iframe = self._html_search_regex(
 566                 r'<iframe src="([^"]+)"', content,
 567                 'Websense information URL', default=None)
 568             if blocked_iframe:
 569                 msg += ' Visit %s for more details' % blocked_iframe
 570             raise ExtractorError(msg, expected=True)
 571         if '<title>The URL you requested has been blocked</title>' in first_block:
 572             msg = (
 573                 'Access to this webpage has been blocked by Indian censorship. '
 574                 'Use a VPN or proxy server (with --proxy) to route around it.')
 575             block_msg = self._html_search_regex(
 576                 r'</h1><p>(.*?)</p>',
 577                 content, 'block message', default=None)
 578             if block_msg:
 579                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 580             raise ExtractorError(msg, expected=True)
 581         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 582                 'blocklist.rkn.gov.ru' in content):
 583             raise ExtractorError(
 584                 'Access to this webpage has been blocked by decision of the Russian government. '
 585                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 586                 expected=True)
 587
 588     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 589         content_type = urlh.headers.get('Content-Type', '')
 590         webpage_bytes = urlh.read()
 591         if prefix is not None:
 592             webpage_bytes = prefix + webpage_bytes
 593         if not encoding:
 594             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 595         if self._downloader.params.get('dump_intermediate_pages', False):
 596             self.to_screen('Dumping request to ' + urlh.geturl())
 597             dump = base64.b64encode(webpage_bytes).decode('ascii')
 598             self._downloader.to_screen(dump)
 599         if self._downloader.params.get('write_pages', False):
 600             basen = '%s_%s' % (video_id, urlh.geturl())
 601             if len(basen) > 240:
 602                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 603                 basen = basen[:240 - len(h)] + h
 604             raw_filename = basen + '.dump'
 605             filename = sanitize_filename(raw_filename, restricted=True)
 606             self.to_screen('Saving request to ' + filename)
 607             # Working around MAX_PATH limitation on Windows (see
 608             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 609             if compat_os_name == 'nt':
 610                 absfilepath = os.path.abspath(filename)
 611                 if len(absfilepath) > 259:
 612                     filename = '\\\\?\\' + absfilepath
 613             with open(filename, 'wb') as outf:
 614                 outf.write(webpage_bytes)
 615
 616         try:
 617             content = webpage_bytes.decode(encoding, 'replace')
 618         except LookupError:
 619             content = webpage_bytes.decode('utf-8', 'replace')
 620
 621         self.__check_blocked(content)
 622
 623         return content
 624
 625     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 626         """ Returns the data of the page as a string """
 627         success = False
 628         try_count = 0
 629         while success is False:
 630             try:
 631                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 632                 success = True
 633             except compat_http_client.IncompleteRead as e:
 634                 try_count += 1
 635                 if try_count >= tries:
 636                     raise e
 637                 self._sleep(timeout, video_id)
 638         if res is False:
 639             return res
 640         else:
 641             content, _ = res
 642             return content
 643
 644     def _download_xml(self, url_or_request, video_id,
 645                       note='Downloading XML', errnote='Unable to download XML',
 646                       transform_source=None, fatal=True, encoding=None,
 647                       data=None, headers={}, query={}):
 648         """Return the xml as an xml.etree.ElementTree.Element"""
 649         xml_string = self._download_webpage(
 650             url_or_request, video_id, note, errnote, fatal=fatal,
 651             encoding=encoding, data=data, headers=headers, query=query)
 652         if xml_string is False:
 653             return xml_string
 654         return self._parse_xml(
 655             xml_string, video_id, transform_source=transform_source,
 656             fatal=fatal)
 657
 658     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 659         if transform_source:
 660             xml_string = transform_source(xml_string)
 661         try:
 662             return compat_etree_fromstring(xml_string.encode('utf-8'))
 663         except compat_xml_parse_error as ve:
 664             errmsg = '%s: Failed to parse XML ' % video_id
 665             if fatal:
 666                 raise ExtractorError(errmsg, cause=ve)
 667             else:
 668                 self.report_warning(errmsg + str(ve))
 669
 670     def _download_json(self, url_or_request, video_id,
 671                        note='Downloading JSON metadata',
 672                        errnote='Unable to download JSON metadata',
 673                        transform_source=None,
 674                        fatal=True, encoding=None, data=None, headers={}, query={}):
 675         json_string = self._download_webpage(
 676             url_or_request, video_id, note, errnote, fatal=fatal,
 677             encoding=encoding, data=data, headers=headers, query=query)
 678         if (not fatal) and json_string is False:
 679             return None
 680         return self._parse_json(
 681             json_string, video_id, transform_source=transform_source, fatal=fatal)
 682
 683     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 684         if transform_source:
 685             json_string = transform_source(json_string)
 686         try:
 687             return json.loads(json_string)
 688         except ValueError as ve:
 689             errmsg = '%s: Failed to parse JSON ' % video_id
 690             if fatal:
 691                 raise ExtractorError(errmsg, cause=ve)
 692             else:
 693                 self.report_warning(errmsg + str(ve))
 694
 695     def report_warning(self, msg, video_id=None):
 696         idstr = '' if video_id is None else '%s: ' % video_id
 697         self._downloader.report_warning(
 698             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 699
 700     def to_screen(self, msg):
 701         """Print msg to screen, prefixing it with '[ie_name]'"""
 702         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 703
 704     def report_extraction(self, id_or_name):
 705         """Report information extraction."""
 706         self.to_screen('%s: Extracting information' % id_or_name)
 707
 708     def report_download_webpage(self, video_id):
 709         """Report webpage download."""
 710         self.to_screen('%s: Downloading webpage' % video_id)
 711
 712     def report_age_confirmation(self):
 713         """Report attempt to confirm age."""
 714         self.to_screen('Confirming age')
 715
 716     def report_login(self):
 717         """Report attempt to log in."""
 718         self.to_screen('Logging in')
 719
 720     @staticmethod
 721     def raise_login_required(msg='This video is only available for registered users'):
 722         raise ExtractorError(
 723             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 724             expected=True)
 725
 726     @staticmethod
 727     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 728         raise GeoRestrictedError(msg, countries=countries)
 729
 730     # Methods for following #608
 731     @staticmethod
 732     def url_result(url, ie=None, video_id=None, video_title=None):
 733         """Returns a URL that points to a page that should be processed"""
 734         # TODO: ie should be the class used for getting the info
 735         video_info = {'_type': 'url',
 736                       'url': url,
 737                       'ie_key': ie}
 738         if video_id is not None:
 739             video_info['id'] = video_id
 740         if video_title is not None:
 741             video_info['title'] = video_title
 742         return video_info
 743
 744     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 745         urls = orderedSet(
 746             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 747             for m in matches)
 748         return self.playlist_result(
 749             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 750
 751     @staticmethod
 752     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 753         """Returns a playlist"""
 754         video_info = {'_type': 'playlist',
 755                       'entries': entries}
 756         if playlist_id:
 757             video_info['id'] = playlist_id
 758         if playlist_title:
 759             video_info['title'] = playlist_title
 760         if playlist_description:
 761             video_info['description'] = playlist_description
 762         return video_info
 763
 764     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 765         """
 766         Perform a regex search on the given string, using a single or a list of
 767         patterns returning the first matching group.
 768         In case of failure return a default value or raise a WARNING or a
 769         RegexNotFoundError, depending on fatal, specifying the field name.
 770         """
 771         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 772             mobj = re.search(pattern, string, flags)
 773         else:
 774             for p in pattern:
 775                 mobj = re.search(p, string, flags)
 776                 if mobj:
 777                     break
 778
 779         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 780             _name = '\033[0;34m%s\033[0m' % name
 781         else:
 782             _name = name
 783
 784         if mobj:
 785             if group is None:
 786                 # return the first matching group
 787                 return next(g for g in mobj.groups() if g is not None)
 788             else:
 789                 return mobj.group(group)
 790         elif default is not NO_DEFAULT:
 791             return default
 792         elif fatal:
 793             raise RegexNotFoundError('Unable to extract %s' % _name)
 794         else:
 795             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 796             return None
 797
 798     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 799         """
 800         Like _search_regex, but strips HTML tags and unescapes entities.
 801         """
 802         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 803         if res:
 804             return clean_html(res).strip()
 805         else:
 806             return res
 807
 808     def _get_netrc_login_info(self, netrc_machine=None):
 809         username = None
 810         password = None
 811         netrc_machine = netrc_machine or self._NETRC_MACHINE
 812
 813         if self._downloader.params.get('usenetrc', False):
 814             try:
 815                 info = netrc.netrc().authenticators(netrc_machine)
 816                 if info is not None:
 817                     username = info[0]
 818                     password = info[2]
 819                 else:
 820                     raise netrc.NetrcParseError(
 821                         'No authenticators for %s' % netrc_machine)
 822             except (IOError, netrc.NetrcParseError) as err:
 823                 self._downloader.report_warning(
 824                     'parsing .netrc: %s' % error_to_compat_str(err))
 825
 826         return username, password
 827
 828     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 829         """
 830         Get the login info as (username, password)
 831         First look for the manually specified credentials using username_option
 832         and password_option as keys in params dictionary. If no such credentials
 833         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 834         value.
 835         If there's no info available, return (None, None)
 836         """
 837         if self._downloader is None:
 838             return (None, None)
 839
 840         downloader_params = self._downloader.params
 841
 842         # Attempt to use provided username and password or .netrc data
 843         if downloader_params.get(username_option) is not None:
 844             username = downloader_params[username_option]
 845             password = downloader_params[password_option]
 846         else:
 847             username, password = self._get_netrc_login_info(netrc_machine)
 848
 849         return username, password
 850
 851     def _get_tfa_info(self, note='two-factor verification code'):
 852         """
 853         Get the two-factor authentication info
 854         TODO - asking the user will be required for sms/phone verify
 855         currently just uses the command line option
 856         If there's no info available, return None
 857         """
 858         if self._downloader is None:
 859             return None
 860         downloader_params = self._downloader.params
 861
 862         if downloader_params.get('twofactor') is not None:
 863             return downloader_params['twofactor']
 864
 865         return compat_getpass('Type %s and press [Return]: ' % note)
 866
 867     # Helper functions for extracting OpenGraph info
 868     @staticmethod
 869     def _og_regexes(prop):
 870         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 871         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 872                        % {'prop': re.escape(prop)})
 873         template = r'<meta[^>]+?%s[^>]+?%s'
 874         return [
 875             template % (property_re, content_re),
 876             template % (content_re, property_re),
 877         ]
 878
 879     @staticmethod
 880     def _meta_regex(prop):
 881         return r'''(?isx)<meta
 882                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 883                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 884
 885     def _og_search_property(self, prop, html, name=None, **kargs):
 886         if not isinstance(prop, (list, tuple)):
 887             prop = [prop]
 888         if name is None:
 889             name = 'OpenGraph %s' % prop[0]
 890         og_regexes = []
 891         for p in prop:
 892             og_regexes.extend(self._og_regexes(p))
 893         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 894         if escaped is None:
 895             return None
 896         return unescapeHTML(escaped)
 897
 898     def _og_search_thumbnail(self, html, **kargs):
 899         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 900
 901     def _og_search_description(self, html, **kargs):
 902         return self._og_search_property('description', html, fatal=False, **kargs)
 903
 904     def _og_search_title(self, html, **kargs):
 905         return self._og_search_property('title', html, **kargs)
 906
 907     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 908         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 909         if secure:
 910             regexes = self._og_regexes('video:secure_url') + regexes
 911         return self._html_search_regex(regexes, html, name, **kargs)
 912
 913     def _og_search_url(self, html, **kargs):
 914         return self._og_search_property('url', html, **kargs)
 915
 916     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 917         if not isinstance(name, (list, tuple)):
 918             name = [name]
 919         if display_name is None:
 920             display_name = name[0]
 921         return self._html_search_regex(
 922             [self._meta_regex(n) for n in name],
 923             html, display_name, fatal=fatal, group='content', **kwargs)
 924
 925     def _dc_search_uploader(self, html):
 926         return self._html_search_meta('dc.creator', html, 'uploader')
 927
 928     def _rta_search(self, html):
 929         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 930         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 931                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 932                      html):
 933             return 18
 934         return 0
 935
 936     def _media_rating_search(self, html):
 937         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 938         rating = self._html_search_meta('rating', html)
 939
 940         if not rating:
 941             return None
 942
 943         RATING_TABLE = {
 944             'safe for kids': 0,
 945             'general': 8,
 946             '14 years': 14,
 947             'mature': 17,
 948             'restricted': 19,
 949         }
 950         return RATING_TABLE.get(rating.lower())
 951
 952     def _family_friendly_search(self, html):
 953         # See http://schema.org/VideoObject
 954         family_friendly = self._html_search_meta(
 955             'isFamilyFriendly', html, default=None)
 956
 957         if not family_friendly:
 958             return None
 959
 960         RATING_TABLE = {
 961             '1': 0,
 962             'true': 0,
 963             '0': 18,
 964             'false': 18,
 965         }
 966         return RATING_TABLE.get(family_friendly.lower())
 967
 968     def _twitter_search_player(self, html):
 969         return self._html_search_meta('twitter:player', html,
 970                                       'twitter card player')
 971
 972     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 973         json_ld = self._search_regex(
 974             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 975             html, 'JSON-LD', group='json_ld', **kwargs)
 976         default = kwargs.get('default', NO_DEFAULT)
 977         if not json_ld:
 978             return default if default is not NO_DEFAULT else {}
 979         # JSON-LD may be malformed and thus `fatal` should be respected.
 980         # At the same time `default` may be passed that assumes `fatal=False`
 981         # for _search_regex. Let's simulate the same behavior here as well.
 982         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 983         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 984
 985     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 986         if isinstance(json_ld, compat_str):
 987             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 988         if not json_ld:
 989             return {}
 990         info = {}
 991         if not isinstance(json_ld, (list, tuple, dict)):
 992             return info
 993         if isinstance(json_ld, dict):
 994             json_ld = [json_ld]
 995
 996         def extract_video_object(e):
 997             assert e['@type'] == 'VideoObject'
 998             info.update({
 999                 'url': e.get('contentUrl'),
1000                 'title': unescapeHTML(e.get('name')),
1001                 'description': unescapeHTML(e.get('description')),
1002                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1003                 'duration': parse_duration(e.get('duration')),
1004                 'timestamp': unified_timestamp(e.get('uploadDate')),
1005                 'filesize': float_or_none(e.get('contentSize')),
1006                 'tbr': int_or_none(e.get('bitrate')),
1007                 'width': int_or_none(e.get('width')),
1008                 'height': int_or_none(e.get('height')),
1009                 'view_count': int_or_none(e.get('interactionCount')),
1010             })
1011
1012         for e in json_ld:
1013             if e.get('@context') == 'http://schema.org':
1014                 item_type = e.get('@type')
1015                 if expected_type is not None and expected_type != item_type:
1016                     return info
1017                 if item_type in ('TVEpisode', 'Episode'):
1018                     info.update({
1019                         'episode': unescapeHTML(e.get('name')),
1020                         'episode_number': int_or_none(e.get('episodeNumber')),
1021                         'description': unescapeHTML(e.get('description')),
1022                     })
1023                     part_of_season = e.get('partOfSeason')
1024                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1025                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1026                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1027                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1028                         info['series'] = unescapeHTML(part_of_series.get('name'))
1029                 elif item_type == 'Article':
1030                     info.update({
1031                         'timestamp': parse_iso8601(e.get('datePublished')),
1032                         'title': unescapeHTML(e.get('headline')),
1033                         'description': unescapeHTML(e.get('articleBody')),
1034                     })
1035                 elif item_type == 'VideoObject':
1036                     extract_video_object(e)
1037                     continue
1038                 video = e.get('video')
1039                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1040                     extract_video_object(video)
1041                 break
1042         return dict((k, v) for k, v in info.items() if v is not None)
1043
1044     @staticmethod
1045     def _hidden_inputs(html):
1046         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1047         hidden_inputs = {}
1048         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1049             attrs = extract_attributes(input)
1050             if not input:
1051                 continue
1052             if attrs.get('type') not in ('hidden', 'submit'):
1053                 continue
1054             name = attrs.get('name') or attrs.get('id')
1055             value = attrs.get('value')
1056             if name and value is not None:
1057                 hidden_inputs[name] = value
1058         return hidden_inputs
1059
1060     def _form_hidden_inputs(self, form_id, html):
1061         form = self._search_regex(
1062             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1063             html, '%s form' % form_id, group='form')
1064         return self._hidden_inputs(form)
1065
1066     def _sort_formats(self, formats, field_preference=None):
1067         if not formats:
1068             raise ExtractorError('No video formats found')
1069
1070         for f in formats:
1071             # Automatically determine tbr when missing based on abr and vbr (improves
1072             # formats sorting in some cases)
1073             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1074                 f['tbr'] = f['abr'] + f['vbr']
1075
1076         def _formats_key(f):
1077             # TODO remove the following workaround
1078             from ..utils import determine_ext
1079             if not f.get('ext') and 'url' in f:
1080                 f['ext'] = determine_ext(f['url'])
1081
1082             if isinstance(field_preference, (list, tuple)):
1083                 return tuple(
1084                     f.get(field)
1085                     if f.get(field) is not None
1086                     else ('' if field == 'format_id' else -1)
1087                     for field in field_preference)
1088
1089             preference = f.get('preference')
1090             if preference is None:
1091                 preference = 0
1092                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1093                     preference -= 0.5
1094
1095             protocol = f.get('protocol') or determine_protocol(f)
1096             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1097
1098             if f.get('vcodec') == 'none':  # audio only
1099                 preference -= 50
1100                 if self._downloader.params.get('prefer_free_formats'):
1101                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1102                 else:
1103                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1104                 ext_preference = 0
1105                 try:
1106                     audio_ext_preference = ORDER.index(f['ext'])
1107                 except ValueError:
1108                     audio_ext_preference = -1
1109             else:
1110                 if f.get('acodec') == 'none':  # video only
1111                     preference -= 40
1112                 if self._downloader.params.get('prefer_free_formats'):
1113                     ORDER = ['flv', 'mp4', 'webm']
1114                 else:
1115                     ORDER = ['webm', 'flv', 'mp4']
1116                 try:
1117                     ext_preference = ORDER.index(f['ext'])
1118                 except ValueError:
1119                     ext_preference = -1
1120                 audio_ext_preference = 0
1121
1122             return (
1123                 preference,
1124                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1125                 f.get('quality') if f.get('quality') is not None else -1,
1126                 f.get('tbr') if f.get('tbr') is not None else -1,
1127                 f.get('filesize') if f.get('filesize') is not None else -1,
1128                 f.get('vbr') if f.get('vbr') is not None else -1,
1129                 f.get('height') if f.get('height') is not None else -1,
1130                 f.get('width') if f.get('width') is not None else -1,
1131                 proto_preference,
1132                 ext_preference,
1133                 f.get('abr') if f.get('abr') is not None else -1,
1134                 audio_ext_preference,
1135                 f.get('fps') if f.get('fps') is not None else -1,
1136                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1137                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1138                 f.get('format_id') if f.get('format_id') is not None else '',
1139             )
1140         formats.sort(key=_formats_key)
1141
1142     def _check_formats(self, formats, video_id):
1143         if formats:
1144             formats[:] = filter(
1145                 lambda f: self._is_valid_url(
1146                     f['url'], video_id,
1147                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1148                 formats)
1149
1150     @staticmethod
1151     def _remove_duplicate_formats(formats):
1152         format_urls = set()
1153         unique_formats = []
1154         for f in formats:
1155             if f['url'] not in format_urls:
1156                 format_urls.add(f['url'])
1157                 unique_formats.append(f)
1158         formats[:] = unique_formats
1159
1160     def _is_valid_url(self, url, video_id, item='video', headers={}):
1161         url = self._proto_relative_url(url, scheme='http:')
1162         # For now assume non HTTP(S) URLs always valid
1163         if not (url.startswith('http://') or url.startswith('https://')):
1164             return True
1165         try:
1166             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1167             return True
1168         except ExtractorError as e:
1169             if isinstance(e.cause, compat_urllib_error.URLError):
1170                 self.to_screen(
1171                     '%s: %s URL is invalid, skipping' % (video_id, item))
1172                 return False
1173             raise
1174
1175     def http_scheme(self):
1176         """ Either "http:" or "https:", depending on the user's preferences """
1177         return (
1178             'http:'
1179             if self._downloader.params.get('prefer_insecure', False)
1180             else 'https:')
1181
1182     def _proto_relative_url(self, url, scheme=None):
1183         if url is None:
1184             return url
1185         if url.startswith('//'):
1186             if scheme is None:
1187                 scheme = self.http_scheme()
1188             return scheme + url
1189         else:
1190             return url
1191
1192     def _sleep(self, timeout, video_id, msg_template=None):
1193         if msg_template is None:
1194             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1195         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1196         self.to_screen(msg)
1197         time.sleep(timeout)
1198
1199     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1200                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1201                              fatal=True, m3u8_id=None):
1202         manifest = self._download_xml(
1203             manifest_url, video_id, 'Downloading f4m manifest',
1204             'Unable to download f4m manifest',
1205             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1206             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1207             transform_source=transform_source,
1208             fatal=fatal)
1209
1210         if manifest is False:
1211             return []
1212
1213         return self._parse_f4m_formats(
1214             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1215             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1216
1217     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1218                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1219                            fatal=True, m3u8_id=None):
1220         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1221         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1222         if akamai_pv is not None and ';' in akamai_pv.text:
1223             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1224             if playerVerificationChallenge.strip() != '':
1225                 return []
1226
1227         formats = []
1228         manifest_version = '1.0'
1229         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1230         if not media_nodes:
1231             manifest_version = '2.0'
1232             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1233         # Remove unsupported DRM protected media from final formats
1234         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1235         media_nodes = remove_encrypted_media(media_nodes)
1236         if not media_nodes:
1237             return formats
1238
1239         manifest_base_url = get_base_url(manifest)
1240
1241         bootstrap_info = xpath_element(
1242             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1243             'bootstrap info', default=None)
1244
1245         vcodec = None
1246         mime_type = xpath_text(
1247             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1248             'base URL', default=None)
1249         if mime_type and mime_type.startswith('audio/'):
1250             vcodec = 'none'
1251
1252         for i, media_el in enumerate(media_nodes):
1253             tbr = int_or_none(media_el.attrib.get('bitrate'))
1254             width = int_or_none(media_el.attrib.get('width'))
1255             height = int_or_none(media_el.attrib.get('height'))
1256             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1257             # If <bootstrapInfo> is present, the specified f4m is a
1258             # stream-level manifest, and only set-level manifests may refer to
1259             # external resources.  See section 11.4 and section 4 of F4M spec
1260             if bootstrap_info is None:
1261                 media_url = None
1262                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1263                 if manifest_version == '2.0':
1264                     media_url = media_el.attrib.get('href')
1265                 if media_url is None:
1266                     media_url = media_el.attrib.get('url')
1267                 if not media_url:
1268                     continue
1269                 manifest_url = (
1270                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1271                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1272                 # If media_url is itself a f4m manifest do the recursive extraction
1273                 # since bitrates in parent manifest (this one) and media_url manifest
1274                 # may differ leading to inability to resolve the format by requested
1275                 # bitrate in f4m downloader
1276                 ext = determine_ext(manifest_url)
1277                 if ext == 'f4m':
1278                     f4m_formats = self._extract_f4m_formats(
1279                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1280                         transform_source=transform_source, fatal=fatal)
1281                     # Sometimes stream-level manifest contains single media entry that
1282                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1283                     # At the same time parent's media entry in set-level manifest may
1284                     # contain it. We will copy it from parent in such cases.
1285                     if len(f4m_formats) == 1:
1286                         f = f4m_formats[0]
1287                         f.update({
1288                             'tbr': f.get('tbr') or tbr,
1289                             'width': f.get('width') or width,
1290                             'height': f.get('height') or height,
1291                             'format_id': f.get('format_id') if not tbr else format_id,
1292                             'vcodec': vcodec,
1293                         })
1294                     formats.extend(f4m_formats)
1295                     continue
1296                 elif ext == 'm3u8':
1297                     formats.extend(self._extract_m3u8_formats(
1298                         manifest_url, video_id, 'mp4', preference=preference,
1299                         m3u8_id=m3u8_id, fatal=fatal))
1300                     continue
1301             formats.append({
1302                 'format_id': format_id,
1303                 'url': manifest_url,
1304                 'manifest_url': manifest_url,
1305                 'ext': 'flv' if bootstrap_info is not None else None,
1306                 'protocol': 'f4m',
1307                 'tbr': tbr,
1308                 'width': width,
1309                 'height': height,
1310                 'vcodec': vcodec,
1311                 'preference': preference,
1312             })
1313         return formats
1314
1315     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1316         return {
1317             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1318             'url': m3u8_url,
1319             'ext': ext,
1320             'protocol': 'm3u8',
1321             'preference': preference - 100 if preference else -100,
1322             'resolution': 'multiple',
1323             'format_note': 'Quality selection URL',
1324         }
1325
1326     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1327                               entry_protocol='m3u8', preference=None,
1328                               m3u8_id=None, note=None, errnote=None,
1329                               fatal=True, live=False):
1330         res = self._download_webpage_handle(
1331             m3u8_url, video_id,
1332             note=note or 'Downloading m3u8 information',
1333             errnote=errnote or 'Failed to download m3u8 information',
1334             fatal=fatal)
1335
1336         if res is False:
1337             return []
1338
1339         m3u8_doc, urlh = res
1340         m3u8_url = urlh.geturl()
1341
1342         return self._parse_m3u8_formats(
1343             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1344             preference=preference, m3u8_id=m3u8_id, live=live)
1345
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse the text of an m3u8 playlist into a list of format dicts.

        m3u8_doc is the playlist text and m3u8_url the URL used to resolve
        non-http(s) entries.  ext, entry_protocol and preference are copied
        into every format; m3u8_id, when set, prefixes every format_id.

        Returns an empty list when the playlist carries an Adobe Flash Access
        tag or an EXT-X-SESSION-KEY with an skd:// URI.  A playlist
        containing #EXT-X-TARGETDURATION is returned as a single format
        pointing at m3u8_url.  Otherwise one format is produced for every
        VIDEO/AUDIO EXT-X-MEDIA tag that has a URI and one for every URI line.
        When live is true the stream name / bitrate is left out of format_id.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Absolute http(s) URLs are kept as is, anything else is joined
        # with the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of EXT-X-MEDIA attribute dicts, in playlist order
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset to {}
        # once the URI line that follows it has been consumed
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            # Every well-formed rendition is remembered in its group, even
            # those that do not yield a format below
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Although the specification does not mention a NAME attribute
            # for the EXT-X-STREAM-INF tag, it may still be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is treated as a variant stream URI
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and, despite
                # referencing an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1504
1505     @staticmethod
1506     def _xpath_ns(path, namespace=None):
1507         if not namespace:
1508             return path
1509         out = []
1510         for c in path.split('/'):
1511             if not c or c == '.':
1512                 out.append(c)
1513             else:
1514                 out.append('{%s}%s' % (namespace, c))
1515         return '/'.join(out)
1516
1517     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1518         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1519
1520         if smil is False:
1521             assert not fatal
1522             return []
1523
1524         namespace = self._parse_smil_namespace(smil)
1525
1526         return self._parse_smil_formats(
1527             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1528
1529     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1530         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1531         if smil is False:
1532             return {}
1533         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1534
1535     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1536         return self._download_xml(
1537             smil_url, video_id, 'Downloading SMIL file',
1538             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1539
1540     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1541         namespace = self._parse_smil_namespace(smil)
1542
1543         formats = self._parse_smil_formats(
1544             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1545         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1546
1547         video_id = os.path.splitext(url_basename(smil_url))[0]
1548         title = None
1549         description = None
1550         upload_date = None
1551         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1552             name = meta.attrib.get('name')
1553             content = meta.attrib.get('content')
1554             if not name or not content:
1555                 continue
1556             if not title and name == 'title':
1557                 title = content
1558             elif not description and name in ('description', 'abstract'):
1559                 description = content
1560             elif not upload_date and name == 'date':
1561                 upload_date = unified_strdate(content)
1562
1563         thumbnails = [{
1564             'id': image.get('type'),
1565             'url': image.get('src'),
1566             'width': int_or_none(image.get('width')),
1567             'height': int_or_none(image.get('height')),
1568         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1569
1570         return {
1571             'id': video_id,
1572             'title': title or video_id,
1573             'description': description,
1574             'upload_date': upload_date,
1575             'thumbnails': thumbnails,
1576             'formats': formats,
1577             'subtitles': subtitles,
1578         }
1579
1580     def _parse_smil_namespace(self, smil):
1581         return self._search_regex(
1582             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1583
1584     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1585         base = smil_url
1586         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1587             b = meta.get('base') or meta.get('httpBase')
1588             if b:
1589                 base = b
1590                 break
1591
1592         formats = []
1593         rtmp_count = 0
1594         http_count = 0
1595         m3u8_count = 0
1596
1597         srcs = []
1598         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1599         for medium in media:
1600             src = medium.get('src')
1601             if not src or src in srcs:
1602                 continue
1603             srcs.append(src)
1604
1605             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1606             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1607             width = int_or_none(medium.get('width'))
1608             height = int_or_none(medium.get('height'))
1609             proto = medium.get('proto')
1610             ext = medium.get('ext')
1611             src_ext = determine_ext(src)
1612             streamer = medium.get('streamer') or base
1613
1614             if proto == 'rtmp' or streamer.startswith('rtmp'):
1615                 rtmp_count += 1
1616                 formats.append({
1617                     'url': streamer,
1618                     'play_path': src,
1619                     'ext': 'flv',
1620                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1621                     'tbr': bitrate,
1622                     'filesize': filesize,
1623                     'width': width,
1624                     'height': height,
1625                 })
1626                 if transform_rtmp_url:
1627                     streamer, src = transform_rtmp_url(streamer, src)
1628                     formats[-1].update({
1629                         'url': streamer,
1630                         'play_path': src,
1631                     })
1632                 continue
1633
1634             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1635             src_url = src_url.strip()
1636
1637             if proto == 'm3u8' or src_ext == 'm3u8':
1638                 m3u8_formats = self._extract_m3u8_formats(
1639                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1640                 if len(m3u8_formats) == 1:
1641                     m3u8_count += 1
1642                     m3u8_formats[0].update({
1643                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1644                         'tbr': bitrate,
1645                         'width': width,
1646                         'height': height,
1647                     })
1648                 formats.extend(m3u8_formats)
1649                 continue
1650
1651             if src_ext == 'f4m':
1652                 f4m_url = src_url
1653                 if not f4m_params:
1654                     f4m_params = {
1655                         'hdcore': '3.2.0',
1656                         'plugin': 'flowplayer-3.2.0.1',
1657                     }
1658                 f4m_url += '&' if '?' in f4m_url else '?'
1659                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1660                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1661                 continue
1662
1663             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1664                 http_count += 1
1665                 formats.append({
1666                     'url': src_url,
1667                     'ext': ext or src_ext or 'flv',
1668                     'format_id': 'http-%d' % (bitrate or http_count),
1669                     'tbr': bitrate,
1670                     'filesize': filesize,
1671                     'width': width,
1672                     'height': height,
1673                 })
1674                 continue
1675
1676         return formats
1677
1678     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1679         urls = []
1680         subtitles = {}
1681         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1682             src = textstream.get('src')
1683             if not src or src in urls:
1684                 continue
1685             urls.append(src)
1686             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1687             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1688             subtitles.setdefault(lang, []).append({
1689                 'url': src,
1690                 'ext': ext,
1691             })
1692         return subtitles
1693
1694     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1695         xspf = self._download_xml(
1696             playlist_url, playlist_id, 'Downloading xpsf playlist',
1697             'Unable to download xspf manifest', fatal=fatal)
1698         if xspf is False:
1699             return []
1700         return self._parse_xspf(xspf, playlist_id)
1701
1702     def _parse_xspf(self, playlist, playlist_id):
1703         NS_MAP = {
1704             'xspf': 'http://xspf.org/ns/0/',
1705             's1': 'http://static.streamone.nl/player/ns/0',
1706         }
1707
1708         entries = []
1709         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1710             title = xpath_text(
1711                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1712             description = xpath_text(
1713                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1714             thumbnail = xpath_text(
1715                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1716             duration = float_or_none(
1717                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1718
1719             formats = [{
1720                 'url': location.text,
1721                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1722                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1723                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1724             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1725             self._sort_formats(formats)
1726
1727             entries.append({
1728                 'id': playlist_id,
1729                 'title': title,
1730                 'description': description,
1731                 'thumbnail': thumbnail,
1732                 'duration': duration,
1733                 'formats': formats,
1734             })
1735         return entries
1736
1737     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1738         res = self._download_webpage_handle(
1739             mpd_url, video_id,
1740             note=note or 'Downloading MPD manifest',
1741             errnote=errnote or 'Failed to download MPD manifest',
1742             fatal=fatal)
1743         if res is False:
1744             return []
1745         mpd, urlh = res
1746         mpd_base_url = base_url(urlh.geturl())
1747
1748         return self._parse_mpd_formats(
1749             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1750             formats_dict=formats_dict, mpd_url=mpd_url)
1751
1752     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1753         """
1754         Parse formats from MPD manifest.
1755         References:
1756          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1757             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1758          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1759         """
1760         if mpd_doc.get('type') == 'dynamic':
1761             return []
1762
1763         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1764
1765         def _add_ns(path):
1766             return self._xpath_ns(path, namespace)
1767
1768         def is_drm_protected(element):
1769             return element.find(_add_ns('ContentProtection')) is not None
1770
1771         def extract_multisegment_info(element, ms_parent_info):
1772             ms_info = ms_parent_info.copy()
1773
1774             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1775             # common attributes and elements.  We will only extract relevant
1776             # for us.
1777             def extract_common(source):
1778                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1779                 if segment_timeline is not None:
1780                     s_e = segment_timeline.findall(_add_ns('S'))
1781                     if s_e:
1782                         ms_info['total_number'] = 0
1783                         ms_info['s'] = []
1784                         for s in s_e:
1785                             r = int(s.get('r', 0))
1786                             ms_info['total_number'] += 1 + r
1787                             ms_info['s'].append({
1788                                 't': int(s.get('t', 0)),
1789                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1790                                 'd': int(s.attrib['d']),
1791                                 'r': r,
1792                             })
1793                 start_number = source.get('startNumber')
1794                 if start_number:
1795                     ms_info['start_number'] = int(start_number)
1796                 timescale = source.get('timescale')
1797                 if timescale:
1798                     ms_info['timescale'] = int(timescale)
1799                 segment_duration = source.get('duration')
1800                 if segment_duration:
1801                     ms_info['segment_duration'] = float(segment_duration)
1802
1803             def extract_Initialization(source):
1804                 initialization = source.find(_add_ns('Initialization'))
1805                 if initialization is not None:
1806                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1807
1808             segment_list = element.find(_add_ns('SegmentList'))
1809             if segment_list is not None:
1810                 extract_common(segment_list)
1811                 extract_Initialization(segment_list)
1812                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1813                 if segment_urls_e:
1814                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1815             else:
1816                 segment_template = element.find(_add_ns('SegmentTemplate'))
1817                 if segment_template is not None:
1818                     extract_common(segment_template)
1819                     media = segment_template.get('media')
1820                     if media:
1821                         ms_info['media'] = media
1822                     initialization = segment_template.get('initialization')
1823                     if initialization:
1824                         ms_info['initialization'] = initialization
1825                     else:
1826                         extract_Initialization(segment_template)
1827             return ms_info
1828
1829         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1830         formats = []
1831         for period in mpd_doc.findall(_add_ns('Period')):
1832             period_duration = parse_duration(period.get('duration')) or mpd_duration
1833             period_ms_info = extract_multisegment_info(period, {
1834                 'start_number': 1,
1835                 'timescale': 1,
1836             })
1837             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1838                 if is_drm_protected(adaptation_set):
1839                     continue
1840                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1841                 for representation in adaptation_set.findall(_add_ns('Representation')):
1842                     if is_drm_protected(representation):
1843                         continue
1844                     representation_attrib = adaptation_set.attrib.copy()
1845                     representation_attrib.update(representation.attrib)
1846                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1847                     mime_type = representation_attrib['mimeType']
1848                     content_type = mime_type.split('/')[0]
1849                     if content_type == 'text':
1850                         # TODO implement WebVTT downloading
1851                         pass
1852                     elif content_type in ('video', 'audio'):
1853                         base_url = ''
1854                         for element in (representation, adaptation_set, period, mpd_doc):
1855                             base_url_e = element.find(_add_ns('BaseURL'))
1856                             if base_url_e is not None:
1857                                 base_url = base_url_e.text + base_url
1858                                 if re.match(r'^https?://', base_url):
1859                                     break
1860                         if mpd_base_url and not re.match(r'^https?://', base_url):
1861                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1862                                 mpd_base_url += '/'
1863                             base_url = mpd_base_url + base_url
1864                         representation_id = representation_attrib.get('id')
1865                         lang = representation_attrib.get('lang')
1866                         url_el = representation.find(_add_ns('BaseURL'))
1867                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1868                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1869                         f = {
1870                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1871                             'url': base_url,
1872                             'manifest_url': mpd_url,
1873                             'ext': mimetype2ext(mime_type),
1874                             'width': int_or_none(representation_attrib.get('width')),
1875                             'height': int_or_none(representation_attrib.get('height')),
1876                             'tbr': float_or_none(bandwidth, 1000),
1877                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1878                             'fps': int_or_none(representation_attrib.get('frameRate')),
1879                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1880                             'format_note': 'DASH %s' % content_type,
1881                             'filesize': filesize,
1882                         }
1883                         f.update(parse_codecs(representation_attrib.get('codecs')))
1884                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1885
1886                         def prepare_template(template_name, identifiers):
1887                             t = representation_ms_info[template_name]
1888                             t = t.replace('$RepresentationID$', representation_id)
1889                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1890                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1891                             t.replace('$$', '$')
1892                             return t
1893
1894                         # @initialization is a regular template like @media one
1895                         # so it should be handled just the same way (see
1896                         # https://github.com/rg3/youtube-dl/issues/11605)
1897                         if 'initialization' in representation_ms_info:
1898                             initialization_template = prepare_template(
1899                                 'initialization',
1900                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1901                                 # $Time$ shall not be included for @initialization thus
1902                                 # only $Bandwidth$ remains
1903                                 ('Bandwidth', ))
1904                             representation_ms_info['initialization_url'] = initialization_template % {
1905                                 'Bandwidth': bandwidth,
1906                             }
1907
1908                         def location_key(location):
1909                             return 'url' if re.match(r'^https?://', location) else 'path'
1910
1911                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1912
1913                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1914                             media_location_key = location_key(media_template)
1915
1916                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1917                             # can't be used at the same time
1918                             if '%(Number' in media_template and 's' not in representation_ms_info:
1919                                 segment_duration = None
1920                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1921                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1922                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1923                                 representation_ms_info['fragments'] = [{
1924                                     media_location_key: media_template % {
1925                                         'Number': segment_number,
1926                                         'Bandwidth': bandwidth,
1927                                     },
1928                                     'duration': segment_duration,
1929                                 } for segment_number in range(
1930                                     representation_ms_info['start_number'],
1931                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1932                             else:
1933                                 # $Number*$ or $Time$ in media template with S list available
1934                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1935                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1936                                 representation_ms_info['fragments'] = []
1937                                 segment_time = 0
1938                                 segment_d = None
1939                                 segment_number = representation_ms_info['start_number']
1940
1941                                 def add_segment_url():
1942                                     segment_url = media_template % {
1943                                         'Time': segment_time,
1944                                         'Bandwidth': bandwidth,
1945                                         'Number': segment_number,
1946                                     }
1947                                     representation_ms_info['fragments'].append({
1948                                         media_location_key: segment_url,
1949                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1950                                     })
1951
1952                                 for num, s in enumerate(representation_ms_info['s']):
1953                                     segment_time = s.get('t') or segment_time
1954                                     segment_d = s['d']
1955                                     add_segment_url()
1956                                     segment_number += 1
1957                                     for r in range(s.get('r', 0)):
1958                                         segment_time += segment_d
1959                                         add_segment_url()
1960                                         segment_number += 1
1961                                     segment_time += segment_d
1962                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1963                             # No media template
1964                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1965                             # or any YouTube dashsegments video
1966                             fragments = []
1967                             segment_index = 0
1968                             timescale = representation_ms_info['timescale']
1969                             for s in representation_ms_info['s']:
1970                                 duration = float_or_none(s['d'], timescale)
1971                                 for r in range(s.get('r', 0) + 1):
1972                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
1973                                     fragments.append({
1974                                         location_key(segment_uri): segment_uri,
1975                                         'duration': duration,
1976                                     })
1977                                     segment_index += 1
1978                             representation_ms_info['fragments'] = fragments
1979                         elif 'segment_urls' in representation_ms_info:
1980                             # Segment URLs with no SegmentTimeline
1981                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
1982                             # https://github.com/rg3/youtube-dl/pull/14844
1983                             fragments = []
1984                             segment_duration = float_or_none(
1985                                 representation_ms_info['segment_duration'],
1986                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
1987                             for segment_url in representation_ms_info['segment_urls']:
1988                                 fragment = {
1989                                     location_key(segment_url): segment_url,
1990                                 }
1991                                 if segment_duration:
1992                                     fragment['duration'] = segment_duration
1993                                 fragments.append(fragment)
1994                             representation_ms_info['fragments'] = fragments
1995                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1996                         # No fragments key is present in this case.
1997                         if 'fragments' in representation_ms_info:
1998                             f.update({
1999                                 'fragment_base_url': base_url,
2000                                 'fragments': [],
2001                                 'protocol': 'http_dash_segments',
2002                             })
2003                             if 'initialization_url' in representation_ms_info:
2004                                 initialization_url = representation_ms_info['initialization_url']
2005                                 if not f.get('url'):
2006                                     f['url'] = initialization_url
2007                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2008                             f['fragments'].extend(representation_ms_info['fragments'])
2009                         try:
2010                             existing_format = next(
2011                                 fo for fo in formats
2012                                 if fo['format_id'] == representation_id)
2013                         except StopIteration:
2014                             full_info = formats_dict.get(representation_id, {}).copy()
2015                             full_info.update(f)
2016                             formats.append(full_info)
2017                         else:
2018                             existing_format.update(f)
2019                     else:
2020                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2021         return formats
2022
2023     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2024         res = self._download_webpage_handle(
2025             ism_url, video_id,
2026             note=note or 'Downloading ISM manifest',
2027             errnote=errnote or 'Failed to download ISM manifest',
2028             fatal=fatal)
2029         if res is False:
2030             return []
2031         ism, urlh = res
2032
2033         return self._parse_ism_formats(
2034             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
2035
def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
    """
    Parse formats from ISM manifest.

    ism_doc is the root element of the parsed manifest, ism_url is the
    manifest URL (fragment URL patterns are joined against it) and
    ism_id, when given, becomes the first component of every format_id.

    Returns a list of format dicts, one per supported QualityLevel of
    each video/audio StreamIndex.  An empty list is returned when the
    manifest is flagged as live or contains a Protection element.

    Raises KeyError/ValueError when mandatory attributes (Duration, Url,
    Bitrate) are missing or not numeric.

    References:
     1. [MS-SSTR]: Smooth Streaming Protocol,
        https://msdn.microsoft.com/en-us/library/ff469518.aspx
    """
    if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
        return []

    duration = int(ism_doc.attrib['Duration'])
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

    formats = []
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio'):
            continue
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        # The fragment timeline is a property of the stream and is shared
        # by all of its quality levels, so look it up only once
        stream_fragments = stream.findall('c')
        for track in stream.findall('QualityLevel'):
            fourcc = track.get('FourCC')
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            # [1] does not mention Width and Height attributes. However,
            # they're often present while MaxWidth and MaxHeight are
            # missing, so should be used as fallbacks
            width = int_or_none(track.get('MaxWidth') or track.get('Width'))
            height = int_or_none(track.get('MaxHeight') or track.get('Height'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # No explicit duration: derive it from the start time of
                    # the next fragment in the list (or from the total
                    # duration for the last one).  The list itself must be
                    # indexed here, not the current <c> element.
                    try:
                        next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
                    except IndexError:
                        next_fragment_time = duration
                    # Integer division keeps times in whole ticks, so that
                    # the start time substituted into fragment URLs never
                    # turns into a float such as '123.0'
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) // fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                        # float() prevents truncation to whole seconds
                        # under Python 2 integer division
                        'duration': float(fragment_ctx['duration']) / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            format_id = []
            if ism_id:
                format_id.append(ism_id)
            if stream_name:
                format_id.append(stream_name)
            format_id.append(compat_str(tbr))

            formats.append({
                'format_id': '-'.join(format_id),
                'url': ism_url,
                'manifest_url': ism_url,
                'ext': 'ismv' if stream_type == 'video' else 'isma',
                'width': width,
                'height': height,
                'tbr': tbr,
                'asr': sampling_rate,
                'vcodec': 'none' if stream_type == 'audio' else fourcc,
                'acodec': 'none' if stream_type == 'video' else fourcc,
                'protocol': 'ism',
                'fragments': fragments,
                '_download_params': {
                    'duration': duration,
                    'timescale': stream_timescale,
                    'width': width or 0,
                    'height': height or 0,
                    'fourcc': fourcc,
                    'codec_private_data': track.get('CodecPrivateData'),
                    'sampling_rate': sampling_rate,
                    'channels': int_or_none(track.get('Channels', 2)),
                    'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                    'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                },
            })
    return formats
2130
2131     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2132         def absolute_url(video_url):
2133             return compat_urlparse.urljoin(base_url, video_url)
2134
2135         def parse_content_type(content_type):
2136             if not content_type:
2137                 return {}
2138             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2139             if ctr:
2140                 mimetype, codecs = ctr.groups()
2141                 f = parse_codecs(codecs)
2142                 f['ext'] = mimetype2ext(mimetype)
2143                 return f
2144             return {}
2145
2146         def _media_formats(src, cur_media_type, type_info={}):
2147             full_url = absolute_url(src)
2148             ext = type_info.get('ext') or determine_ext(full_url)
2149             if ext == 'm3u8':
2150                 is_plain_url = False
2151                 formats = self._extract_m3u8_formats(
2152                     full_url, video_id, ext='mp4',
2153                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2154                     preference=preference, fatal=False)
2155             elif ext == 'mpd':
2156                 is_plain_url = False
2157                 formats = self._extract_mpd_formats(
2158                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2159             else:
2160                 is_plain_url = True
2161                 formats = [{
2162                     'url': full_url,
2163                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2164                 }]
2165             return is_plain_url, formats
2166
2167         entries = []
2168         # amp-video and amp-audio are very similar to their HTML5 counterparts
2169         # so we wll include them right here (see
2170         # https://www.ampproject.org/docs/reference/components/amp-video)
2171         media_tags = [(media_tag, media_type, '')
2172                       for media_tag, media_type
2173                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2174         media_tags.extend(re.findall(
2175             # We only allow video|audio followed by a whitespace or '>'.
2176             # Allowing more characters may end up in significant slow down (see
2177             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2178             # http://www.porntrex.com/maps/videositemap.xml).
2179             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2180         for media_tag, media_type, media_content in media_tags:
2181             media_info = {
2182                 'formats': [],
2183                 'subtitles': {},
2184             }
2185             media_attributes = extract_attributes(media_tag)
2186             src = media_attributes.get('src')
2187             if src:
2188                 _, formats = _media_formats(src, media_type)
2189                 media_info['formats'].extend(formats)
2190             media_info['thumbnail'] = media_attributes.get('poster')
2191             if media_content:
2192                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2193                     source_attributes = extract_attributes(source_tag)
2194                     src = source_attributes.get('src')
2195                     if not src:
2196                         continue
2197                     f = parse_content_type(source_attributes.get('type'))
2198                     is_plain_url, formats = _media_formats(src, media_type, f)
2199                     if is_plain_url:
2200                         # res attribute is not standard but seen several times
2201                         # in the wild
2202                         f.update({
2203                             'height': int_or_none(source_attributes.get('res')),
2204                             'format_id': source_attributes.get('label'),
2205                         })
2206                         f.update(formats[0])
2207                         media_info['formats'].append(f)
2208                     else:
2209                         media_info['formats'].extend(formats)
2210                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2211                     track_attributes = extract_attributes(track_tag)
2212                     kind = track_attributes.get('kind')
2213                     if not kind or kind in ('subtitles', 'captions'):
2214                         src = track_attributes.get('src')
2215                         if not src:
2216                             continue
2217                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2218                         media_info['subtitles'].setdefault(lang, []).append({
2219                             'url': absolute_url(src),
2220                         })
2221             if media_info['formats'] or media_info['subtitles']:
2222                 entries.append(media_info)
2223         return entries
2224
2225     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2226         formats = []
2227         hdcore_sign = 'hdcore=3.7.0'
2228         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2229         hds_host = hosts.get('hds')
2230         if hds_host:
2231             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2232         if 'hdcore=' not in f4m_url:
2233             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2234         f4m_formats = self._extract_f4m_formats(
2235             f4m_url, video_id, f4m_id='hds', fatal=False)
2236         for entry in f4m_formats:
2237             entry.update({'extra_param_to_segment_url': hdcore_sign})
2238         formats.extend(f4m_formats)
2239         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2240         hls_host = hosts.get('hls')
2241         if hls_host:
2242             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2243         formats.extend(self._extract_m3u8_formats(
2244             m3u8_url, video_id, 'mp4', 'm3u8_native',
2245             m3u8_id='hls', fatal=False))
2246         return formats
2247
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
    """
    Build the list of formats available for a Wowza stream URL.

    Any trailing manifest/playlist/jwplayer file name is stripped from
    url and the remaining scheme-less base is used to derive HLS, HDS and
    DASH manifest URLs (the original query string is appended to each).
    For SMIL based URLs the RTMP formats found in jwplayer.smil are
    returned along with RTSP twins, otherwise plain rtmp/rtsp URLs are
    added.  Protocols named in skip_protocols ('m3u8', 'f4m', 'dash',
    'smil', 'rtmp', 'rtsp') are left out.
    """
    original_query = compat_urlparse.urlparse(url).query
    stripped_url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
    url_base = self._search_regex(
        r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', stripped_url, 'format url')
    http_base_url = '%s:%s' % ('http', url_base)

    def manifest_url(manifest):
        # <http base>/<manifest>[?<original query>]
        location = '%s/%s' % (http_base_url, manifest)
        if not original_query:
            return location
        return location + '?%s' % original_query

    formats = []

    if 'm3u8' not in skip_protocols:
        hls_formats = self._extract_m3u8_formats(
            manifest_url('playlist.m3u8'), video_id, 'mp4',
            m3u8_entry_protocol, m3u8_id='hls', fatal=False)
        formats.extend(hls_formats)
    if 'f4m' not in skip_protocols:
        hds_formats = self._extract_f4m_formats(
            manifest_url('manifest.f4m'),
            video_id, f4m_id='hds', fatal=False)
        formats.extend(hds_formats)
    if 'dash' not in skip_protocols:
        dash_formats = self._extract_mpd_formats(
            manifest_url('manifest.mpd'),
            video_id, mpd_id='dash', fatal=False)
        formats.extend(dash_formats)

    if not re.search(r'(?:/smil:|\.smil)', url_base):
        # Not a SMIL resource: the stream is addressable directly
        for protocol in ('rtmp', 'rtsp'):
            if protocol in skip_protocols:
                continue
            formats.append({
                'url': '%s:%s' % (protocol, url_base),
                'format_id': protocol,
                'protocol': protocol,
            })
        return formats

    if 'smil' in skip_protocols:
        return formats

    rtmp_formats = self._extract_smil_formats(
        manifest_url('jwplayer.smil'),
        video_id, fatal=False)
    for rtmp_format in rtmp_formats:
        # Derive an RTSP twin of each RTMP format: the play path is folded
        # into the URL and the extension is dropped
        rtsp_format = rtmp_format.copy()
        rtsp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
        del rtsp_format['play_path']
        del rtsp_format['ext']
        rtsp_format['url'] = rtsp_url.replace('rtmp://', 'rtsp://')
        rtsp_format['format_id'] = rtmp_format['format_id'].replace('rtmp', 'rtsp')
        rtsp_format['protocol'] = 'rtsp'
        formats.append(rtmp_format)
        formats.append(rtsp_format)
    return formats
2299
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """
    Look for a jwplayer(...).setup(...) call in webpage and return its
    options parsed with _parse_json (after transform_source).

    Returns the options dict, or None when no setup call is found, when
    parsing raises ExtractorError or when the parsed value is not a dict.
    """
    setup_call = re.search(
        r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
        webpage)
    if not setup_call:
        return None

    try:
        options = self._parse_json(
            setup_call.group('options'), video_id=video_id,
            transform_source=transform_source)
    except ExtractorError:
        return None

    return options if isinstance(options, dict) else None
2314
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """
    Find the JWPlayer setup options in webpage (converted with
    js_to_json) and pass them to _parse_jwplayer_data together with
    video_id and any extra positional/keyword arguments.
    """
    setup_options = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    parsed = self._parse_jwplayer_data(
        setup_options, video_id, *args, **kwargs)
    return parsed
2320
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """
    Convert JWPlayer setup options into info dict(s).

    Every playlist item yields one entry whose formats come from
    _parse_jwplayer_formats and whose subtitles come from the item's
    'captions' tracks.  An item whose only format is a YouTube URL is
    returned as a url_transparent entry instead.  The 'title' key is
    mandatory unless require_title is false.

    Returns the entry itself when there is exactly one, otherwise a
    playlist result wrapping all entries.  Note that jwplayer_data and
    its items may be modified in place while being normalized.
    """
    # JWPlayer backward compatibility: flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if 'playlist' not in jwplayer_data:
        jwplayer_data = {'playlist': [jwplayer_data]}

    # JWPlayer backward compatibility: single playlist item
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    playlist = jwplayer_data['playlist']
    if not isinstance(playlist, list):
        playlist = [playlist]
        jwplayer_data['playlist'] = playlist

    entries = []
    for item in playlist:
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        if 'sources' not in item:
            item['sources'] = [item]

        item_id = video_id or item['mediaid']

        item_formats = self._parse_jwplayer_formats(
            item['sources'], video_id=item_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        captions = {}
        track_list = item.get('tracks')
        if isinstance(track_list, list):
            for track in track_list:
                if not isinstance(track, dict) or track.get('kind') != 'captions':
                    continue
                caption_url = urljoin(base_url, track.get('file'))
                if not caption_url:
                    continue
                # Tracks without a label are filed under 'en'
                language = track.get('label') or 'en'
                captions.setdefault(language, []).append({
                    'url': self._proto_relative_url(caption_url)
                })

        raw_title = item['title'] if require_title else item.get('title')
        entry = {
            'id': item_id,
            'title': unescapeHTML(raw_title),
            'description': item.get('description'),
            'thumbnail': self._proto_relative_url(item.get('image')),
            'timestamp': int_or_none(item.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or item.get('duration')),
            'subtitles': captions,
        }

        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        is_youtube = len(item_formats) == 1 and re.search(
            r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', item_formats[0]['url'])
        if is_youtube:
            entry['_type'] = 'url_transparent'
            entry['url'] = item_formats[0]['url']
        else:
            self._sort_formats(item_formats)
            entry['formats'] = item_formats
        entries.append(entry)

    if len(entries) != 1:
        return self.playlist_result(entries)
    return entries[0]
2385
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """
    Convert a list of JWPlayer sources into a list of format dicts.

    Sources that are not dicts, have no 'file' or repeat an already seen
    URL are skipped.  HLS, DASH and SMIL sources are expanded with the
    respective manifest extractors (non fatal); audio sources become
    audio-only formats; everything else becomes a regular format, with
    RTMP URLs split into url/play_path and updated with rtmp_params.
    """
    audio_extensions = ('oga', 'aac', 'mp3', 'mpeg', 'vorbis')
    seen_urls = set()
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = self._proto_relative_url(source.get('file'))
        if not source_url:
            continue
        if base_url:
            source_url = compat_urlparse.urljoin(base_url, source_url)
        if source_url in seen_urls:
            continue
        seen_urls.add(source_url)

        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)

        if source_type == 'hls' or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
            continue
        if ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
            continue
        if ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
            continue
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        if source_type.startswith('audio') or ext in audio_extensions:
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
            continue

        height = int_or_none(source.get('height'))
        if height is None:
            # Often no height is provided but there is a label in
            # format like "1080p", "720p SD", or 1080.
            label = compat_str(source.get('label') or '')
            height = int_or_none(self._search_regex(
                r'^(\d{3,4})[pP]?(?:\b|$)', label,
                'height', default=None))
        video_format = {
            'url': source_url,
            'width': int_or_none(source.get('width')),
            'height': height,
            'tbr': int_or_none(source.get('bitrate')),
            'ext': ext,
        }
        if source_url.startswith('rtmp'):
            video_format['ext'] = 'flv'
            # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
            # of jwplayer.flash.swf
            pieces = re.split(r'((?:mp4|mp3|flv):)', source_url, 1)
            if len(pieces) == 3:
                video_format['url'] = pieces[0]
                video_format['play_path'] = pieces[1] + pieces[2]
            if rtmp_params:
                video_format.update(rtmp_params)
        formats.append(video_format)
    return formats
2452
2453     def _live_title(self, name):
2454         """ Generate the title for a live video """
2455         now = datetime.datetime.now()
2456         now_str = now.strftime('%Y-%m-%d %H:%M')
2457         return name + ' ' + now_str
2458
2459     def _int(self, v, name, fatal=False, **kwargs):
2460         res = int_or_none(v, **kwargs)
2461         if 'get_attr' in kwargs:
2462             print(getattr(v, kwargs['get_attr']))
2463         if res is None:
2464             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2465             if fatal:
2466                 raise ExtractorError(msg)
2467             else:
2468                 self._downloader.report_warning(msg)
2469         return res
2470
2471     def _float(self, v, name, fatal=False, **kwargs):
2472         res = float_or_none(v, **kwargs)
2473         if res is None:
2474             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2475             if fatal:
2476                 raise ExtractorError(msg)
2477             else:
2478                 self._downloader.report_warning(msg)
2479         return res
2480
2481     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2482                     path='/', secure=False, discard=False, rest={}, **kwargs):
2483         cookie = compat_cookiejar.Cookie(
2484             0, name, value, port, port is not None, domain, True,
2485             domain.startswith('.'), path, True, secure, expire_time,
2486             discard, None, None, rest)
2487         self._downloader.cookiejar.set_cookie(cookie)
2488
2489     def _get_cookies(self, url):
2490         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2491         req = sanitized_Request(url)
2492         self._downloader.cookiejar.add_cookie_header(req)
2493         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2494
2495     def get_testcases(self, include_onlymatching=False):
2496         t = getattr(self, '_TEST', None)
2497         if t:
2498             assert not hasattr(self, '_TESTS'), \
2499                 '%s has _TEST and _TESTS' % type(self).__name__
2500             tests = [t]
2501         else:
2502             tests = getattr(self, '_TESTS', [])
2503         for t in tests:
2504             if not include_onlymatching and t.get('only_matching', False):
2505                 continue
2506             t['name'] = type(self).__name__[:-len('IE')]
2507             yield t
2508
2509     def is_suitable(self, age_limit):
2510         """ Test whether the extractor is generally suitable for the given
2511         age limit (i.e. pornographic sites are not, all others usually are) """
2512
2513         any_restricted = False
2514         for tc in self.get_testcases(include_onlymatching=False):
2515             if tc.get('playlist', []):
2516                 tc = tc['playlist'][0]
2517             is_restricted = age_restricted(
2518                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2519             if not is_restricted:
2520                 return True
2521             any_restricted = any_restricted or is_restricted
2522         return not any_restricted
2523
2524     def extract_subtitles(self, *args, **kwargs):
2525         if (self._downloader.params.get('writesubtitles', False) or
2526                 self._downloader.params.get('listsubtitles')):
2527             return self._get_subtitles(*args, **kwargs)
2528         return {}
2529
2530     def _get_subtitles(self, *args, **kwargs):
2531         raise NotImplementedError('This method must be implemented by subclasses')
2532
2533     @staticmethod
2534     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2535         """ Merge subtitle items for one language. Items with duplicated URLs
2536         will be dropped. """
2537         list1_urls = set([item['url'] for item in subtitle_list1])
2538         ret = list(subtitle_list1)
2539         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2540         return ret
2541
2542     @classmethod
2543     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2544         """ Merge two subtitle dictionaries, language by language. """
2545         ret = dict(subtitle_dict1)
2546         for lang in subtitle_dict2:
2547             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2548         return ret
2549
2550     def extract_automatic_captions(self, *args, **kwargs):
2551         if (self._downloader.params.get('writeautomaticsub', False) or
2552                 self._downloader.params.get('listsubtitles')):
2553             return self._get_automatic_captions(*args, **kwargs)
2554         return {}
2555
2556     def _get_automatic_captions(self, *args, **kwargs):
2557         raise NotImplementedError('This method must be implemented by subclasses')
2558
2559     def mark_watched(self, *args, **kwargs):
2560         if (self._downloader.params.get('mark_watched', False) and
2561                 (self._get_login_info()[0] is not None or
2562                     self._downloader.params.get('cookiefile') is not None)):
2563             self._mark_watched(*args, **kwargs)
2564
2565     def _mark_watched(self, *args, **kwargs):
2566         raise NotImplementedError('This method must be implemented by subclasses')
2567
2568     def geo_verification_headers(self):
2569         headers = {}
2570         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2571         if geo_verification_proxy:
2572             headers['Ytdl-request-proxy'] = geo_verification_proxy
2573         return headers
2574
2575     def _generic_id(self, url):
2576         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2577
2578     def _generic_title(self, url):
2579         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2580
2581
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.

    Accepted queries have the form <_SEARCH_KEY><prefix>:<query> where
    prefix is either empty (fetch a single result), a positive integer
    (fetch that many results) or "all" (fetch _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regular expression matching this extractor's queries"""
        pattern_template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern_template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results

        Raises ExtractorError when the query does not match the expected
        format. Requests exceeding _MAX_RESULTS are capped with a warning.
        """
        matched = re.match(self._make_valid_url(), query)
        if matched is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = matched.group('prefix', 'query')

        # No prefix: only the first result is wanted
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY"""
        return self._SEARCH_KEY