1 from __future__ import unicode_literals
16 from ..compat import (
19 compat_etree_fromstring,
25 compat_urllib_parse_unquote,
26 compat_urllib_parse_urlencode,
27 compat_urllib_request,
30 from ..downloader.f4m import remove_encrypted_media
63 parse_m3u8_attributes,
70 class InfoExtractor(object):
71 """Information Extractor class.
73 Information extractors are the classes that, given a URL, extract
74 information about the video (or videos) the URL refers to. This
75 information includes the real video URL, the video title, author and
76 others. The information is stored in a dictionary which is then
77 passed to the YoutubeDL. The YoutubeDL processes this
78 information possibly downloading the video to the file system, among
79 other possible outcomes.
81 The type field determines the type of the result.
82 By far the most common value (and the default if _type is missing) is
83 "video", which indicates a single video.
85 For a video, the dictionaries must include the following fields:
88 title: Video title, unescaped.
90 Additionally, it must contain either a formats entry or a url one:
92 formats: A list of dictionaries for each format available, ordered
93 from worst to best quality.
96 * url Mandatory. The URL of the video file
98 The URL of the manifest file in case of
99 fragmented media (DASH, hls, hds)
100 * ext Will be calculated from URL if missing
101 * format A human-readable description of the format
102 ("mp4 container with h264/opus").
103 Calculated from the format_id, width, height.
104 and format_note fields if missing.
105 * format_id A short description of the format
106 ("mp4_h264_opus" or "19").
107 Technically optional, but strongly recommended.
108 * format_note Additional info about the format
109 ("3D" or "DASH video")
110 * width Width of the video, if known
111 * height Height of the video, if known
112 * resolution Textual description of width and height
113 * tbr Average bitrate of audio and video in KBit/s
114 * abr Average audio bitrate in KBit/s
115 * acodec Name of the audio codec in use
116 * asr Audio sampling rate in Hertz
117 * vbr Average video bitrate in KBit/s
119 * vcodec Name of the video codec in use
120 * container Name of the container format
121 * filesize The number of bytes, if known in advance
122 * filesize_approx An estimate for the number of bytes
123 * player_url SWF Player URL (used for rtmpdump).
124 * protocol The protocol that will be used for the actual
125 download, lower-case.
126 "http", "https", "rtsp", "rtmp", "rtmpe",
127 "m3u8", "m3u8_native" or "http_dash_segments".
129 Base URL for fragments. Each fragment's path
130 value (if present) will be relative to
132 * fragments A list of fragments of a fragmented media.
133 Each fragment entry must contain either an url
134 or a path. If an url is present it should be
135 considered by a client. Otherwise both path and
136 fragment_base_url must be present. Here is
137 the list of all potential fields:
138 * "url" - fragment's URL
139 * "path" - fragment's path relative to
141 * "duration" (optional, int or float)
142 * "filesize" (optional, int)
143 * preference Order number of this format. If this field is
144 present and not None, the formats get sorted
145 by this field, regardless of all other values.
146 -1 for default (order by other properties),
147 -2 or smaller for less than default.
148 < -1000 to hide the format (if there is
149 another one which is strictly better)
150 * language Language code, e.g. "de" or "en-US".
151 * language_preference Is this in the language mentioned in
153 10 if it's what the URL is about,
154 -1 for default (don't know),
155 -10 otherwise, other values reserved for now.
156 * quality Order number of the video quality of this
157 format, irrespective of the file format.
158 -1 for default (order by other properties),
159 -2 or smaller for less than default.
160 * source_preference Order number for this video source
161 (quality takes higher priority)
162 -1 for default (order by other properties),
163 -2 or smaller for less than default.
164 * http_headers A dictionary of additional HTTP headers
165 to add to the request.
166 * stretched_ratio If given and not 1, indicates that the
167 video's pixels are not square.
168 width : height ratio as float.
169 * no_resume The server does not support resuming the
170 (HTTP or RTMP) download. Boolean.
172 url: Final video URL.
173 ext: Video filename extension.
174 format: The video format, defaults to ext (used for --get-format)
175 player_url: SWF Player URL (used for rtmpdump).
177 The following fields are optional:
179 alt_title: A secondary title of the video.
180 display_id An alternative identifier for the video, not necessarily
181 unique, but available before title. Typically, id is
182 something like "4234987", title "Dancing naked mole rats",
183 and display_id "dancing-naked-mole-rats"
184 thumbnails: A list of dictionaries, with the following entries:
185 * "id" (optional, string) - Thumbnail format ID
187 * "preference" (optional, int) - quality of the image
188 * "width" (optional, int)
189 * "height" (optional, int)
190                        * "resolution" (optional, string "{width}x{height}",
192 * "filesize" (optional, int)
193 thumbnail: Full URL to a video thumbnail image.
194 description: Full video description.
195 uploader: Full name of the video uploader.
196 license: License name the video is licensed under.
197 creator: The creator of the video.
198 release_date: The date (YYYYMMDD) when the video was released.
199 timestamp: UNIX timestamp of the moment the video became available.
200 upload_date: Video upload date (YYYYMMDD).
201 If not explicitly set, calculated from timestamp.
202 uploader_id: Nickname or id of the video uploader.
203 uploader_url: Full URL to a personal webpage of the video uploader.
204 location: Physical location where the video was filmed.
205 subtitles: The available subtitles as a dictionary in the format
206 {tag: subformats}. "tag" is usually a language code, and
207 "subformats" is a list sorted from lower to higher
208 preference, each element is a dictionary with the "ext"
210 * "data": The subtitles file contents
211 * "url": A URL pointing to the subtitles file
212 "ext" will be calculated from URL if missing
213 automatic_captions: Like 'subtitles', used by the YoutubeIE for
214 automatically generated captions
215 duration: Length of the video in seconds, as an integer or float.
216 view_count: How many users have watched the video on the platform.
217 like_count: Number of positive ratings of the video
218 dislike_count: Number of negative ratings of the video
219 repost_count: Number of reposts of the video
220     average_rating: Average rating given by users, the scale used depends on the webpage
221 comment_count: Number of comments on the video
222 comments: A list of comments, each with one or more of the following
223 properties (all but one of text or html optional):
224 * "author" - human-readable name of the comment author
225 * "author_id" - user ID of the comment author
227 * "html" - Comment as HTML
228 * "text" - Plain text of the comment
229 * "timestamp" - UNIX timestamp of comment
230 * "parent" - ID of the comment this one is replying to.
231 Set to "root" to indicate that this is a
232 comment to the original video.
233 age_limit: Age restriction for the video, as an integer (years)
234 webpage_url: The URL to the video webpage, if given to youtube-dl it
235 should allow to get the same result again. (It will be set
236 by YoutubeDL if it's missing)
237 categories: A list of categories that the video falls in, for example
239 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
240 is_live: True, False, or None (=unknown). Whether this video is a
241 live stream that goes on instead of a fixed-length video.
242 start_time: Time in seconds where the reproduction should start, as
243 specified in the URL.
244 end_time: Time in seconds where the reproduction should end, as
245 specified in the URL.
247 The following fields should only be used when the video belongs to some logical
250 chapter: Name or title of the chapter the video belongs to.
251 chapter_number: Number of the chapter the video belongs to, as an integer.
252 chapter_id: Id of the chapter the video belongs to, as a unicode string.
254 The following fields should only be used when the video is an episode of some
255 series, programme or podcast:
257 series: Title of the series or programme the video episode belongs to.
258 season: Title of the season the video episode belongs to.
259 season_number: Number of the season the video episode belongs to, as an integer.
260 season_id: Id of the season the video episode belongs to, as a unicode string.
261 episode: Title of the video episode. Unlike mandatory video title field,
262 this field should denote the exact title of the video episode
263 without any kind of decoration.
264 episode_number: Number of the video episode within a season, as an integer.
265 episode_id: Id of the video episode, as a unicode string.
267 The following fields should only be used when the media is a track or a part of
270 track: Title of the track.
271 track_number: Number of the track within an album or a disc, as an integer.
272 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
274 artist: Artist(s) of the track.
275 genre: Genre(s) of the track.
276 album: Title of the album the track belongs to.
277 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
278 album_artist: List of all artists appeared on the album (e.g.
279 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
281 disc_number: Number of the disc or other physical medium the track belongs to,
283 release_year: Year (YYYY) when the album was released.
285 Unless mentioned otherwise, the fields should be Unicode strings.
287 Unless mentioned otherwise, None is equivalent to absence of information.
290 _type "playlist" indicates multiple videos.
291 There must be a key "entries", which is a list, an iterable, or a PagedList
292 object, each element of which is a valid dictionary by this specification.
294 Additionally, playlists can have "title", "description" and "id" attributes
295 with the same semantics as videos (see above).
298 _type "multi_video" indicates that there are multiple videos that
299     form a single show, for example multiple acts of an opera or TV episode.
300 It must have an entries key like a playlist and contain all the keys
301 required for a video at the same time.
304 _type "url" indicates that the video must be extracted from another
305 location, possibly by a different extractor. Its only required key is:
306 "url" - the next URL to extract.
307 The key "ie_key" can be set to the class name (minus the trailing "IE",
308 e.g. "Youtube") if the extractor class is known in advance.
309 Additionally, the dictionary may have any properties of the resolved entity
310 known in advance, for example "title" if the title of the referred video is
314 _type "url_transparent" entities have the same specification as "url", but
315 indicate that the given additional information is more precise than the one
316 associated with the resolved URL.
317 This is useful when a site employs a video service that hosts the video and
318 its technical metadata, but that video service does not embed a useful
319 title, description etc.
322 Subclasses of this one should re-define the _real_initialize() and
323 _real_extract() methods and define a _VALID_URL regexp.
324 Probably, they should also be added to the list of extractors.
326 _BYPASS_GEO attribute may be set to False in order to disable
327 geo restriction bypass mechanisms for a particular extractor.
328 Though it won't disable explicit geo restriction bypass based on
329 country code provided with geo_bypass_country.
331 Finally, the _WORKING attribute should be set to False for broken IEs
332 in order to warn the users and skip the tests.
337 _x_forwarded_for_ip = None
341 def __init__(self, downloader=None):
342 """Constructor. Receives an optional downloader."""
344 self._x_forwarded_for_ip = None
345 self.set_downloader(downloader)
348 def suitable(cls, url):
349 """Receives a URL and returns True if suitable for this IE."""
351 # This does not use has/getattr intentionally - we want to know whether
352 # we have cached the regexp for *this* class, whereas getattr would also
353 # match the superclass
354 if '_VALID_URL_RE' not in cls.__dict__:
355 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
356 return cls._VALID_URL_RE.match(url) is not None
359 def _match_id(cls, url):
360 if '_VALID_URL_RE' not in cls.__dict__:
361 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
362 m = cls._VALID_URL_RE.match(url)
368 """Getter method for _WORKING."""
371 def initialize(self):
372 """Initializes an instance (authentication, etc)."""
373 if not self._x_forwarded_for_ip:
374 country_code = self._downloader.params.get('geo_bypass_country', None)
376 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
378 self._real_initialize()
381 def extract(self, url):
382 """Extracts URL information and returns it in list of dicts."""
387 ie_result = self._real_extract(url)
388 if self._x_forwarded_for_ip:
389 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
391 except GeoRestrictedError as e:
392 if (not self._downloader.params.get('geo_bypass_country', None) and
394 self._downloader.params.get('geo_bypass', True) and
395 not self._x_forwarded_for_ip and
397 self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(e.countries))
398 if self._x_forwarded_for_ip:
400 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
403 except ExtractorError:
405 except compat_http_client.IncompleteRead as e:
406 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
407 except (KeyError, StopIteration) as e:
408 raise ExtractorError('An extractor error has occurred.', cause=e)
410 def set_downloader(self, downloader):
411 """Sets the downloader for this IE."""
412 self._downloader = downloader
414 def _real_initialize(self):
415 """Real initialization process. Redefine in subclasses."""
418 def _real_extract(self, url):
419 """Real extraction process. Redefine in subclasses."""
424 """A string for getting the InfoExtractor with get_info_extractor"""
425 return compat_str(cls.__name__[:-2])
429 return compat_str(type(self).__name__[:-2])
431 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
432 """ Returns the response handle """
434 self.report_download_webpage(video_id)
435 elif note is not False:
437 self.to_screen('%s' % (note,))
439 self.to_screen('%s: %s' % (video_id, note))
440 if isinstance(url_or_request, compat_urllib_request.Request):
441 url_or_request = update_Request(
442 url_or_request, data=data, headers=headers, query=query)
445 url_or_request = update_url_query(url_or_request, query)
446 if data is not None or headers:
447 url_or_request = sanitized_Request(url_or_request, data, headers)
449 return self._downloader.urlopen(url_or_request)
450 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
454 errnote = 'Unable to download webpage'
456 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
458 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
460 self._downloader.report_warning(errmsg)
463 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
464 """ Returns a tuple (page content as string, URL handle) """
465 # Strip hashes from the URL (#1038)
466 if isinstance(url_or_request, (compat_str, str)):
467 url_or_request = url_or_request.partition('#')[0]
469 # Some sites check X-Forwarded-For HTTP header in order to figure out
470 # the origin of the client behind proxy. This allows bypassing geo
471 # restriction by faking this header's value to IP that belongs to some
472 # geo unrestricted country. We will do so once we encounter any
473 # geo restriction error.
474 if self._x_forwarded_for_ip:
475 if 'X-Forwarded-For' not in headers:
476 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
478 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
482 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
483 return (content, urlh)
486 def _guess_encoding_from_content(content_type, webpage_bytes):
487 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
489 encoding = m.group(1)
491 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
492 webpage_bytes[:1024])
494 encoding = m.group(1).decode('ascii')
495 elif webpage_bytes.startswith(b'\xff\xfe'):
502 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
503 content_type = urlh.headers.get('Content-Type', '')
504 webpage_bytes = urlh.read()
505 if prefix is not None:
506 webpage_bytes = prefix + webpage_bytes
508 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
509 if self._downloader.params.get('dump_intermediate_pages', False):
511 url = url_or_request.get_full_url()
512 except AttributeError:
514 self.to_screen('Dumping request to ' + url)
515 dump = base64.b64encode(webpage_bytes).decode('ascii')
516 self._downloader.to_screen(dump)
517 if self._downloader.params.get('write_pages', False):
519 url = url_or_request.get_full_url()
520 except AttributeError:
522 basen = '%s_%s' % (video_id, url)
524 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
525 basen = basen[:240 - len(h)] + h
526 raw_filename = basen + '.dump'
527 filename = sanitize_filename(raw_filename, restricted=True)
528 self.to_screen('Saving request to ' + filename)
529 # Working around MAX_PATH limitation on Windows (see
530 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
531 if compat_os_name == 'nt':
532 absfilepath = os.path.abspath(filename)
533 if len(absfilepath) > 259:
534 filename = '\\\\?\\' + absfilepath
535 with open(filename, 'wb') as outf:
536 outf.write(webpage_bytes)
539 content = webpage_bytes.decode(encoding, 'replace')
541 content = webpage_bytes.decode('utf-8', 'replace')
543 if ('<title>Access to this site is blocked</title>' in content and
544 'Websense' in content[:512]):
545 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
546 blocked_iframe = self._html_search_regex(
547 r'<iframe src="([^"]+)"', content,
548 'Websense information URL', default=None)
550 msg += ' Visit %s for more details' % blocked_iframe
551 raise ExtractorError(msg, expected=True)
552 if '<title>The URL you requested has been blocked</title>' in content[:512]:
554 'Access to this webpage has been blocked by Indian censorship. '
555 'Use a VPN or proxy server (with --proxy) to route around it.')
556 block_msg = self._html_search_regex(
557 r'</h1><p>(.*?)</p>',
558 content, 'block message', default=None)
560 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
561 raise ExtractorError(msg, expected=True)
565 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
566 """ Returns the data of the page as a string """
569 while success is False:
571 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
573 except compat_http_client.IncompleteRead as e:
575 if try_count >= tries:
577 self._sleep(timeout, video_id)
584 def _download_xml(self, url_or_request, video_id,
585 note='Downloading XML', errnote='Unable to download XML',
586 transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
587 """Return the xml as an xml.etree.ElementTree.Element"""
588 xml_string = self._download_webpage(
589 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
590 if xml_string is False:
593 xml_string = transform_source(xml_string)
594 return compat_etree_fromstring(xml_string.encode('utf-8'))
596 def _download_json(self, url_or_request, video_id,
597 note='Downloading JSON metadata',
598 errnote='Unable to download JSON metadata',
599 transform_source=None,
600 fatal=True, encoding=None, data=None, headers={}, query={}):
601 json_string = self._download_webpage(
602 url_or_request, video_id, note, errnote, fatal=fatal,
603 encoding=encoding, data=data, headers=headers, query=query)
604 if (not fatal) and json_string is False:
606 return self._parse_json(
607 json_string, video_id, transform_source=transform_source, fatal=fatal)
609 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
611 json_string = transform_source(json_string)
613 return json.loads(json_string)
614 except ValueError as ve:
615 errmsg = '%s: Failed to parse JSON ' % video_id
617 raise ExtractorError(errmsg, cause=ve)
619 self.report_warning(errmsg + str(ve))
621 def report_warning(self, msg, video_id=None):
622 idstr = '' if video_id is None else '%s: ' % video_id
623 self._downloader.report_warning(
624 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
626 def to_screen(self, msg):
627 """Print msg to screen, prefixing it with '[ie_name]'"""
628 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
630 def report_extraction(self, id_or_name):
631 """Report information extraction."""
632 self.to_screen('%s: Extracting information' % id_or_name)
634 def report_download_webpage(self, video_id):
635 """Report webpage download."""
636 self.to_screen('%s: Downloading webpage' % video_id)
638 def report_age_confirmation(self):
639 """Report attempt to confirm age."""
640 self.to_screen('Confirming age')
642 def report_login(self):
643 """Report attempt to log in."""
644 self.to_screen('Logging in')
647 def raise_login_required(msg='This video is only available for registered users'):
648 raise ExtractorError(
649 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
653 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
654 raise GeoRestrictedError(msg, countries=countries)
656 # Methods for following #608
658 def url_result(url, ie=None, video_id=None, video_title=None):
659 """Returns a URL that points to a page that should be processed"""
660 # TODO: ie should be the class used for getting the info
661 video_info = {'_type': 'url',
664 if video_id is not None:
665 video_info['id'] = video_id
666 if video_title is not None:
667 video_info['title'] = video_title
671 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
672 """Returns a playlist"""
673 video_info = {'_type': 'playlist',
676 video_info['id'] = playlist_id
678 video_info['title'] = playlist_title
679 if playlist_description:
680 video_info['description'] = playlist_description
683 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
685 Perform a regex search on the given string, using a single or a list of
686 patterns returning the first matching group.
687 In case of failure return a default value or raise a WARNING or a
688 RegexNotFoundError, depending on fatal, specifying the field name.
690 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
691 mobj = re.search(pattern, string, flags)
694 mobj = re.search(p, string, flags)
698 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
699 _name = '\033[0;34m%s\033[0m' % name
705 # return the first matching group
706 return next(g for g in mobj.groups() if g is not None)
708 return mobj.group(group)
709 elif default is not NO_DEFAULT:
712 raise RegexNotFoundError('Unable to extract %s' % _name)
714 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
717 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
719 Like _search_regex, but strips HTML tags and unescapes entities.
721 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
723 return clean_html(res).strip()
727 def _get_netrc_login_info(self, netrc_machine=None):
730 netrc_machine = netrc_machine or self._NETRC_MACHINE
732 if self._downloader.params.get('usenetrc', False):
734 info = netrc.netrc().authenticators(netrc_machine)
739 raise netrc.NetrcParseError(
740 'No authenticators for %s' % netrc_machine)
741 except (IOError, netrc.NetrcParseError) as err:
742 self._downloader.report_warning(
743 'parsing .netrc: %s' % error_to_compat_str(err))
745 return username, password
747 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
749 Get the login info as (username, password)
750 First look for the manually specified credentials using username_option
751 and password_option as keys in params dictionary. If no such credentials
752 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
754 If there's no info available, return (None, None)
756 if self._downloader is None:
759 downloader_params = self._downloader.params
761 # Attempt to use provided username and password or .netrc data
762 if downloader_params.get(username_option) is not None:
763 username = downloader_params[username_option]
764 password = downloader_params[password_option]
766 username, password = self._get_netrc_login_info(netrc_machine)
768 return username, password
770 def _get_tfa_info(self, note='two-factor verification code'):
772 Get the two-factor authentication info
773 TODO - asking the user will be required for sms/phone verify
774 currently just uses the command line option
775 If there's no info available, return None
777 if self._downloader is None:
779 downloader_params = self._downloader.params
781 if downloader_params.get('twofactor') is not None:
782 return downloader_params['twofactor']
784 return compat_getpass('Type %s and press [Return]: ' % note)
786 # Helper functions for extracting OpenGraph info
788 def _og_regexes(prop):
789 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
790 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
791 % {'prop': re.escape(prop)})
792 template = r'<meta[^>]+?%s[^>]+?%s'
794 template % (property_re, content_re),
795 template % (content_re, property_re),
799 def _meta_regex(prop):
800 return r'''(?isx)<meta
801 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
802 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
804 def _og_search_property(self, prop, html, name=None, **kargs):
805 if not isinstance(prop, (list, tuple)):
808 name = 'OpenGraph %s' % prop[0]
811 og_regexes.extend(self._og_regexes(p))
812 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
815 return unescapeHTML(escaped)
817 def _og_search_thumbnail(self, html, **kargs):
818 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
820 def _og_search_description(self, html, **kargs):
821 return self._og_search_property('description', html, fatal=False, **kargs)
823 def _og_search_title(self, html, **kargs):
824 return self._og_search_property('title', html, **kargs)
826 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
827 regexes = self._og_regexes('video') + self._og_regexes('video:url')
829 regexes = self._og_regexes('video:secure_url') + regexes
830 return self._html_search_regex(regexes, html, name, **kargs)
832 def _og_search_url(self, html, **kargs):
833 return self._og_search_property('url', html, **kargs)
835 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
836 if not isinstance(name, (list, tuple)):
838 if display_name is None:
839 display_name = name[0]
840 return self._html_search_regex(
841 [self._meta_regex(n) for n in name],
842 html, display_name, fatal=fatal, group='content', **kwargs)
844 def _dc_search_uploader(self, html):
845 return self._html_search_meta('dc.creator', html, 'uploader')
847 def _rta_search(self, html):
848 # See http://www.rtalabel.org/index.php?content=howtofaq#single
849 if re.search(r'(?ix)<meta\s+name="rating"\s+'
850 r' content="RTA-5042-1996-1400-1577-RTA"',
855 def _media_rating_search(self, html):
856 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
857 rating = self._html_search_meta('rating', html)
869 return RATING_TABLE.get(rating.lower())
871 def _family_friendly_search(self, html):
872 # See http://schema.org/VideoObject
873 family_friendly = self._html_search_meta('isFamilyFriendly', html)
875 if not family_friendly:
884 return RATING_TABLE.get(family_friendly.lower())
886 def _twitter_search_player(self, html):
887 return self._html_search_meta('twitter:player', html,
888 'twitter card player')
890 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
891 json_ld = self._search_regex(
892 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
893 html, 'JSON-LD', group='json_ld', **kwargs)
894 default = kwargs.get('default', NO_DEFAULT)
896 return default if default is not NO_DEFAULT else {}
897 # JSON-LD may be malformed and thus `fatal` should be respected.
898 # At the same time `default` may be passed that assumes `fatal=False`
899 # for _search_regex. Let's simulate the same behavior here as well.
900 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
901 return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
903 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
904 if isinstance(json_ld, compat_str):
905 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
909 if not isinstance(json_ld, (list, tuple, dict)):
911 if isinstance(json_ld, dict):
914 if e.get('@context') == 'http://schema.org':
915 item_type = e.get('@type')
916 if expected_type is not None and expected_type != item_type:
918 if item_type == 'TVEpisode':
920 'episode': unescapeHTML(e.get('name')),
921 'episode_number': int_or_none(e.get('episodeNumber')),
922 'description': unescapeHTML(e.get('description')),
924 part_of_season = e.get('partOfSeason')
925 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
926 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
927 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
928 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
929 info['series'] = unescapeHTML(part_of_series.get('name'))
930 elif item_type == 'Article':
932 'timestamp': parse_iso8601(e.get('datePublished')),
933 'title': unescapeHTML(e.get('headline')),
934 'description': unescapeHTML(e.get('articleBody')),
936 elif item_type == 'VideoObject':
938 'url': e.get('contentUrl'),
939 'title': unescapeHTML(e.get('name')),
940 'description': unescapeHTML(e.get('description')),
941 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
942 'duration': parse_duration(e.get('duration')),
943 'timestamp': unified_timestamp(e.get('uploadDate')),
944 'filesize': float_or_none(e.get('contentSize')),
945 'tbr': int_or_none(e.get('bitrate')),
946 'width': int_or_none(e.get('width')),
947 'height': int_or_none(e.get('height')),
950 return dict((k, v) for k, v in info.items() if v is not None)
def _hidden_inputs(html):
    """Return a dict of hidden/submit <input> name -> value pairs found in html.

    HTML comments are stripped first so commented-out inputs are ignored.
    Inputs whose type is not 'hidden' or 'submit' are skipped; the input's
    'id' attribute is used as a fallback key when 'name' is absent.
    """
    # Strip HTML comments (non-greedy, does not cross nested comment openers)
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_el)
        if not input_el:
            continue
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        # value may legitimately be an empty string, so only reject None
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
def _form_hidden_inputs(self, form_id, html):
    """Extract the hidden inputs of the <form> with the given id attribute."""
    form_pattern = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_html = self._search_regex(
        form_pattern, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_html)
974 def _sort_formats(self, formats, field_preference=None):
976 raise ExtractorError('No video formats found')
979 # Automatically determine tbr when missing based on abr and vbr (improves
980 # formats sorting in some cases)
981 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
982 f['tbr'] = f['abr'] + f['vbr']
985 # TODO remove the following workaround
986 from ..utils import determine_ext
987 if not f.get('ext') and 'url' in f:
988 f['ext'] = determine_ext(f['url'])
990 if isinstance(field_preference, (list, tuple)):
993 if f.get(field) is not None
994 else ('' if field == 'format_id' else -1)
995 for field in field_preference)
997 preference = f.get('preference')
998 if preference is None:
1000 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
1003 protocol = f.get('protocol') or determine_protocol(f)
1004 proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1006 if f.get('vcodec') == 'none': # audio only
1008 if self._downloader.params.get('prefer_free_formats'):
1009 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1011 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1014 audio_ext_preference = ORDER.index(f['ext'])
1016 audio_ext_preference = -1
1018 if f.get('acodec') == 'none': # video only
1020 if self._downloader.params.get('prefer_free_formats'):
1021 ORDER = ['flv', 'mp4', 'webm']
1023 ORDER = ['webm', 'flv', 'mp4']
1025 ext_preference = ORDER.index(f['ext'])
1028 audio_ext_preference = 0
1032 f.get('language_preference') if f.get('language_preference') is not None else -1,
1033 f.get('quality') if f.get('quality') is not None else -1,
1034 f.get('tbr') if f.get('tbr') is not None else -1,
1035 f.get('filesize') if f.get('filesize') is not None else -1,
1036 f.get('vbr') if f.get('vbr') is not None else -1,
1037 f.get('height') if f.get('height') is not None else -1,
1038 f.get('width') if f.get('width') is not None else -1,
1041 f.get('abr') if f.get('abr') is not None else -1,
1042 audio_ext_preference,
1043 f.get('fps') if f.get('fps') is not None else -1,
1044 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1045 f.get('source_preference') if f.get('source_preference') is not None else -1,
1046 f.get('format_id') if f.get('format_id') is not None else '',
1048 formats.sort(key=_formats_key)
1050 def _check_formats(self, formats, video_id):
1052 formats[:] = filter(
1053 lambda f: self._is_valid_url(
1055 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1059 def _remove_duplicate_formats(formats):
1063 if f['url'] not in format_urls:
1064 format_urls.add(f['url'])
1065 unique_formats.append(f)
1066 formats[:] = unique_formats
def _is_valid_url(self, url, video_id, item='video', headers={}):
    """Return True if `url` responds to a probe request, False if it is
    clearly invalid (URL-level error), re-raising other extraction errors.

    `headers` is only read and forwarded, never mutated, so the mutable
    default is harmless here.
    """
    url = self._proto_relative_url(url, scheme='http:')
    # For now assume non HTTP(S) URLs always valid
    if not (url.startswith('http://') or url.startswith('https://')):
        return True
    try:
        self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
        return True
    except ExtractorError as e:
        # Only URL-level failures (DNS, connection, HTTP error wrapped in
        # URLError) mean "invalid URL"; anything else is a real error.
        if isinstance(e.cause, compat_urllib_error.URLError):
            self.to_screen(
                '%s: %s URL is invalid, skipping' % (video_id, item))
            return False
        raise
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    return (
        'http:'
        if self._downloader.params.get('prefer_insecure', False)
        else 'https:')
1090 def _proto_relative_url(self, url, scheme=None):
1093 if url.startswith('//'):
1095 scheme = self.http_scheme()
def _sleep(self, timeout, video_id, msg_template=None):
    """Sleep for `timeout` seconds, printing a templated message first.

    `msg_template` may use %(video_id)s and %(timeout)s placeholders.
    """
    if msg_template is None:
        msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
    msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    self.to_screen(msg)
    time.sleep(timeout)
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True, m3u8_id=None):
    """Download an f4m (HDS) manifest and return the formats parsed from it.

    Returns an empty list when the download fails and fatal is False.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
        transform_source=transform_source,
        fatal=fatal)
    if manifest is False:
        return []
    return self._parse_f4m_formats(
        manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
        transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1125 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1126 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1127 fatal=True, m3u8_id=None):
1128 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1129 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1130 if akamai_pv is not None and ';' in akamai_pv.text:
1131 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1132 if playerVerificationChallenge.strip() != '':
1136 manifest_version = '1.0'
1137 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1139 manifest_version = '2.0'
1140 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1141 # Remove unsupported DRM protected media from final formats
1142 # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1143 media_nodes = remove_encrypted_media(media_nodes)
1146 base_url = xpath_text(
1147 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1148 'base URL', default=None)
1150 base_url = base_url.strip()
1152 bootstrap_info = xpath_element(
1153 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1154 'bootstrap info', default=None)
1157 mime_type = xpath_text(
1158 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1159 'base URL', default=None)
1160 if mime_type and mime_type.startswith('audio/'):
1163 for i, media_el in enumerate(media_nodes):
1164 tbr = int_or_none(media_el.attrib.get('bitrate'))
1165 width = int_or_none(media_el.attrib.get('width'))
1166 height = int_or_none(media_el.attrib.get('height'))
1167 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1168 # If <bootstrapInfo> is present, the specified f4m is a
1169 # stream-level manifest, and only set-level manifests may refer to
1170 # external resources. See section 11.4 and section 4 of F4M spec
1171 if bootstrap_info is None:
1173 # @href is introduced in 2.0, see section 11.6 of F4M spec
1174 if manifest_version == '2.0':
1175 media_url = media_el.attrib.get('href')
1176 if media_url is None:
1177 media_url = media_el.attrib.get('url')
1181 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1182 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1183 # If media_url is itself a f4m manifest do the recursive extraction
1184 # since bitrates in parent manifest (this one) and media_url manifest
1185 # may differ leading to inability to resolve the format by requested
1186 # bitrate in f4m downloader
1187 ext = determine_ext(manifest_url)
1189 f4m_formats = self._extract_f4m_formats(
1190 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1191 transform_source=transform_source, fatal=fatal)
1192 # Sometimes stream-level manifest contains single media entry that
1193 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1194 # At the same time parent's media entry in set-level manifest may
1195 # contain it. We will copy it from parent in such cases.
1196 if len(f4m_formats) == 1:
1199 'tbr': f.get('tbr') or tbr,
1200 'width': f.get('width') or width,
1201 'height': f.get('height') or height,
1202 'format_id': f.get('format_id') if not tbr else format_id,
1205 formats.extend(f4m_formats)
1208 formats.extend(self._extract_m3u8_formats(
1209 manifest_url, video_id, 'mp4', preference=preference,
1210 m3u8_id=m3u8_id, fatal=fatal))
1213 'format_id': format_id,
1214 'url': manifest_url,
1215 'manifest_url': manifest_url,
1216 'ext': 'flv' if bootstrap_info is not None else None,
1221 'preference': preference,
1225 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1227 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1231 'preference': preference - 100 if preference else -100,
1232 'resolution': 'multiple',
1233 'format_note': 'Quality selection URL',
1236 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1237 entry_protocol='m3u8', preference=None,
1238 m3u8_id=None, note=None, errnote=None,
1239 fatal=True, live=False):
1241 res = self._download_webpage_handle(
1243 note=note or 'Downloading m3u8 information',
1244 errnote=errnote or 'Failed to download m3u8 information',
1248 m3u8_doc, urlh = res
1249 m3u8_url = urlh.geturl()
1251 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1254 formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1256 format_url = lambda u: (
1258 if re.match(r'^https?://', u)
1259 else compat_urlparse.urljoin(m3u8_url, u))
1261 # We should try extracting formats only from master playlists [1], i.e.
1262 # playlists that describe available qualities. On the other hand media
1263 # playlists [2] should be returned as is since they contain just the media
1264 # without qualities renditions.
1265 # Fortunately, master playlist can be easily distinguished from media
1266 # playlist based on particular tags availability. As of [1, 2] master
1267 # playlist tags MUST NOT appear in a media playist and vice versa.
1268 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1269 # and MUST NOT appear in master playlist thus we can clearly detect media
1270 # playlist with this criterion.
1271 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1272 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1273 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1274 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1277 'format_id': m3u8_id,
1279 'protocol': entry_protocol,
1280 'preference': preference,
1282 audio_in_video_stream = {}
1285 for line in m3u8_doc.splitlines():
1286 if line.startswith('#EXT-X-STREAM-INF:'):
1287 last_info = parse_m3u8_attributes(line)
1288 elif line.startswith('#EXT-X-MEDIA:'):
1289 media = parse_m3u8_attributes(line)
1290 media_type = media.get('TYPE')
1291 if media_type in ('VIDEO', 'AUDIO'):
1292 group_id = media.get('GROUP-ID')
1293 media_url = media.get('URI')
1296 for v in (group_id, media.get('NAME')):
1300 'format_id': '-'.join(format_id),
1301 'url': format_url(media_url),
1302 'language': media.get('LANGUAGE'),
1304 'protocol': entry_protocol,
1305 'preference': preference,
1307 if media_type == 'AUDIO':
1308 f['vcodec'] = 'none'
1309 if group_id and not audio_in_video_stream.get(group_id):
1310 audio_in_video_stream[group_id] = False
1313 # When there is no URI in EXT-X-MEDIA let this tag's
1314 # data be used by regular URI lines below
1316 if media_type == 'AUDIO' and group_id:
1317 audio_in_video_stream[group_id] = True
1318 elif line.startswith('#') or not line.strip():
1321 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1324 format_id.append(m3u8_id)
1325 # Despite specification does not mention NAME attribute for
1326 # EXT-X-STREAM-INF it still sometimes may be present
1327 stream_name = last_info.get('NAME') or last_media.get('NAME')
1328 # Bandwidth of live streams may differ over time thus making
1329 # format_id unpredictable. So it's better to keep provided
1332 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1333 manifest_url = format_url(line.strip())
1335 'format_id': '-'.join(format_id),
1336 'url': manifest_url,
1337 'manifest_url': manifest_url,
1340 'fps': float_or_none(last_info.get('FRAME-RATE')),
1341 'protocol': entry_protocol,
1342 'preference': preference,
1344 resolution = last_info.get('RESOLUTION')
1346 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1348 f['width'] = int(mobj.group('width'))
1349 f['height'] = int(mobj.group('height'))
1350 # Unified Streaming Platform
1352 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1354 abr, vbr = mobj.groups()
1355 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1360 f.update(parse_codecs(last_info.get('CODECS')))
1361 if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
1362 # TODO: update acodec for audio only formats with the same GROUP-ID
1363 f['acodec'] = 'none'
1370 def _xpath_ns(path, namespace=None):
1374 for c in path.split('/'):
1375 if not c or c == '.':
1378 out.append('{%s}%s' % (namespace, c))
1379 return '/'.join(out)
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """Download a SMIL document and return the formats described in it.

    Returns an empty list when the download fails non-fatally.
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
    if smil is False:
        # _download_xml only returns False when fatal is disabled
        assert not fatal
        return []
    namespace = self._parse_smil_namespace(smil)
    return self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download a SMIL document and return the full info dict parsed from it.

    Returns an empty dict when the download fails non-fatally.
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil is False:
        return {}
    return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch a SMIL document and return it parsed as an XML element tree."""
    note = 'Downloading SMIL file'
    errnote = 'Unable to download SMIL file'
    return self._download_xml(
        smil_url, video_id, note, errnote,
        fatal=fatal, transform_source=transform_source)
1404 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1405 namespace = self._parse_smil_namespace(smil)
1407 formats = self._parse_smil_formats(
1408 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1409 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1411 video_id = os.path.splitext(url_basename(smil_url))[0]
1415 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1416 name = meta.attrib.get('name')
1417 content = meta.attrib.get('content')
1418 if not name or not content:
1420 if not title and name == 'title':
1422 elif not description and name in ('description', 'abstract'):
1423 description = content
1424 elif not upload_date and name == 'date':
1425 upload_date = unified_strdate(content)
1428 'id': image.get('type'),
1429 'url': image.get('src'),
1430 'width': int_or_none(image.get('width')),
1431 'height': int_or_none(image.get('height')),
1432 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1436 'title': title or video_id,
1437 'description': description,
1438 'upload_date': upload_date,
1439 'thumbnails': thumbnails,
1441 'subtitles': subtitles,
def _parse_smil_namespace(self, smil):
    # The root tag of a namespaced SMIL document looks like '{uri}smil';
    # pull out the namespace URI, or None if the document is unqualified.
    root_tag = smil.tag
    return self._search_regex(
        r'(?i)^{([^}]+)?}smil$', root_tag, 'namespace', default=None)
1448 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1450 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1451 b = meta.get('base') or meta.get('httpBase')
1462 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1463 for medium in media:
1464 src = medium.get('src')
1465 if not src or src in srcs:
1469 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1470 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1471 width = int_or_none(medium.get('width'))
1472 height = int_or_none(medium.get('height'))
1473 proto = medium.get('proto')
1474 ext = medium.get('ext')
1475 src_ext = determine_ext(src)
1476 streamer = medium.get('streamer') or base
1478 if proto == 'rtmp' or streamer.startswith('rtmp'):
1484 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1486 'filesize': filesize,
1490 if transform_rtmp_url:
1491 streamer, src = transform_rtmp_url(streamer, src)
1492 formats[-1].update({
1498 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1499 src_url = src_url.strip()
1501 if proto == 'm3u8' or src_ext == 'm3u8':
1502 m3u8_formats = self._extract_m3u8_formats(
1503 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1504 if len(m3u8_formats) == 1:
1506 m3u8_formats[0].update({
1507 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1512 formats.extend(m3u8_formats)
1515 if src_ext == 'f4m':
1520 'plugin': 'flowplayer-3.2.0.1',
1522 f4m_url += '&' if '?' in f4m_url else '?'
1523 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1524 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1527 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1531 'ext': ext or src_ext or 'flv',
1532 'format_id': 'http-%d' % (bitrate or http_count),
1534 'filesize': filesize,
1542 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1545 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1546 src = textstream.get('src')
1547 if not src or src in urls:
1550 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1551 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1552 subtitles.setdefault(lang, []).append({
1558 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1559 xspf = self._download_xml(
1560 playlist_url, playlist_id, 'Downloading xpsf playlist',
1561 'Unable to download xspf manifest', fatal=fatal)
1564 return self._parse_xspf(xspf, playlist_id)
1566 def _parse_xspf(self, playlist, playlist_id):
1568 'xspf': 'http://xspf.org/ns/0/',
1569 's1': 'http://static.streamone.nl/player/ns/0',
1573 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1575 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1576 description = xpath_text(
1577 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1578 thumbnail = xpath_text(
1579 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1580 duration = float_or_none(
1581 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1584 'url': location.text,
1585 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1586 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1587 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1588 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1589 self._sort_formats(formats)
1594 'description': description,
1595 'thumbnail': thumbnail,
1596 'duration': duration,
1601 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1602 res = self._download_webpage_handle(
1604 note=note or 'Downloading MPD manifest',
1605 errnote=errnote or 'Failed to download MPD manifest',
1610 mpd_base_url = base_url(urlh.geturl())
1612 return self._parse_mpd_formats(
1613 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1614 formats_dict=formats_dict, mpd_url=mpd_url)
1616 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1618 Parse formats from MPD manifest.
1620 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1621 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1622 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1624 if mpd_doc.get('type') == 'dynamic':
1627 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1630 return self._xpath_ns(path, namespace)
def is_drm_protected(element):
    # An element is considered DRM-protected when it carries a
    # ContentProtection descriptor child.
    content_protection = element.find(_add_ns('ContentProtection'))
    return content_protection is not None
1635 def extract_multisegment_info(element, ms_parent_info):
1636 ms_info = ms_parent_info.copy()
1638 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1639 # common attributes and elements. We will only extract relevant
1641 def extract_common(source):
1642 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1643 if segment_timeline is not None:
1644 s_e = segment_timeline.findall(_add_ns('S'))
1646 ms_info['total_number'] = 0
1649 r = int(s.get('r', 0))
1650 ms_info['total_number'] += 1 + r
1651 ms_info['s'].append({
1652 't': int(s.get('t', 0)),
1653 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1654 'd': int(s.attrib['d']),
1657 start_number = source.get('startNumber')
1659 ms_info['start_number'] = int(start_number)
1660 timescale = source.get('timescale')
1662 ms_info['timescale'] = int(timescale)
1663 segment_duration = source.get('duration')
1664 if segment_duration:
1665 ms_info['segment_duration'] = int(segment_duration)
1667 def extract_Initialization(source):
1668 initialization = source.find(_add_ns('Initialization'))
1669 if initialization is not None:
1670 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1672 segment_list = element.find(_add_ns('SegmentList'))
1673 if segment_list is not None:
1674 extract_common(segment_list)
1675 extract_Initialization(segment_list)
1676 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1678 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1680 segment_template = element.find(_add_ns('SegmentTemplate'))
1681 if segment_template is not None:
1682 extract_common(segment_template)
1683 media = segment_template.get('media')
1685 ms_info['media'] = media
1686 initialization = segment_template.get('initialization')
1688 ms_info['initialization'] = initialization
1690 extract_Initialization(segment_template)
1693 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1695 for period in mpd_doc.findall(_add_ns('Period')):
1696 period_duration = parse_duration(period.get('duration')) or mpd_duration
1697 period_ms_info = extract_multisegment_info(period, {
1701 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1702 if is_drm_protected(adaptation_set):
1704 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1705 for representation in adaptation_set.findall(_add_ns('Representation')):
1706 if is_drm_protected(representation):
1708 representation_attrib = adaptation_set.attrib.copy()
1709 representation_attrib.update(representation.attrib)
1710 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1711 mime_type = representation_attrib['mimeType']
1712 content_type = mime_type.split('/')[0]
1713 if content_type == 'text':
1714 # TODO implement WebVTT downloading
1716 elif content_type == 'video' or content_type == 'audio':
1718 for element in (representation, adaptation_set, period, mpd_doc):
1719 base_url_e = element.find(_add_ns('BaseURL'))
1720 if base_url_e is not None:
1721 base_url = base_url_e.text + base_url
1722 if re.match(r'^https?://', base_url):
1724 if mpd_base_url and not re.match(r'^https?://', base_url):
1725 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1727 base_url = mpd_base_url + base_url
1728 representation_id = representation_attrib.get('id')
1729 lang = representation_attrib.get('lang')
1730 url_el = representation.find(_add_ns('BaseURL'))
1731 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1732 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1734 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1736 'manifest_url': mpd_url,
1737 'ext': mimetype2ext(mime_type),
1738 'width': int_or_none(representation_attrib.get('width')),
1739 'height': int_or_none(representation_attrib.get('height')),
1740 'tbr': int_or_none(bandwidth, 1000),
1741 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1742 'fps': int_or_none(representation_attrib.get('frameRate')),
1743 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1744 'format_note': 'DASH %s' % content_type,
1745 'filesize': filesize,
1747 f.update(parse_codecs(representation_attrib.get('codecs')))
1748 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers):
    """Convert a DASH SegmentTemplate string into a %-format template.

    $RepresentationID$ is substituted immediately; $Number$/$Time$/
    $Bandwidth$ (with optional printf-style width, e.g. $Number%05d$)
    become %(Number)d-style placeholders filled in per segment.
    """
    t = representation_ms_info[template_name]
    t = t.replace('$RepresentationID$', representation_id)
    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
    # BUGFIX: str.replace returns a new string; the original discarded the
    # result, leaving literal '$$' escapes (meaning '$') unexpanded.
    t = t.replace('$$', '$')
    return t
1758 # @initialization is a regular template like @media one
1759 # so it should be handled just the same way (see
1760 # https://github.com/rg3/youtube-dl/issues/11605)
1761 if 'initialization' in representation_ms_info:
1762 initialization_template = prepare_template(
1764 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1765 # $Time$ shall not be included for @initialization thus
1766 # only $Bandwidth$ remains
1768 representation_ms_info['initialization_url'] = initialization_template % {
1769 'Bandwidth': bandwidth,
1772 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1774 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1776 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1777 # can't be used at the same time
1778 if '%(Number' in media_template and 's' not in representation_ms_info:
1779 segment_duration = None
1780 if 'total_number' not in representation_ms_info and 'segment_duration':
1781 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1782 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1783 representation_ms_info['fragments'] = [{
1784 'url': media_template % {
1785 'Number': segment_number,
1786 'Bandwidth': bandwidth,
1788 'duration': segment_duration,
1789 } for segment_number in range(
1790 representation_ms_info['start_number'],
1791 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1793 # $Number*$ or $Time$ in media template with S list available
1794 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1795 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1796 representation_ms_info['fragments'] = []
1799 segment_number = representation_ms_info['start_number']
def add_segment_url():
    """Render the media template for the current segment and append the
    resulting fragment (url + duration in seconds) to the representation's
    fragment list. Reads segment_time/segment_number/segment_d/bandwidth
    from the enclosing scope.
    """
    segment_url = media_template % {
        'Time': segment_time,
        'Bandwidth': bandwidth,
        'Number': segment_number,
    }
    representation_ms_info['fragments'].append({
        'url': segment_url,
        # @d/@t are in timescale units; convert to seconds
        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
    })
1812 for num, s in enumerate(representation_ms_info['s']):
1813 segment_time = s.get('t') or segment_time
1817 for r in range(s.get('r', 0)):
1818 segment_time += segment_d
1821 segment_time += segment_d
1822 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1824 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1825 # or any YouTube dashsegments video
1828 timescale = representation_ms_info['timescale']
1829 for s in representation_ms_info['s']:
1830 duration = float_or_none(s['d'], timescale)
1831 for r in range(s.get('r', 0) + 1):
1833 'url': representation_ms_info['segment_urls'][segment_index],
1834 'duration': duration,
1837 representation_ms_info['fragments'] = fragments
1838 # NB: MPD manifest may contain direct URLs to unfragmented media.
1839 # No fragments key is present in this case.
1840 if 'fragments' in representation_ms_info:
1843 'protocol': 'http_dash_segments',
1845 if 'initialization_url' in representation_ms_info:
1846 initialization_url = representation_ms_info['initialization_url']
1847 if not f.get('url'):
1848 f['url'] = initialization_url
1849 f['fragments'].append({'url': initialization_url})
1850 f['fragments'].extend(representation_ms_info['fragments'])
1851 for fragment in f['fragments']:
1852 fragment['url'] = urljoin(base_url, fragment['url'])
1854 existing_format = next(
1855 fo for fo in formats
1856 if fo['format_id'] == representation_id)
1857 except StopIteration:
1858 full_info = formats_dict.get(representation_id, {}).copy()
1860 formats.append(full_info)
1862 existing_format.update(f)
1864 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1867 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1868 res = self._download_webpage_handle(
1870 note=note or 'Downloading ISM manifest',
1871 errnote=errnote or 'Failed to download ISM manifest',
1877 return self._parse_ism_formats(
1878 compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
1880 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
1881 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1884 duration = int(ism_doc.attrib['Duration'])
1885 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1888 for stream in ism_doc.findall('StreamIndex'):
1889 stream_type = stream.get('Type')
1890 if stream_type not in ('video', 'audio'):
1892 url_pattern = stream.attrib['Url']
1893 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1894 stream_name = stream.get('Name')
1895 for track in stream.findall('QualityLevel'):
1896 fourcc = track.get('FourCC')
1897 # TODO: add support for WVC1 and WMAP
1898 if fourcc not in ('H264', 'AVC1', 'AACL'):
1899 self.report_warning('%s is not a supported codec' % fourcc)
1901 tbr = int(track.attrib['Bitrate']) // 1000
1902 width = int_or_none(track.get('MaxWidth'))
1903 height = int_or_none(track.get('MaxHeight'))
1904 sampling_rate = int_or_none(track.get('SamplingRate'))
1906 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1907 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
1913 stream_fragments = stream.findall('c')
1914 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1915 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1916 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1917 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
1918 if not fragment_ctx['duration']:
1920 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1922 next_fragment_time = duration
1923 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1924 for _ in range(fragment_repeat):
1926 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1927 'duration': fragment_ctx['duration'] / stream_timescale,
1929 fragment_ctx['time'] += fragment_ctx['duration']
1933 format_id.append(ism_id)
1935 format_id.append(stream_name)
1936 format_id.append(compat_str(tbr))
1939 'format_id': '-'.join(format_id),
1941 'manifest_url': ism_url,
1942 'ext': 'ismv' if stream_type == 'video' else 'isma',
1946 'asr': sampling_rate,
1947 'vcodec': 'none' if stream_type == 'audio' else fourcc,
1948 'acodec': 'none' if stream_type == 'video' else fourcc,
1950 'fragments': fragments,
1951 '_download_params': {
1952 'duration': duration,
1953 'timescale': stream_timescale,
1954 'width': width or 0,
1955 'height': height or 0,
1957 'codec_private_data': track.get('CodecPrivateData'),
1958 'sampling_rate': sampling_rate,
1959 'channels': int_or_none(track.get('Channels', 2)),
1960 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
1961 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
        """Extract entries (formats, thumbnail, subtitles) from HTML5
        <video>/<audio> tags found in webpage.

        NOTE(review): several original lines are elided in this view
        (early returns, media_info initialization, branch conditions);
        comments below describe only the code that is visible here.
        """
        def absolute_url(video_url):
            # Resolve a possibly-relative media URL against the page URL.
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Split a MIME "type/subtype;codecs=..." value into format info
            # (codecs via parse_codecs, extension via mimetype2ext).
            if not content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)

        def _media_formats(src, cur_media_type):
            # Turn one src attribute into (is_plain_url, formats); m3u8 and
            # mpd manifests are expanded through the dedicated extractors,
            # in which case is_plain_url is False.
            full_url = absolute_url(src)
            ext = determine_ext(full_url)
            is_plain_url = False
            formats = self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
            is_plain_url = False
            formats = self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id)
            # Plain-file branch: audio tags carry no video codec.
            'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats

        # Self-closing tags first (they have no inner content).
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            # NOTE(review): media_info dict initialization elided in this view.
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            _, formats = _media_formats(src, media_type)
            media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            # <source> children may each carry their own src/type.
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                source_attributes = extract_attributes(source_tag)
                src = source_attributes.get('src')
                is_plain_url, formats = _media_formats(src, media_type)
                # For plain URLs, merge codec info parsed from the type
                # attribute into the single resulting format.
                f = parse_content_type(source_attributes.get('type'))
                f.update(formats[0])
                media_info['formats'].append(f)
                media_info['formats'].extend(formats)
            # <track> children provide subtitles/captions keyed by language.
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                if not kind or kind in ('subtitles', 'captions'):
                    src = track_attributes.get('src')
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
            # Only keep entries that produced something usable.
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
2050 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2052 hdcore_sign = 'hdcore=3.7.0'
2053 f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2054 hds_host = hosts.get('hds')
2056 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2057 if 'hdcore=' not in f4m_url:
2058 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2059 f4m_formats = self._extract_f4m_formats(
2060 f4m_url, video_id, f4m_id='hds', fatal=False)
2061 for entry in f4m_formats:
2062 entry.update({'extra_param_to_segment_url': hdcore_sign})
2063 formats.extend(f4m_formats)
2064 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2065 hls_host = hosts.get('hls')
2067 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2068 formats.extend(self._extract_m3u8_formats(
2069 m3u8_url, video_id, 'mp4', 'm3u8_native',
2070 m3u8_id='hls', fatal=False))
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Probe the standard Wowza manifest endpoints (HLS/HDS/DASH/SMIL,
        plus raw rtmp/rtsp) derived from url, skipping protocols listed in
        skip_protocols.

        NOTE(review): several original lines are elided in this view
        (formats initialization, append bodies, return). The mutable
        default skip_protocols=[] is only read here.
        """
        # Strip a trailing manifest filename so per-protocol ones can be
        # appended below.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        # Keep only the scheme-independent tail; a scheme is re-attached
        # per protocol below.
        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
        http_base_url = 'http' + url_base
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                http_base_url + '/playlist.m3u8', video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                http_base_url + '/manifest.f4m',
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                http_base_url + '/manifest.mpd',
                video_id, mpd_id='dash', fatal=False))
        # SMIL-backed streams additionally expose RTMP; derive a matching
        # RTSP variant from each RTMP format.
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    http_base_url + '/jwplayer.smil',
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    # RTSP uses a single URL; RTMP splits url/play_path.
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
        # Bare rtmp/rtsp endpoints as a last resort.
        for protocol in ('rtmp', 'rtsp'):
            if protocol not in skip_protocols:
                'url': protocol + url_base,
                'format_id': protocol,
                'protocol': protocol,
    def _find_jwplayer_data(webpage):
        """Return the raw options blob of a jwplayer('...').setup(...) call
        found in webpage, or fall through if there is none.

        NOTE(review): the re.search call wrapping this pattern and the
        match guard are elided in this view.
        """
        r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
        return mobj.group('options')
2124 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2125 jwplayer_data = self._parse_json(
2126 self._find_jwplayer_data(webpage), video_id,
2127 transform_source=js_to_json)
2128 return self._parse_jwplayer_data(
2129 jwplayer_data, video_id, *args, **kwargs)
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer setup/config dict into an info dict (single
        item) or a playlist result.

        NOTE(review): a number of original lines are elided in this view
        (branch conditions, dict literals, formats/subtitles/entries
        initialization); comments describe only the visible code.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            for source in video_data['sources']:
                source_url = self._proto_relative_url(source['file'])
                source_url = compat_urlparse.urljoin(base_url, source_url)
                source_type = source.get('type') or ''
                # Prefer the declared MIME type, fall back to URL extension.
                ext = mimetype2ext(source_type) or determine_ext(source_url)
                if source_type == 'hls' or ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
                    formats.extend(self._extract_mpd_formats(
                        source_url, this_video_id, mpd_id=mpd_id, fatal=False))
                # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
                elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                    height = int_or_none(source.get('height'))
                    # Often no height is provided but there is a label in
                    # format like 1080p.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,})[pP]$', source.get('label') or '',
                        'height', default=None))
                        'width': int_or_none(source.get('width')),
                    if source_url.startswith('rtmp'):
                        a_format['ext'] = 'flv'
                        # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                        # of jwplayer.flash.swf
                        rtmp_url_parts = re.split(
                            r'((?:mp4|mp3|flv):)', source_url, 1)
                        if len(rtmp_url_parts) == 3:
                            rtmp_url, prefix, play_path = rtmp_url_parts
                                'play_path': prefix + play_path,
                            a_format.update(rtmp_params)
                    formats.append(a_format)
            self._sort_formats(formats)

            # Caption tracks become subtitles keyed by their label.
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if track.get('kind') != 'captions':
                    track_url = urljoin(base_url, track.get('file'))
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)

                'id': this_video_id,
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,

        # Single entry: return it directly; otherwise wrap as a playlist.
        if len(entries) == 1:
        return self.playlist_result(entries)
2233 def _live_title(self, name):
2234 """ Generate the title for a live video """
2235 now = datetime.datetime.now()
2236 now_str = now.strftime('%Y-%m-%d %H:%M')
2237 return name + ' ' + now_str
2239 def _int(self, v, name, fatal=False, **kwargs):
2240 res = int_or_none(v, **kwargs)
2241 if 'get_attr' in kwargs:
2242 print(getattr(v, kwargs['get_attr']))
2244 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2246 raise ExtractorError(msg)
2248 self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        """Parse v as a float via float_or_none (extra kwargs forwarded).

        NOTE(review): the failure-path control lines (res-is-None check and
        fatal branch) plus the final return are elided in this view; only
        the visible statements are documented.
        """
        res = float_or_none(v, **kwargs)
        # Failure path: either raise or just warn, depending on fatal.
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        raise ExtractorError(msg)
        self._downloader.report_warning(msg)
2261 def _set_cookie(self, domain, name, value, expire_time=None):
2262 cookie = compat_cookiejar.Cookie(
2263 0, name, value, None, None, domain, None,
2264 None, '/', True, False, expire_time, '', None, None, None)
2265 self._downloader.cookiejar.set_cookie(cookie)
2267 def _get_cookies(self, url):
2268 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2269 req = sanitized_Request(url)
2270 self._downloader.cookiejar.add_cookie_header(req)
2271 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def get_testcases(self, include_onlymatching=False):
        """Yield this extractor's test cases (from _TEST or _TESTS), tagging
        each with the extractor name; only_matching tests are skipped unless
        include_onlymatching is set.

        NOTE(review): a few original lines are elided in this view (the
        _TEST wrapping, the loop header and the yield).
        """
        t = getattr(self, '_TEST', None)
        # Defining both _TEST and _TESTS on one extractor is a programming
        # error, hence the assert.
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        tests = getattr(self, '_TESTS', [])
        if not include_onlymatching and t.get('only_matching', False):
        # Derive the test name from the class name without the 'IE' suffix.
        t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """

        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            # Judge playlists by their first entry.
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            # Any unrestricted test case makes the extractor suitable
            # (NOTE(review): the early-return under this branch is elided
            # in this view).
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        return not any_restricted
2302 def extract_subtitles(self, *args, **kwargs):
2303 if (self._downloader.params.get('writesubtitles', False) or
2304 self._downloader.params.get('listsubtitles')):
2305 return self._get_subtitles(*args, **kwargs)
2308 def _get_subtitles(self, *args, **kwargs):
2309 raise NotImplementedError('This method must be implemented by subclasses')
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        # URLs already present in the first list take precedence.
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        # NOTE(review): the final `return ret` is elided in this view.
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        # Shallow copy so neither input dict is mutated.
        ret = dict(subtitle_dict1)
        # NOTE(review): the final `return ret` is elided in this view.
        for lang in subtitle_dict2:
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2328 def extract_automatic_captions(self, *args, **kwargs):
2329 if (self._downloader.params.get('writeautomaticsub', False) or
2330 self._downloader.params.get('listsubtitles')):
2331 return self._get_automatic_captions(*args, **kwargs)
2334 def _get_automatic_captions(self, *args, **kwargs):
2335 raise NotImplementedError('This method must be implemented by subclasses')
2337 def mark_watched(self, *args, **kwargs):
2338 if (self._downloader.params.get('mark_watched', False) and
2339 (self._get_login_info()[0] is not None or
2340 self._downloader.params.get('cookiefile') is not None)):
2341 self._mark_watched(*args, **kwargs)
2343 def _mark_watched(self, *args, **kwargs):
2344 raise NotImplementedError('This method must be implemented by subclasses')
    def geo_verification_headers(self):
        """Headers to use for geo-restriction bypass: sets the configured
        geo verification proxy as Ytdl-request-proxy when present.

        NOTE(review): the headers dict creation and the return line are
        elided in this view.
        """
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
2353 def _generic_id(self, url):
2354 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2356 def _generic_title(self, url):
2357 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
2368 def _make_valid_url(cls):
2369 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2372 def suitable(cls, url):
2373 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        """Dispatch a search query: parse the prefix (empty, 'all' or a
        number n) and delegate to _get_n_results accordingly.

        NOTE(review): several original lines are elided in this view (the
        match guard, the prefix branch headers and the int conversion).
        """
        mobj = re.match(self._make_valid_url(), query)
        # Should not normally happen: suitable() already vetted the URL.
        raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix -> a single result.
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        # Numeric prefix: reject non-positive counts...
        raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # ...and clamp oversized requests to the extractor's maximum.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
2395 def _get_n_results(self, query, n):
2396 """Get a specified number of results for a query"""
2397 raise NotImplementedError('This method must be implemented by subclasses')
    def SEARCH_KEY(self):
        # Read-only accessor for the extractor's search key (the @property
        # decorator line sits just above this view).
        return self._SEARCH_KEY