git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import random
  10 import re
  11 import socket
  12 import sys
  13 import time
  14 import math
  15
  16 from ..compat import (
  17     compat_cookiejar,
  18     compat_cookies,
  19     compat_etree_fromstring,
  20     compat_getpass,
  21     compat_http_client,
  22     compat_os_name,
  23     compat_str,
  24     compat_urllib_error,
  25     compat_urllib_parse_unquote,
  26     compat_urllib_parse_urlencode,
  27     compat_urllib_request,
  28     compat_urlparse,
  29 )
  30 from ..downloader.f4m import remove_encrypted_media
  31 from ..utils import (
  32     NO_DEFAULT,
  33     age_restricted,
  34     base_url,
  35     bug_reports_message,
  36     clean_html,
  37     compiled_regex_type,
  38     determine_ext,
  39     error_to_compat_str,
  40     ExtractorError,
  41     fix_xml_ampersands,
  42     float_or_none,
  43     GeoRestrictedError,
  44     GeoUtils,
  45     int_or_none,
  46     js_to_json,
  47     parse_iso8601,
  48     RegexNotFoundError,
  49     sanitize_filename,
  50     sanitized_Request,
  51     unescapeHTML,
  52     unified_strdate,
  53     unified_timestamp,
  54     url_basename,
  55     xpath_element,
  56     xpath_text,
  57     xpath_with_ns,
  58     determine_protocol,
  59     parse_duration,
  60     mimetype2ext,
  61     update_Request,
  62     update_url_query,
  63     parse_m3u8_attributes,
  64     extract_attributes,
  65     parse_codecs,
  66     urljoin,
  67 )
  68
  69
  70 class InfoExtractor(object):
  71     """Information Extractor class.
  72
  73     Information extractors are the classes that, given a URL, extract
  74     information about the video (or videos) the URL refers to. This
  75     information includes the real video URL, the video title, author and
  76     others. The information is stored in a dictionary which is then
  77     passed to the YoutubeDL. The YoutubeDL processes this
  78     information possibly downloading the video to the file system, among
  79     other possible outcomes.
  80
  81     The type field determines the type of the result.
  82     By far the most common value (and the default if _type is missing) is
  83     "video", which indicates a single video.
  84
  85     For a video, the dictionaries must include the following fields:
  86
  87     id:             Video identifier.
  88     title:          Video title, unescaped.
  89
  90     Additionally, it must contain either a formats entry or a url one:
  91
  92     formats:        A list of dictionaries for each format available, ordered
  93                     from worst to best quality.
  94
  95                     Potential fields:
  96                     * url        Mandatory. The URL of the video file
  97                     * manifest_url
  98                                  The URL of the manifest file in case of
  99                                  fragmented media (DASH, hls, hds)
 100                     * ext        Will be calculated from URL if missing
 101                     * format     A human-readable description of the format
 102                                  ("mp4 container with h264/opus").
 103                                  Calculated from the format_id, width, height.
 104                                  and format_note fields if missing.
 105                     * format_id  A short description of the format
 106                                  ("mp4_h264_opus" or "19").
 107                                 Technically optional, but strongly recommended.
 108                     * format_note Additional info about the format
 109                                  ("3D" or "DASH video")
 110                     * width      Width of the video, if known
 111                     * height     Height of the video, if known
 112                     * resolution Textual description of width and height
 113                     * tbr        Average bitrate of audio and video in KBit/s
 114                     * abr        Average audio bitrate in KBit/s
 115                     * acodec     Name of the audio codec in use
 116                     * asr        Audio sampling rate in Hertz
 117                     * vbr        Average video bitrate in KBit/s
 118                     * fps        Frame rate
 119                     * vcodec     Name of the video codec in use
 120                     * container  Name of the container format
 121                     * filesize   The number of bytes, if known in advance
 122                     * filesize_approx  An estimate for the number of bytes
 123                     * player_url SWF Player URL (used for rtmpdump).
 124                     * protocol   The protocol that will be used for the actual
 125                                  download, lower-case.
 126                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 127                                  "m3u8", "m3u8_native" or "http_dash_segments".
 128                     * fragment_base_url
 129                                  Base URL for fragments. Each fragment's path
 130                                  value (if present) will be relative to
 131                                  this URL.
 132                     * fragments  A list of fragments of a fragmented media.
 133                                  Each fragment entry must contain either an url
 134                                  or a path. If an url is present it should be
 135                                  considered by a client. Otherwise both path and
 136                                  fragment_base_url must be present. Here is
 137                                  the list of all potential fields:
 138                                  * "url" - fragment's URL
 139                                  * "path" - fragment's path relative to
 140                                             fragment_base_url
 141                                  * "duration" (optional, int or float)
 142                                  * "filesize" (optional, int)
 143                     * preference Order number of this format. If this field is
 144                                  present and not None, the formats get sorted
 145                                  by this field, regardless of all other values.
 146                                  -1 for default (order by other properties),
 147                                  -2 or smaller for less than default.
 148                                  < -1000 to hide the format (if there is
 149                                     another one which is strictly better)
 150                     * language   Language code, e.g. "de" or "en-US".
 151                     * language_preference  Is this in the language mentioned in
 152                                  the URL?
 153                                  10 if it's what the URL is about,
 154                                  -1 for default (don't know),
 155                                  -10 otherwise, other values reserved for now.
 156                     * quality    Order number of the video quality of this
 157                                  format, irrespective of the file format.
 158                                  -1 for default (order by other properties),
 159                                  -2 or smaller for less than default.
 160                     * source_preference  Order number for this video source
 161                                   (quality takes higher priority)
 162                                  -1 for default (order by other properties),
 163                                  -2 or smaller for less than default.
 164                     * http_headers  A dictionary of additional HTTP headers
 165                                  to add to the request.
 166                     * stretched_ratio  If given and not 1, indicates that the
 167                                  video's pixels are not square.
 168                                  width : height ratio as float.
 169                     * no_resume  The server does not support resuming the
 170                                  (HTTP or RTMP) download. Boolean.
 171
 172     url:            Final video URL.
 173     ext:            Video filename extension.
 174     format:         The video format, defaults to ext (used for --get-format)
 175     player_url:     SWF Player URL (used for rtmpdump).
 176
 177     The following fields are optional:
 178
 179     alt_title:      A secondary title of the video.
 180     display_id      An alternative identifier for the video, not necessarily
 181                     unique, but available before title. Typically, id is
 182                     something like "4234987", title "Dancing naked mole rats",
 183                     and display_id "dancing-naked-mole-rats"
 184     thumbnails:     A list of dictionaries, with the following entries:
 185                         * "id" (optional, string) - Thumbnail format ID
 186                         * "url"
 187                         * "preference" (optional, int) - quality of the image
 188                         * "width" (optional, int)
 189                         * "height" (optional, int)
 190                         * "resolution" (optional, string "{width}x{height"},
 191                                         deprecated)
 192                         * "filesize" (optional, int)
 193     thumbnail:      Full URL to a video thumbnail image.
 194     description:    Full video description.
 195     uploader:       Full name of the video uploader.
 196     license:        License name the video is licensed under.
 197     creator:        The creator of the video.
 198     release_date:   The date (YYYYMMDD) when the video was released.
 199     timestamp:      UNIX timestamp of the moment the video became available.
 200     upload_date:    Video upload date (YYYYMMDD).
 201                     If not explicitly set, calculated from timestamp.
 202     uploader_id:    Nickname or id of the video uploader.
 203     uploader_url:   Full URL to a personal webpage of the video uploader.
 204     location:       Physical location where the video was filmed.
 205     subtitles:      The available subtitles as a dictionary in the format
 206                     {tag: subformats}. "tag" is usually a language code, and
 207                     "subformats" is a list sorted from lower to higher
 208                     preference, each element is a dictionary with the "ext"
 209                     entry and one of:
 210                         * "data": The subtitles file contents
 211                         * "url": A URL pointing to the subtitles file
 212                     "ext" will be calculated from URL if missing
 213     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 214                     automatically generated captions
 215     duration:       Length of the video in seconds, as an integer or float.
 216     view_count:     How many users have watched the video on the platform.
 217     like_count:     Number of positive ratings of the video
 218     dislike_count:  Number of negative ratings of the video
 219     repost_count:   Number of reposts of the video
 220     average_rating: Average rating give by users, the scale used depends on the webpage
 221     comment_count:  Number of comments on the video
 222     comments:       A list of comments, each with one or more of the following
 223                     properties (all but one of text or html optional):
 224                         * "author" - human-readable name of the comment author
 225                         * "author_id" - user ID of the comment author
 226                         * "id" - Comment ID
 227                         * "html" - Comment as HTML
 228                         * "text" - Plain text of the comment
 229                         * "timestamp" - UNIX timestamp of comment
 230                         * "parent" - ID of the comment this one is replying to.
 231                                      Set to "root" to indicate that this is a
 232                                      comment to the original video.
 233     age_limit:      Age restriction for the video, as an integer (years)
 234     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 235                     should allow to get the same result again. (It will be set
 236                     by YoutubeDL if it's missing)
 237     categories:     A list of categories that the video falls in, for example
 238                     ["Sports", "Berlin"]
 239     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 240     is_live:        True, False, or None (=unknown). Whether this video is a
 241                     live stream that goes on instead of a fixed-length video.
 242     start_time:     Time in seconds where the reproduction should start, as
 243                     specified in the URL.
 244     end_time:       Time in seconds where the reproduction should end, as
 245                     specified in the URL.
 246
 247     The following fields should only be used when the video belongs to some logical
 248     chapter or section:
 249
 250     chapter:        Name or title of the chapter the video belongs to.
 251     chapter_number: Number of the chapter the video belongs to, as an integer.
 252     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 253
 254     The following fields should only be used when the video is an episode of some
 255     series, programme or podcast:
 256
 257     series:         Title of the series or programme the video episode belongs to.
 258     season:         Title of the season the video episode belongs to.
 259     season_number:  Number of the season the video episode belongs to, as an integer.
 260     season_id:      Id of the season the video episode belongs to, as a unicode string.
 261     episode:        Title of the video episode. Unlike mandatory video title field,
 262                     this field should denote the exact title of the video episode
 263                     without any kind of decoration.
 264     episode_number: Number of the video episode within a season, as an integer.
 265     episode_id:     Id of the video episode, as a unicode string.
 266
 267     The following fields should only be used when the media is a track or a part of
 268     a music album:
 269
 270     track:          Title of the track.
 271     track_number:   Number of the track within an album or a disc, as an integer.
 272     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 273                     as a unicode string.
 274     artist:         Artist(s) of the track.
 275     genre:          Genre(s) of the track.
 276     album:          Title of the album the track belongs to.
 277     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 278     album_artist:   List of all artists appeared on the album (e.g.
 279                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 280                     and compilations).
 281     disc_number:    Number of the disc or other physical medium the track belongs to,
 282                     as an integer.
 283     release_year:   Year (YYYY) when the album was released.
 284
 285     Unless mentioned otherwise, the fields should be Unicode strings.
 286
 287     Unless mentioned otherwise, None is equivalent to absence of information.
 288
 289
 290     _type "playlist" indicates multiple videos.
 291     There must be a key "entries", which is a list, an iterable, or a PagedList
 292     object, each element of which is a valid dictionary by this specification.
 293
 294     Additionally, playlists can have "title", "description" and "id" attributes
 295     with the same semantics as videos (see above).
 296
 297
 298     _type "multi_video" indicates that there are multiple videos that
 299     form a single show, for examples multiple acts of an opera or TV episode.
 300     It must have an entries key like a playlist and contain all the keys
 301     required for a video at the same time.
 302
 303
 304     _type "url" indicates that the video must be extracted from another
 305     location, possibly by a different extractor. Its only required key is:
 306     "url" - the next URL to extract.
 307     The key "ie_key" can be set to the class name (minus the trailing "IE",
 308     e.g. "Youtube") if the extractor class is known in advance.
 309     Additionally, the dictionary may have any properties of the resolved entity
 310     known in advance, for example "title" if the title of the referred video is
 311     known ahead of time.
 312
 313
 314     _type "url_transparent" entities have the same specification as "url", but
 315     indicate that the given additional information is more precise than the one
 316     associated with the resolved URL.
 317     This is useful when a site employs a video service that hosts the video and
 318     its technical metadata, but that video service does not embed a useful
 319     title, description etc.
 320
 321
 322     Subclasses of this one should re-define the _real_initialize() and
 323     _real_extract() methods and define a _VALID_URL regexp.
 324     Probably, they should also be added to the list of extractors.
 325
 326     _BYPASS_GEO attribute may be set to False in order to disable
 327     geo restriction bypass mechanisms for a particular extractor.
 328     Though it won't disable explicit geo restriction bypass based on
 329     country code provided with bypass_geo_restriction_as_country.
 330
 331     Finally, the _WORKING attribute should be set to False for broken IEs
 332     in order to warn the users and skip the tests.
 333     """
 334
 335     _ready = False
 336     _downloader = None
 337     _x_forwarded_for_ip = None
 338     _BYPASS_GEO = True
 339     _WORKING = True
 340
 341     def __init__(self, downloader=None):
 342         """Constructor. Receives an optional downloader."""
 343         self._ready = False
 344         self._x_forwarded_for_ip = None
 345         self.set_downloader(downloader)
 346
 347     @classmethod
 348     def suitable(cls, url):
 349         """Receives a URL and returns True if suitable for this IE."""
 350
 351         # This does not use has/getattr intentionally - we want to know whether
 352         # we have cached the regexp for *this* class, whereas getattr would also
 353         # match the superclass
 354         if '_VALID_URL_RE' not in cls.__dict__:
 355             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 356         return cls._VALID_URL_RE.match(url) is not None
 357
 358     @classmethod
 359     def _match_id(cls, url):
 360         if '_VALID_URL_RE' not in cls.__dict__:
 361             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 362         m = cls._VALID_URL_RE.match(url)
 363         assert m
 364         return m.group('id')
 365
 366     @classmethod
 367     def working(cls):
 368         """Getter method for _WORKING."""
 369         return cls._WORKING
 370
 371     def initialize(self):
 372         """Initializes an instance (authentication, etc)."""
 373         if not self._x_forwarded_for_ip:
 374             country_code = self._downloader.params.get('bypass_geo_restriction_as_country', None)
 375             if country_code:
 376                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 377         if not self._ready:
 378             self._real_initialize()
 379             self._ready = True
 380
 381     def extract(self, url):
 382         """Extracts URL information and returns it in list of dicts."""
 383         try:
 384             for _ in range(2):
 385                 try:
 386                     self.initialize()
 387                     return self._real_extract(url)
 388                 except GeoRestrictedError as e:
 389                     if (not self._downloader.params.get('bypass_geo_restriction_as_country', None) and
 390                             self._BYPASS_GEO and
 391                             self._downloader.params.get('bypass_geo_restriction', True) and
 392                             not self._x_forwarded_for_ip and
 393                             e.countries):
 394                         self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(e.countries))
 395                         if self._x_forwarded_for_ip:
 396                             self.report_warning(
 397                                 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
 398                             continue
 399                     raise
 400         except ExtractorError:
 401             raise
 402         except compat_http_client.IncompleteRead as e:
 403             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 404         except (KeyError, StopIteration) as e:
 405             raise ExtractorError('An extractor error has occurred.', cause=e)
 406
 407     def set_downloader(self, downloader):
 408         """Sets the downloader for this IE."""
 409         self._downloader = downloader
 410
 411     def _real_initialize(self):
 412         """Real initialization process. Redefine in subclasses."""
 413         pass
 414
 415     def _real_extract(self, url):
 416         """Real extraction process. Redefine in subclasses."""
 417         pass
 418
 419     @classmethod
 420     def ie_key(cls):
 421         """A string for getting the InfoExtractor with get_info_extractor"""
 422         return compat_str(cls.__name__[:-2])
 423
 424     @property
 425     def IE_NAME(self):
 426         return compat_str(type(self).__name__[:-2])
 427
 428     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 429         """ Returns the response handle """
 430         if note is None:
 431             self.report_download_webpage(video_id)
 432         elif note is not False:
 433             if video_id is None:
 434                 self.to_screen('%s' % (note,))
 435             else:
 436                 self.to_screen('%s: %s' % (video_id, note))
 437         if isinstance(url_or_request, compat_urllib_request.Request):
 438             url_or_request = update_Request(
 439                 url_or_request, data=data, headers=headers, query=query)
 440         else:
 441             if query:
 442                 url_or_request = update_url_query(url_or_request, query)
 443             if data is not None or headers:
 444                 url_or_request = sanitized_Request(url_or_request, data, headers)
 445         try:
 446             return self._downloader.urlopen(url_or_request)
 447         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 448             if errnote is False:
 449                 return False
 450             if errnote is None:
 451                 errnote = 'Unable to download webpage'
 452
 453             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 454             if fatal:
 455                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 456             else:
 457                 self._downloader.report_warning(errmsg)
 458                 return False
 459
 460     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 461         """ Returns a tuple (page content as string, URL handle) """
 462         # Strip hashes from the URL (#1038)
 463         if isinstance(url_or_request, (compat_str, str)):
 464             url_or_request = url_or_request.partition('#')[0]
 465
 466         # Some sites check X-Forwarded-For HTTP header in order to figure out
 467         # the origin of the client behind proxy. This allows bypassing geo
 468         # restriction by faking this header's value to IP that belongs to some
 469         # geo unrestricted country. We will do so once we encounter any
 470         # geo restriction error.
 471         if self._x_forwarded_for_ip:
 472             if 'X-Forwarded-For' not in headers:
 473                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 474
 475         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 476         if urlh is False:
 477             assert not fatal
 478             return False
 479         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 480         return (content, urlh)
 481
 482     @staticmethod
 483     def _guess_encoding_from_content(content_type, webpage_bytes):
 484         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 485         if m:
 486             encoding = m.group(1)
 487         else:
 488             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 489                           webpage_bytes[:1024])
 490             if m:
 491                 encoding = m.group(1).decode('ascii')
 492             elif webpage_bytes.startswith(b'\xff\xfe'):
 493                 encoding = 'utf-16'
 494             else:
 495                 encoding = 'utf-8'
 496
 497         return encoding
 498
 499     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 500         content_type = urlh.headers.get('Content-Type', '')
 501         webpage_bytes = urlh.read()
 502         if prefix is not None:
 503             webpage_bytes = prefix + webpage_bytes
 504         if not encoding:
 505             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 506         if self._downloader.params.get('dump_intermediate_pages', False):
 507             try:
 508                 url = url_or_request.get_full_url()
 509             except AttributeError:
 510                 url = url_or_request
 511             self.to_screen('Dumping request to ' + url)
 512             dump = base64.b64encode(webpage_bytes).decode('ascii')
 513             self._downloader.to_screen(dump)
 514         if self._downloader.params.get('write_pages', False):
 515             try:
 516                 url = url_or_request.get_full_url()
 517             except AttributeError:
 518                 url = url_or_request
 519             basen = '%s_%s' % (video_id, url)
 520             if len(basen) > 240:
 521                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 522                 basen = basen[:240 - len(h)] + h
 523             raw_filename = basen + '.dump'
 524             filename = sanitize_filename(raw_filename, restricted=True)
 525             self.to_screen('Saving request to ' + filename)
 526             # Working around MAX_PATH limitation on Windows (see
 527             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 528             if compat_os_name == 'nt':
 529                 absfilepath = os.path.abspath(filename)
 530                 if len(absfilepath) > 259:
 531                     filename = '\\\\?\\' + absfilepath
 532             with open(filename, 'wb') as outf:
 533                 outf.write(webpage_bytes)
 534
 535         try:
 536             content = webpage_bytes.decode(encoding, 'replace')
 537         except LookupError:
 538             content = webpage_bytes.decode('utf-8', 'replace')
 539
 540         if ('<title>Access to this site is blocked</title>' in content and
 541                 'Websense' in content[:512]):
 542             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 543             blocked_iframe = self._html_search_regex(
 544                 r'<iframe src="([^"]+)"', content,
 545                 'Websense information URL', default=None)
 546             if blocked_iframe:
 547                 msg += ' Visit %s for more details' % blocked_iframe
 548             raise ExtractorError(msg, expected=True)
 549         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 550             msg = (
 551                 'Access to this webpage has been blocked by Indian censorship. '
 552                 'Use a VPN or proxy server (with --proxy) to route around it.')
 553             block_msg = self._html_search_regex(
 554                 r'</h1><p>(.*?)</p>',
 555                 content, 'block message', default=None)
 556             if block_msg:
 557                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 558             raise ExtractorError(msg, expected=True)
 559
 560         return content
 561
 562     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 563         """ Returns the data of the page as a string """
 564         success = False
 565         try_count = 0
 566         while success is False:
 567             try:
 568                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 569                 success = True
 570             except compat_http_client.IncompleteRead as e:
 571                 try_count += 1
 572                 if try_count >= tries:
 573                     raise e
 574                 self._sleep(timeout, video_id)
 575         if res is False:
 576             return res
 577         else:
 578             content, _ = res
 579             return content
 580
 581     def _download_xml(self, url_or_request, video_id,
 582                       note='Downloading XML', errnote='Unable to download XML',
 583                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 584         """Return the xml as an xml.etree.ElementTree.Element"""
 585         xml_string = self._download_webpage(
 586             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 587         if xml_string is False:
 588             return xml_string
 589         if transform_source:
 590             xml_string = transform_source(xml_string)
 591         return compat_etree_fromstring(xml_string.encode('utf-8'))
 592
 593     def _download_json(self, url_or_request, video_id,
 594                        note='Downloading JSON metadata',
 595                        errnote='Unable to download JSON metadata',
 596                        transform_source=None,
 597                        fatal=True, encoding=None, data=None, headers={}, query={}):
 598         json_string = self._download_webpage(
 599             url_or_request, video_id, note, errnote, fatal=fatal,
 600             encoding=encoding, data=data, headers=headers, query=query)
 601         if (not fatal) and json_string is False:
 602             return None
 603         return self._parse_json(
 604             json_string, video_id, transform_source=transform_source, fatal=fatal)
 605
 606     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 607         if transform_source:
 608             json_string = transform_source(json_string)
 609         try:
 610             return json.loads(json_string)
 611         except ValueError as ve:
 612             errmsg = '%s: Failed to parse JSON ' % video_id
 613             if fatal:
 614                 raise ExtractorError(errmsg, cause=ve)
 615             else:
 616                 self.report_warning(errmsg + str(ve))
 617
 618     def report_warning(self, msg, video_id=None):
 619         idstr = '' if video_id is None else '%s: ' % video_id
 620         self._downloader.report_warning(
 621             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 622
 623     def to_screen(self, msg):
 624         """Print msg to screen, prefixing it with '[ie_name]'"""
 625         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 626
 627     def report_extraction(self, id_or_name):
 628         """Report information extraction."""
 629         self.to_screen('%s: Extracting information' % id_or_name)
 630
 631     def report_download_webpage(self, video_id):
 632         """Report webpage download."""
 633         self.to_screen('%s: Downloading webpage' % video_id)
 634
 635     def report_age_confirmation(self):
 636         """Report attempt to confirm age."""
 637         self.to_screen('Confirming age')
 638
 639     def report_login(self):
 640         """Report attempt to log in."""
 641         self.to_screen('Logging in')
 642
 643     @staticmethod
 644     def raise_login_required(msg='This video is only available for registered users'):
 645         raise ExtractorError(
 646             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 647             expected=True)
 648
 649     @staticmethod
 650     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 651         raise GeoRestrictedError(msg, countries=countries)
 652
 653     # Methods for following #608
 654     @staticmethod
 655     def url_result(url, ie=None, video_id=None, video_title=None):
 656         """Returns a URL that points to a page that should be processed"""
 657         # TODO: ie should be the class used for getting the info
 658         video_info = {'_type': 'url',
 659                       'url': url,
 660                       'ie_key': ie}
 661         if video_id is not None:
 662             video_info['id'] = video_id
 663         if video_title is not None:
 664             video_info['title'] = video_title
 665         return video_info
 666
 667     @staticmethod
 668     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 669         """Returns a playlist"""
 670         video_info = {'_type': 'playlist',
 671                       'entries': entries}
 672         if playlist_id:
 673             video_info['id'] = playlist_id
 674         if playlist_title:
 675             video_info['title'] = playlist_title
 676         if playlist_description:
 677             video_info['description'] = playlist_description
 678         return video_info
 679
 680     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 681         """
 682         Perform a regex search on the given string, using a single or a list of
 683         patterns returning the first matching group.
 684         In case of failure return a default value or raise a WARNING or a
 685         RegexNotFoundError, depending on fatal, specifying the field name.
 686         """
 687         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 688             mobj = re.search(pattern, string, flags)
 689         else:
 690             for p in pattern:
 691                 mobj = re.search(p, string, flags)
 692                 if mobj:
 693                     break
 694
 695         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 696             _name = '\033[0;34m%s\033[0m' % name
 697         else:
 698             _name = name
 699
 700         if mobj:
 701             if group is None:
 702                 # return the first matching group
 703                 return next(g for g in mobj.groups() if g is not None)
 704             else:
 705                 return mobj.group(group)
 706         elif default is not NO_DEFAULT:
 707             return default
 708         elif fatal:
 709             raise RegexNotFoundError('Unable to extract %s' % _name)
 710         else:
 711             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 712             return None
 713
 714     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 715         """
 716         Like _search_regex, but strips HTML tags and unescapes entities.
 717         """
 718         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 719         if res:
 720             return clean_html(res).strip()
 721         else:
 722             return res
 723
 724     def _get_netrc_login_info(self, netrc_machine=None):
 725         username = None
 726         password = None
 727         netrc_machine = netrc_machine or self._NETRC_MACHINE
 728
 729         if self._downloader.params.get('usenetrc', False):
 730             try:
 731                 info = netrc.netrc().authenticators(netrc_machine)
 732                 if info is not None:
 733                     username = info[0]
 734                     password = info[2]
 735                 else:
 736                     raise netrc.NetrcParseError(
 737                         'No authenticators for %s' % netrc_machine)
 738             except (IOError, netrc.NetrcParseError) as err:
 739                 self._downloader.report_warning(
 740                     'parsing .netrc: %s' % error_to_compat_str(err))
 741
 742         return username, password
 743
 744     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 745         """
 746         Get the login info as (username, password)
 747         First look for the manually specified credentials using username_option
 748         and password_option as keys in params dictionary. If no such credentials
 749         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 750         value.
 751         If there's no info available, return (None, None)
 752         """
 753         if self._downloader is None:
 754             return (None, None)
 755
 756         downloader_params = self._downloader.params
 757
 758         # Attempt to use provided username and password or .netrc data
 759         if downloader_params.get(username_option) is not None:
 760             username = downloader_params[username_option]
 761             password = downloader_params[password_option]
 762         else:
 763             username, password = self._get_netrc_login_info(netrc_machine)
 764
 765         return username, password
 766
 767     def _get_tfa_info(self, note='two-factor verification code'):
 768         """
 769         Get the two-factor authentication info
 770         TODO - asking the user will be required for sms/phone verify
 771         currently just uses the command line option
 772         If there's no info available, return None
 773         """
 774         if self._downloader is None:
 775             return None
 776         downloader_params = self._downloader.params
 777
 778         if downloader_params.get('twofactor') is not None:
 779             return downloader_params['twofactor']
 780
 781         return compat_getpass('Type %s and press [Return]: ' % note)
 782
 783     # Helper functions for extracting OpenGraph info
 784     @staticmethod
 785     def _og_regexes(prop):
 786         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 787         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 788                        % {'prop': re.escape(prop)})
 789         template = r'<meta[^>]+?%s[^>]+?%s'
 790         return [
 791             template % (property_re, content_re),
 792             template % (content_re, property_re),
 793         ]
 794
 795     @staticmethod
 796     def _meta_regex(prop):
 797         return r'''(?isx)<meta
 798                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 799                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 800
 801     def _og_search_property(self, prop, html, name=None, **kargs):
 802         if not isinstance(prop, (list, tuple)):
 803             prop = [prop]
 804         if name is None:
 805             name = 'OpenGraph %s' % prop[0]
 806         og_regexes = []
 807         for p in prop:
 808             og_regexes.extend(self._og_regexes(p))
 809         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 810         if escaped is None:
 811             return None
 812         return unescapeHTML(escaped)
 813
 814     def _og_search_thumbnail(self, html, **kargs):
 815         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 816
 817     def _og_search_description(self, html, **kargs):
 818         return self._og_search_property('description', html, fatal=False, **kargs)
 819
 820     def _og_search_title(self, html, **kargs):
 821         return self._og_search_property('title', html, **kargs)
 822
 823     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 824         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 825         if secure:
 826             regexes = self._og_regexes('video:secure_url') + regexes
 827         return self._html_search_regex(regexes, html, name, **kargs)
 828
 829     def _og_search_url(self, html, **kargs):
 830         return self._og_search_property('url', html, **kargs)
 831
 832     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 833         if not isinstance(name, (list, tuple)):
 834             name = [name]
 835         if display_name is None:
 836             display_name = name[0]
 837         return self._html_search_regex(
 838             [self._meta_regex(n) for n in name],
 839             html, display_name, fatal=fatal, group='content', **kwargs)
 840
 841     def _dc_search_uploader(self, html):
 842         return self._html_search_meta('dc.creator', html, 'uploader')
 843
 844     def _rta_search(self, html):
 845         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 846         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 847                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 848                      html):
 849             return 18
 850         return 0
 851
 852     def _media_rating_search(self, html):
 853         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 854         rating = self._html_search_meta('rating', html)
 855
 856         if not rating:
 857             return None
 858
 859         RATING_TABLE = {
 860             'safe for kids': 0,
 861             'general': 8,
 862             '14 years': 14,
 863             'mature': 17,
 864             'restricted': 19,
 865         }
 866         return RATING_TABLE.get(rating.lower())
 867
 868     def _family_friendly_search(self, html):
 869         # See http://schema.org/VideoObject
 870         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 871
 872         if not family_friendly:
 873             return None
 874
 875         RATING_TABLE = {
 876             '1': 0,
 877             'true': 0,
 878             '0': 18,
 879             'false': 18,
 880         }
 881         return RATING_TABLE.get(family_friendly.lower())
 882
 883     def _twitter_search_player(self, html):
 884         return self._html_search_meta('twitter:player', html,
 885                                       'twitter card player')
 886
 887     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 888         json_ld = self._search_regex(
 889             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 890             html, 'JSON-LD', group='json_ld', **kwargs)
 891         default = kwargs.get('default', NO_DEFAULT)
 892         if not json_ld:
 893             return default if default is not NO_DEFAULT else {}
 894         # JSON-LD may be malformed and thus `fatal` should be respected.
 895         # At the same time `default` may be passed that assumes `fatal=False`
 896         # for _search_regex. Let's simulate the same behavior here as well.
 897         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 898         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 899
 900     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 901         if isinstance(json_ld, compat_str):
 902             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 903         if not json_ld:
 904             return {}
 905         info = {}
 906         if not isinstance(json_ld, (list, tuple, dict)):
 907             return info
 908         if isinstance(json_ld, dict):
 909             json_ld = [json_ld]
 910         for e in json_ld:
 911             if e.get('@context') == 'http://schema.org':
 912                 item_type = e.get('@type')
 913                 if expected_type is not None and expected_type != item_type:
 914                     return info
 915                 if item_type == 'TVEpisode':
 916                     info.update({
 917                         'episode': unescapeHTML(e.get('name')),
 918                         'episode_number': int_or_none(e.get('episodeNumber')),
 919                         'description': unescapeHTML(e.get('description')),
 920                     })
 921                     part_of_season = e.get('partOfSeason')
 922                     if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 923                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 924                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
 925                     if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 926                         info['series'] = unescapeHTML(part_of_series.get('name'))
 927                 elif item_type == 'Article':
 928                     info.update({
 929                         'timestamp': parse_iso8601(e.get('datePublished')),
 930                         'title': unescapeHTML(e.get('headline')),
 931                         'description': unescapeHTML(e.get('articleBody')),
 932                     })
 933                 elif item_type == 'VideoObject':
 934                     info.update({
 935                         'url': e.get('contentUrl'),
 936                         'title': unescapeHTML(e.get('name')),
 937                         'description': unescapeHTML(e.get('description')),
 938                         'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
 939                         'duration': parse_duration(e.get('duration')),
 940                         'timestamp': unified_timestamp(e.get('uploadDate')),
 941                         'filesize': float_or_none(e.get('contentSize')),
 942                         'tbr': int_or_none(e.get('bitrate')),
 943                         'width': int_or_none(e.get('width')),
 944                         'height': int_or_none(e.get('height')),
 945                     })
 946                 break
 947         return dict((k, v) for k, v in info.items() if v is not None)
 948
 949     @staticmethod
 950     def _hidden_inputs(html):
 951         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 952         hidden_inputs = {}
 953         for input in re.findall(r'(?i)(<input[^>]+>)', html):
 954             attrs = extract_attributes(input)
 955             if not input:
 956                 continue
 957             if attrs.get('type') not in ('hidden', 'submit'):
 958                 continue
 959             name = attrs.get('name') or attrs.get('id')
 960             value = attrs.get('value')
 961             if name and value is not None:
 962                 hidden_inputs[name] = value
 963         return hidden_inputs
 964
 965     def _form_hidden_inputs(self, form_id, html):
 966         form = self._search_regex(
 967             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 968             html, '%s form' % form_id, group='form')
 969         return self._hidden_inputs(form)
 970
 971     def _sort_formats(self, formats, field_preference=None):
 972         if not formats:
 973             raise ExtractorError('No video formats found')
 974
 975         for f in formats:
 976             # Automatically determine tbr when missing based on abr and vbr (improves
 977             # formats sorting in some cases)
 978             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 979                 f['tbr'] = f['abr'] + f['vbr']
 980
 981         def _formats_key(f):
 982             # TODO remove the following workaround
 983             from ..utils import determine_ext
 984             if not f.get('ext') and 'url' in f:
 985                 f['ext'] = determine_ext(f['url'])
 986
 987             if isinstance(field_preference, (list, tuple)):
 988                 return tuple(
 989                     f.get(field)
 990                     if f.get(field) is not None
 991                     else ('' if field == 'format_id' else -1)
 992                     for field in field_preference)
 993
 994             preference = f.get('preference')
 995             if preference is None:
 996                 preference = 0
 997                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 998                     preference -= 0.5
 999
1000             protocol = f.get('protocol') or determine_protocol(f)
1001             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1002
1003             if f.get('vcodec') == 'none':  # audio only
1004                 preference -= 50
1005                 if self._downloader.params.get('prefer_free_formats'):
1006                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1007                 else:
1008                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1009                 ext_preference = 0
1010                 try:
1011                     audio_ext_preference = ORDER.index(f['ext'])
1012                 except ValueError:
1013                     audio_ext_preference = -1
1014             else:
1015                 if f.get('acodec') == 'none':  # video only
1016                     preference -= 40
1017                 if self._downloader.params.get('prefer_free_formats'):
1018                     ORDER = ['flv', 'mp4', 'webm']
1019                 else:
1020                     ORDER = ['webm', 'flv', 'mp4']
1021                 try:
1022                     ext_preference = ORDER.index(f['ext'])
1023                 except ValueError:
1024                     ext_preference = -1
1025                 audio_ext_preference = 0
1026
1027             return (
1028                 preference,
1029                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1030                 f.get('quality') if f.get('quality') is not None else -1,
1031                 f.get('tbr') if f.get('tbr') is not None else -1,
1032                 f.get('filesize') if f.get('filesize') is not None else -1,
1033                 f.get('vbr') if f.get('vbr') is not None else -1,
1034                 f.get('height') if f.get('height') is not None else -1,
1035                 f.get('width') if f.get('width') is not None else -1,
1036                 proto_preference,
1037                 ext_preference,
1038                 f.get('abr') if f.get('abr') is not None else -1,
1039                 audio_ext_preference,
1040                 f.get('fps') if f.get('fps') is not None else -1,
1041                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1042                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1043                 f.get('format_id') if f.get('format_id') is not None else '',
1044             )
1045         formats.sort(key=_formats_key)
1046
1047     def _check_formats(self, formats, video_id):
1048         if formats:
1049             formats[:] = filter(
1050                 lambda f: self._is_valid_url(
1051                     f['url'], video_id,
1052                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1053                 formats)
1054
1055     @staticmethod
1056     def _remove_duplicate_formats(formats):
1057         format_urls = set()
1058         unique_formats = []
1059         for f in formats:
1060             if f['url'] not in format_urls:
1061                 format_urls.add(f['url'])
1062                 unique_formats.append(f)
1063         formats[:] = unique_formats
1064
1065     def _is_valid_url(self, url, video_id, item='video', headers={}):
1066         url = self._proto_relative_url(url, scheme='http:')
1067         # For now assume non HTTP(S) URLs always valid
1068         if not (url.startswith('http://') or url.startswith('https://')):
1069             return True
1070         try:
1071             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1072             return True
1073         except ExtractorError as e:
1074             if isinstance(e.cause, compat_urllib_error.URLError):
1075                 self.to_screen(
1076                     '%s: %s URL is invalid, skipping' % (video_id, item))
1077                 return False
1078             raise
1079
1080     def http_scheme(self):
1081         """ Either "http:" or "https:", depending on the user's preferences """
1082         return (
1083             'http:'
1084             if self._downloader.params.get('prefer_insecure', False)
1085             else 'https:')
1086
1087     def _proto_relative_url(self, url, scheme=None):
1088         if url is None:
1089             return url
1090         if url.startswith('//'):
1091             if scheme is None:
1092                 scheme = self.http_scheme()
1093             return scheme + url
1094         else:
1095             return url
1096
1097     def _sleep(self, timeout, video_id, msg_template=None):
1098         if msg_template is None:
1099             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1100         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1101         self.to_screen(msg)
1102         time.sleep(timeout)
1103
1104     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1105                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1106                              fatal=True, m3u8_id=None):
1107         manifest = self._download_xml(
1108             manifest_url, video_id, 'Downloading f4m manifest',
1109             'Unable to download f4m manifest',
1110             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1111             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1112             transform_source=transform_source,
1113             fatal=fatal)
1114
1115         if manifest is False:
1116             return []
1117
1118         return self._parse_f4m_formats(
1119             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1120             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1121
1122     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1123                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1124                            fatal=True, m3u8_id=None):
1125         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1126         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1127         if akamai_pv is not None and ';' in akamai_pv.text:
1128             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1129             if playerVerificationChallenge.strip() != '':
1130                 return []
1131
1132         formats = []
1133         manifest_version = '1.0'
1134         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1135         if not media_nodes:
1136             manifest_version = '2.0'
1137             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1138         # Remove unsupported DRM protected media from final formats
1139         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1140         media_nodes = remove_encrypted_media(media_nodes)
1141         if not media_nodes:
1142             return formats
1143         base_url = xpath_text(
1144             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1145             'base URL', default=None)
1146         if base_url:
1147             base_url = base_url.strip()
1148
1149         bootstrap_info = xpath_element(
1150             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1151             'bootstrap info', default=None)
1152
1153         vcodec = None
1154         mime_type = xpath_text(
1155             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1156             'base URL', default=None)
1157         if mime_type and mime_type.startswith('audio/'):
1158             vcodec = 'none'
1159
1160         for i, media_el in enumerate(media_nodes):
1161             tbr = int_or_none(media_el.attrib.get('bitrate'))
1162             width = int_or_none(media_el.attrib.get('width'))
1163             height = int_or_none(media_el.attrib.get('height'))
1164             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1165             # If <bootstrapInfo> is present, the specified f4m is a
1166             # stream-level manifest, and only set-level manifests may refer to
1167             # external resources.  See section 11.4 and section 4 of F4M spec
1168             if bootstrap_info is None:
1169                 media_url = None
1170                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1171                 if manifest_version == '2.0':
1172                     media_url = media_el.attrib.get('href')
1173                 if media_url is None:
1174                     media_url = media_el.attrib.get('url')
1175                 if not media_url:
1176                     continue
1177                 manifest_url = (
1178                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1179                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1180                 # If media_url is itself a f4m manifest do the recursive extraction
1181                 # since bitrates in parent manifest (this one) and media_url manifest
1182                 # may differ leading to inability to resolve the format by requested
1183                 # bitrate in f4m downloader
1184                 ext = determine_ext(manifest_url)
1185                 if ext == 'f4m':
1186                     f4m_formats = self._extract_f4m_formats(
1187                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1188                         transform_source=transform_source, fatal=fatal)
1189                     # Sometimes stream-level manifest contains single media entry that
1190                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1191                     # At the same time parent's media entry in set-level manifest may
1192                     # contain it. We will copy it from parent in such cases.
1193                     if len(f4m_formats) == 1:
1194                         f = f4m_formats[0]
1195                         f.update({
1196                             'tbr': f.get('tbr') or tbr,
1197                             'width': f.get('width') or width,
1198                             'height': f.get('height') or height,
1199                             'format_id': f.get('format_id') if not tbr else format_id,
1200                             'vcodec': vcodec,
1201                         })
1202                     formats.extend(f4m_formats)
1203                     continue
1204                 elif ext == 'm3u8':
1205                     formats.extend(self._extract_m3u8_formats(
1206                         manifest_url, video_id, 'mp4', preference=preference,
1207                         m3u8_id=m3u8_id, fatal=fatal))
1208                     continue
1209             formats.append({
1210                 'format_id': format_id,
1211                 'url': manifest_url,
1212                 'manifest_url': manifest_url,
1213                 'ext': 'flv' if bootstrap_info is not None else None,
1214                 'tbr': tbr,
1215                 'width': width,
1216                 'height': height,
1217                 'vcodec': vcodec,
1218                 'preference': preference,
1219             })
1220         return formats
1221
1222     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1223         return {
1224             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1225             'url': m3u8_url,
1226             'ext': ext,
1227             'protocol': 'm3u8',
1228             'preference': preference - 100 if preference else -100,
1229             'resolution': 'multiple',
1230             'format_note': 'Quality selection URL',
1231         }
1232
1233     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1234                               entry_protocol='m3u8', preference=None,
1235                               m3u8_id=None, note=None, errnote=None,
1236                               fatal=True, live=False):
1237
1238         res = self._download_webpage_handle(
1239             m3u8_url, video_id,
1240             note=note or 'Downloading m3u8 information',
1241             errnote=errnote or 'Failed to download m3u8 information',
1242             fatal=fatal)
1243         if res is False:
1244             return []
1245         m3u8_doc, urlh = res
1246         m3u8_url = urlh.geturl()
1247
1248         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1249             return []
1250
1251         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1252
1253         format_url = lambda u: (
1254             u
1255             if re.match(r'^https?://', u)
1256             else compat_urlparse.urljoin(m3u8_url, u))
1257
1258         # We should try extracting formats only from master playlists [1], i.e.
1259         # playlists that describe available qualities. On the other hand media
1260         # playlists [2] should be returned as is since they contain just the media
1261         # without qualities renditions.
1262         # Fortunately, master playlist can be easily distinguished from media
1263         # playlist based on particular tags availability. As of [1, 2] master
1264         # playlist tags MUST NOT appear in a media playist and vice versa.
1265         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1266         # and MUST NOT appear in master playlist thus we can clearly detect media
1267         # playlist with this criterion.
1268         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1269         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1270         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1271         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1272             return [{
1273                 'url': m3u8_url,
1274                 'format_id': m3u8_id,
1275                 'ext': ext,
1276                 'protocol': entry_protocol,
1277                 'preference': preference,
1278             }]
1279         audio_in_video_stream = {}
1280         last_info = {}
1281         last_media = {}
1282         for line in m3u8_doc.splitlines():
1283             if line.startswith('#EXT-X-STREAM-INF:'):
1284                 last_info = parse_m3u8_attributes(line)
1285             elif line.startswith('#EXT-X-MEDIA:'):
1286                 media = parse_m3u8_attributes(line)
1287                 media_type = media.get('TYPE')
1288                 if media_type in ('VIDEO', 'AUDIO'):
1289                     group_id = media.get('GROUP-ID')
1290                     media_url = media.get('URI')
1291                     if media_url:
1292                         format_id = []
1293                         for v in (group_id, media.get('NAME')):
1294                             if v:
1295                                 format_id.append(v)
1296                         f = {
1297                             'format_id': '-'.join(format_id),
1298                             'url': format_url(media_url),
1299                             'language': media.get('LANGUAGE'),
1300                             'ext': ext,
1301                             'protocol': entry_protocol,
1302                             'preference': preference,
1303                         }
1304                         if media_type == 'AUDIO':
1305                             f['vcodec'] = 'none'
1306                             if group_id and not audio_in_video_stream.get(group_id):
1307                                 audio_in_video_stream[group_id] = False
1308                         formats.append(f)
1309                     else:
1310                         # When there is no URI in EXT-X-MEDIA let this tag's
1311                         # data be used by regular URI lines below
1312                         last_media = media
1313                         if media_type == 'AUDIO' and group_id:
1314                             audio_in_video_stream[group_id] = True
1315             elif line.startswith('#') or not line.strip():
1316                 continue
1317             else:
1318                 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1319                 format_id = []
1320                 if m3u8_id:
1321                     format_id.append(m3u8_id)
1322                 # Despite specification does not mention NAME attribute for
1323                 # EXT-X-STREAM-INF it still sometimes may be present
1324                 stream_name = last_info.get('NAME') or last_media.get('NAME')
1325                 # Bandwidth of live streams may differ over time thus making
1326                 # format_id unpredictable. So it's better to keep provided
1327                 # format_id intact.
1328                 if not live:
1329                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1330                 manifest_url = format_url(line.strip())
1331                 f = {
1332                     'format_id': '-'.join(format_id),
1333                     'url': manifest_url,
1334                     'manifest_url': manifest_url,
1335                     'tbr': tbr,
1336                     'ext': ext,
1337                     'fps': float_or_none(last_info.get('FRAME-RATE')),
1338                     'protocol': entry_protocol,
1339                     'preference': preference,
1340                 }
1341                 resolution = last_info.get('RESOLUTION')
1342                 if resolution:
1343                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1344                     if mobj:
1345                         f['width'] = int(mobj.group('width'))
1346                         f['height'] = int(mobj.group('height'))
1347                 # Unified Streaming Platform
1348                 mobj = re.search(
1349                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1350                 if mobj:
1351                     abr, vbr = mobj.groups()
1352                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1353                     f.update({
1354                         'vbr': vbr,
1355                         'abr': abr,
1356                     })
1357                 f.update(parse_codecs(last_info.get('CODECS')))
1358                 if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
1359                     # TODO: update acodec for audio only formats with the same GROUP-ID
1360                     f['acodec'] = 'none'
1361                 formats.append(f)
1362                 last_info = {}
1363                 last_media = {}
1364         return formats
1365
1366     @staticmethod
1367     def _xpath_ns(path, namespace=None):
1368         if not namespace:
1369             return path
1370         out = []
1371         for c in path.split('/'):
1372             if not c or c == '.':
1373                 out.append(c)
1374             else:
1375                 out.append('{%s}%s' % (namespace, c))
1376         return '/'.join(out)
1377
1378     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1379         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1380
1381         if smil is False:
1382             assert not fatal
1383             return []
1384
1385         namespace = self._parse_smil_namespace(smil)
1386
1387         return self._parse_smil_formats(
1388             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1389
1390     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1391         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1392         if smil is False:
1393             return {}
1394         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1395
1396     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1397         return self._download_xml(
1398             smil_url, video_id, 'Downloading SMIL file',
1399             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1400
1401     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1402         namespace = self._parse_smil_namespace(smil)
1403
1404         formats = self._parse_smil_formats(
1405             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1406         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1407
1408         video_id = os.path.splitext(url_basename(smil_url))[0]
1409         title = None
1410         description = None
1411         upload_date = None
1412         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1413             name = meta.attrib.get('name')
1414             content = meta.attrib.get('content')
1415             if not name or not content:
1416                 continue
1417             if not title and name == 'title':
1418                 title = content
1419             elif not description and name in ('description', 'abstract'):
1420                 description = content
1421             elif not upload_date and name == 'date':
1422                 upload_date = unified_strdate(content)
1423
1424         thumbnails = [{
1425             'id': image.get('type'),
1426             'url': image.get('src'),
1427             'width': int_or_none(image.get('width')),
1428             'height': int_or_none(image.get('height')),
1429         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1430
1431         return {
1432             'id': video_id,
1433             'title': title or video_id,
1434             'description': description,
1435             'upload_date': upload_date,
1436             'thumbnails': thumbnails,
1437             'formats': formats,
1438             'subtitles': subtitles,
1439         }
1440
1441     def _parse_smil_namespace(self, smil):
1442         return self._search_regex(
1443             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1444
1445     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1446         base = smil_url
1447         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1448             b = meta.get('base') or meta.get('httpBase')
1449             if b:
1450                 base = b
1451                 break
1452
1453         formats = []
1454         rtmp_count = 0
1455         http_count = 0
1456         m3u8_count = 0
1457
1458         srcs = []
1459         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1460         for medium in media:
1461             src = medium.get('src')
1462             if not src or src in srcs:
1463                 continue
1464             srcs.append(src)
1465
1466             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1467             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1468             width = int_or_none(medium.get('width'))
1469             height = int_or_none(medium.get('height'))
1470             proto = medium.get('proto')
1471             ext = medium.get('ext')
1472             src_ext = determine_ext(src)
1473             streamer = medium.get('streamer') or base
1474
1475             if proto == 'rtmp' or streamer.startswith('rtmp'):
1476                 rtmp_count += 1
1477                 formats.append({
1478                     'url': streamer,
1479                     'play_path': src,
1480                     'ext': 'flv',
1481                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1482                     'tbr': bitrate,
1483                     'filesize': filesize,
1484                     'width': width,
1485                     'height': height,
1486                 })
1487                 if transform_rtmp_url:
1488                     streamer, src = transform_rtmp_url(streamer, src)
1489                     formats[-1].update({
1490                         'url': streamer,
1491                         'play_path': src,
1492                     })
1493                 continue
1494
1495             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1496             src_url = src_url.strip()
1497
1498             if proto == 'm3u8' or src_ext == 'm3u8':
1499                 m3u8_formats = self._extract_m3u8_formats(
1500                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1501                 if len(m3u8_formats) == 1:
1502                     m3u8_count += 1
1503                     m3u8_formats[0].update({
1504                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1505                         'tbr': bitrate,
1506                         'width': width,
1507                         'height': height,
1508                     })
1509                 formats.extend(m3u8_formats)
1510                 continue
1511
1512             if src_ext == 'f4m':
1513                 f4m_url = src_url
1514                 if not f4m_params:
1515                     f4m_params = {
1516                         'hdcore': '3.2.0',
1517                         'plugin': 'flowplayer-3.2.0.1',
1518                     }
1519                 f4m_url += '&' if '?' in f4m_url else '?'
1520                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1521                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1522                 continue
1523
1524             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1525                 http_count += 1
1526                 formats.append({
1527                     'url': src_url,
1528                     'ext': ext or src_ext or 'flv',
1529                     'format_id': 'http-%d' % (bitrate or http_count),
1530                     'tbr': bitrate,
1531                     'filesize': filesize,
1532                     'width': width,
1533                     'height': height,
1534                 })
1535                 continue
1536
1537         return formats
1538
1539     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1540         urls = []
1541         subtitles = {}
1542         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1543             src = textstream.get('src')
1544             if not src or src in urls:
1545                 continue
1546             urls.append(src)
1547             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1548             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1549             subtitles.setdefault(lang, []).append({
1550                 'url': src,
1551                 'ext': ext,
1552             })
1553         return subtitles
1554
1555     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1556         xspf = self._download_xml(
1557             playlist_url, playlist_id, 'Downloading xpsf playlist',
1558             'Unable to download xspf manifest', fatal=fatal)
1559         if xspf is False:
1560             return []
1561         return self._parse_xspf(xspf, playlist_id)
1562
1563     def _parse_xspf(self, playlist, playlist_id):
1564         NS_MAP = {
1565             'xspf': 'http://xspf.org/ns/0/',
1566             's1': 'http://static.streamone.nl/player/ns/0',
1567         }
1568
1569         entries = []
1570         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1571             title = xpath_text(
1572                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1573             description = xpath_text(
1574                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1575             thumbnail = xpath_text(
1576                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1577             duration = float_or_none(
1578                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1579
1580             formats = [{
1581                 'url': location.text,
1582                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1583                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1584                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1585             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1586             self._sort_formats(formats)
1587
1588             entries.append({
1589                 'id': playlist_id,
1590                 'title': title,
1591                 'description': description,
1592                 'thumbnail': thumbnail,
1593                 'duration': duration,
1594                 'formats': formats,
1595             })
1596         return entries
1597
1598     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1599         res = self._download_webpage_handle(
1600             mpd_url, video_id,
1601             note=note or 'Downloading MPD manifest',
1602             errnote=errnote or 'Failed to download MPD manifest',
1603             fatal=fatal)
1604         if res is False:
1605             return []
1606         mpd, urlh = res
1607         mpd_base_url = base_url(urlh.geturl())
1608
1609         return self._parse_mpd_formats(
1610             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1611             formats_dict=formats_dict, mpd_url=mpd_url)
1612
1613     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1614         """
1615         Parse formats from MPD manifest.
1616         References:
1617          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1618             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1619          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1620         """
1621         if mpd_doc.get('type') == 'dynamic':
1622             return []
1623
1624         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1625
1626         def _add_ns(path):
1627             return self._xpath_ns(path, namespace)
1628
1629         def is_drm_protected(element):
1630             return element.find(_add_ns('ContentProtection')) is not None
1631
1632         def extract_multisegment_info(element, ms_parent_info):
1633             ms_info = ms_parent_info.copy()
1634
1635             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1636             # common attributes and elements.  We will only extract relevant
1637             # for us.
1638             def extract_common(source):
1639                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1640                 if segment_timeline is not None:
1641                     s_e = segment_timeline.findall(_add_ns('S'))
1642                     if s_e:
1643                         ms_info['total_number'] = 0
1644                         ms_info['s'] = []
1645                         for s in s_e:
1646                             r = int(s.get('r', 0))
1647                             ms_info['total_number'] += 1 + r
1648                             ms_info['s'].append({
1649                                 't': int(s.get('t', 0)),
1650                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1651                                 'd': int(s.attrib['d']),
1652                                 'r': r,
1653                             })
1654                 start_number = source.get('startNumber')
1655                 if start_number:
1656                     ms_info['start_number'] = int(start_number)
1657                 timescale = source.get('timescale')
1658                 if timescale:
1659                     ms_info['timescale'] = int(timescale)
1660                 segment_duration = source.get('duration')
1661                 if segment_duration:
1662                     ms_info['segment_duration'] = int(segment_duration)
1663
1664             def extract_Initialization(source):
1665                 initialization = source.find(_add_ns('Initialization'))
1666                 if initialization is not None:
1667                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1668
1669             segment_list = element.find(_add_ns('SegmentList'))
1670             if segment_list is not None:
1671                 extract_common(segment_list)
1672                 extract_Initialization(segment_list)
1673                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1674                 if segment_urls_e:
1675                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1676             else:
1677                 segment_template = element.find(_add_ns('SegmentTemplate'))
1678                 if segment_template is not None:
1679                     extract_common(segment_template)
1680                     media = segment_template.get('media')
1681                     if media:
1682                         ms_info['media'] = media
1683                     initialization = segment_template.get('initialization')
1684                     if initialization:
1685                         ms_info['initialization'] = initialization
1686                     else:
1687                         extract_Initialization(segment_template)
1688             return ms_info
1689
1690         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1691         formats = []
1692         for period in mpd_doc.findall(_add_ns('Period')):
1693             period_duration = parse_duration(period.get('duration')) or mpd_duration
1694             period_ms_info = extract_multisegment_info(period, {
1695                 'start_number': 1,
1696                 'timescale': 1,
1697             })
1698             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1699                 if is_drm_protected(adaptation_set):
1700                     continue
1701                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1702                 for representation in adaptation_set.findall(_add_ns('Representation')):
1703                     if is_drm_protected(representation):
1704                         continue
1705                     representation_attrib = adaptation_set.attrib.copy()
1706                     representation_attrib.update(representation.attrib)
1707                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1708                     mime_type = representation_attrib['mimeType']
1709                     content_type = mime_type.split('/')[0]
1710                     if content_type == 'text':
1711                         # TODO implement WebVTT downloading
1712                         pass
1713                     elif content_type == 'video' or content_type == 'audio':
1714                         base_url = ''
1715                         for element in (representation, adaptation_set, period, mpd_doc):
1716                             base_url_e = element.find(_add_ns('BaseURL'))
1717                             if base_url_e is not None:
1718                                 base_url = base_url_e.text + base_url
1719                                 if re.match(r'^https?://', base_url):
1720                                     break
1721                         if mpd_base_url and not re.match(r'^https?://', base_url):
1722                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1723                                 mpd_base_url += '/'
1724                             base_url = mpd_base_url + base_url
1725                         representation_id = representation_attrib.get('id')
1726                         lang = representation_attrib.get('lang')
1727                         url_el = representation.find(_add_ns('BaseURL'))
1728                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1729                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1730                         f = {
1731                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1732                             'url': base_url,
1733                             'manifest_url': mpd_url,
1734                             'ext': mimetype2ext(mime_type),
1735                             'width': int_or_none(representation_attrib.get('width')),
1736                             'height': int_or_none(representation_attrib.get('height')),
1737                             'tbr': int_or_none(bandwidth, 1000),
1738                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1739                             'fps': int_or_none(representation_attrib.get('frameRate')),
1740                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1741                             'format_note': 'DASH %s' % content_type,
1742                             'filesize': filesize,
1743                         }
1744                         f.update(parse_codecs(representation_attrib.get('codecs')))
1745                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1746
1747                         def prepare_template(template_name, identifiers):
1748                             t = representation_ms_info[template_name]
1749                             t = t.replace('$RepresentationID$', representation_id)
1750                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1751                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1752                             t.replace('$$', '$')
1753                             return t
1754
1755                         # @initialization is a regular template like @media one
1756                         # so it should be handled just the same way (see
1757                         # https://github.com/rg3/youtube-dl/issues/11605)
1758                         if 'initialization' in representation_ms_info:
1759                             initialization_template = prepare_template(
1760                                 'initialization',
1761                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1762                                 # $Time$ shall not be included for @initialization thus
1763                                 # only $Bandwidth$ remains
1764                                 ('Bandwidth', ))
1765                             representation_ms_info['initialization_url'] = initialization_template % {
1766                                 'Bandwidth': bandwidth,
1767                             }
1768
1769                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1770
1771                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1772
1773                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1774                             # can't be used at the same time
1775                             if '%(Number' in media_template and 's' not in representation_ms_info:
1776                                 segment_duration = None
1777                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1778                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1779                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1780                                 representation_ms_info['fragments'] = [{
1781                                     'url': media_template % {
1782                                         'Number': segment_number,
1783                                         'Bandwidth': bandwidth,
1784                                     },
1785                                     'duration': segment_duration,
1786                                 } for segment_number in range(
1787                                     representation_ms_info['start_number'],
1788                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1789                             else:
1790                                 # $Number*$ or $Time$ in media template with S list available
1791                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1792                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1793                                 representation_ms_info['fragments'] = []
1794                                 segment_time = 0
1795                                 segment_d = None
1796                                 segment_number = representation_ms_info['start_number']
1797
1798                                 def add_segment_url():
1799                                     segment_url = media_template % {
1800                                         'Time': segment_time,
1801                                         'Bandwidth': bandwidth,
1802                                         'Number': segment_number,
1803                                     }
1804                                     representation_ms_info['fragments'].append({
1805                                         'url': segment_url,
1806                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1807                                     })
1808
1809                                 for num, s in enumerate(representation_ms_info['s']):
1810                                     segment_time = s.get('t') or segment_time
1811                                     segment_d = s['d']
1812                                     add_segment_url()
1813                                     segment_number += 1
1814                                     for r in range(s.get('r', 0)):
1815                                         segment_time += segment_d
1816                                         add_segment_url()
1817                                         segment_number += 1
1818                                     segment_time += segment_d
1819                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1820                             # No media template
1821                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1822                             # or any YouTube dashsegments video
1823                             fragments = []
1824                             segment_index = 0
1825                             timescale = representation_ms_info['timescale']
1826                             for s in representation_ms_info['s']:
1827                                 duration = float_or_none(s['d'], timescale)
1828                                 for r in range(s.get('r', 0) + 1):
1829                                     fragments.append({
1830                                         'url': representation_ms_info['segment_urls'][segment_index],
1831                                         'duration': duration,
1832                                     })
1833                                     segment_index += 1
1834                             representation_ms_info['fragments'] = fragments
1835                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1836                         # No fragments key is present in this case.
1837                         if 'fragments' in representation_ms_info:
1838                             f.update({
1839                                 'fragments': [],
1840                                 'protocol': 'http_dash_segments',
1841                             })
1842                             if 'initialization_url' in representation_ms_info:
1843                                 initialization_url = representation_ms_info['initialization_url']
1844                                 if not f.get('url'):
1845                                     f['url'] = initialization_url
1846                                 f['fragments'].append({'url': initialization_url})
1847                             f['fragments'].extend(representation_ms_info['fragments'])
1848                             for fragment in f['fragments']:
1849                                 fragment['url'] = urljoin(base_url, fragment['url'])
1850                         try:
1851                             existing_format = next(
1852                                 fo for fo in formats
1853                                 if fo['format_id'] == representation_id)
1854                         except StopIteration:
1855                             full_info = formats_dict.get(representation_id, {}).copy()
1856                             full_info.update(f)
1857                             formats.append(full_info)
1858                         else:
1859                             existing_format.update(f)
1860                     else:
1861                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1862         return formats
1863
1864     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1865         res = self._download_webpage_handle(
1866             ism_url, video_id,
1867             note=note or 'Downloading ISM manifest',
1868             errnote=errnote or 'Failed to download ISM manifest',
1869             fatal=fatal)
1870         if res is False:
1871             return []
1872         ism, urlh = res
1873
1874         return self._parse_ism_formats(
1875             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
1876
1877     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
1878         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1879             return []
1880
1881         duration = int(ism_doc.attrib['Duration'])
1882         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1883
1884         formats = []
1885         for stream in ism_doc.findall('StreamIndex'):
1886             stream_type = stream.get('Type')
1887             if stream_type not in ('video', 'audio'):
1888                 continue
1889             url_pattern = stream.attrib['Url']
1890             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1891             stream_name = stream.get('Name')
1892             for track in stream.findall('QualityLevel'):
1893                 fourcc = track.get('FourCC')
1894                 # TODO: add support for WVC1 and WMAP
1895                 if fourcc not in ('H264', 'AVC1', 'AACL'):
1896                     self.report_warning('%s is not a supported codec' % fourcc)
1897                     continue
1898                 tbr = int(track.attrib['Bitrate']) // 1000
1899                 width = int_or_none(track.get('MaxWidth'))
1900                 height = int_or_none(track.get('MaxHeight'))
1901                 sampling_rate = int_or_none(track.get('SamplingRate'))
1902
1903                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1904                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
1905
1906                 fragments = []
1907                 fragment_ctx = {
1908                     'time': 0,
1909                 }
1910                 stream_fragments = stream.findall('c')
1911                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1912                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1913                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1914                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
1915                     if not fragment_ctx['duration']:
1916                         try:
1917                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1918                         except IndexError:
1919                             next_fragment_time = duration
1920                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1921                     for _ in range(fragment_repeat):
1922                         fragments.append({
1923                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1924                             'duration': fragment_ctx['duration'] / stream_timescale,
1925                         })
1926                         fragment_ctx['time'] += fragment_ctx['duration']
1927
1928                 format_id = []
1929                 if ism_id:
1930                     format_id.append(ism_id)
1931                 if stream_name:
1932                     format_id.append(stream_name)
1933                 format_id.append(compat_str(tbr))
1934
1935                 formats.append({
1936                     'format_id': '-'.join(format_id),
1937                     'url': ism_url,
1938                     'manifest_url': ism_url,
1939                     'ext': 'ismv' if stream_type == 'video' else 'isma',
1940                     'width': width,
1941                     'height': height,
1942                     'tbr': tbr,
1943                     'asr': sampling_rate,
1944                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
1945                     'acodec': 'none' if stream_type == 'video' else fourcc,
1946                     'protocol': 'ism',
1947                     'fragments': fragments,
1948                     '_download_params': {
1949                         'duration': duration,
1950                         'timescale': stream_timescale,
1951                         'width': width or 0,
1952                         'height': height or 0,
1953                         'fourcc': fourcc,
1954                         'codec_private_data': track.get('CodecPrivateData'),
1955                         'sampling_rate': sampling_rate,
1956                         'channels': int_or_none(track.get('Channels', 2)),
1957                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
1958                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
1959                     },
1960                 })
1961         return formats
1962
1963     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
1964         def absolute_url(video_url):
1965             return compat_urlparse.urljoin(base_url, video_url)
1966
1967         def parse_content_type(content_type):
1968             if not content_type:
1969                 return {}
1970             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
1971             if ctr:
1972                 mimetype, codecs = ctr.groups()
1973                 f = parse_codecs(codecs)
1974                 f['ext'] = mimetype2ext(mimetype)
1975                 return f
1976             return {}
1977
1978         def _media_formats(src, cur_media_type):
1979             full_url = absolute_url(src)
1980             ext = determine_ext(full_url)
1981             if ext == 'm3u8':
1982                 is_plain_url = False
1983                 formats = self._extract_m3u8_formats(
1984                     full_url, video_id, ext='mp4',
1985                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
1986             elif ext == 'mpd':
1987                 is_plain_url = False
1988                 formats = self._extract_mpd_formats(
1989                     full_url, video_id, mpd_id=mpd_id)
1990             else:
1991                 is_plain_url = True
1992                 formats = [{
1993                     'url': full_url,
1994                     'vcodec': 'none' if cur_media_type == 'audio' else None,
1995                 }]
1996             return is_plain_url, formats
1997
1998         entries = []
1999         media_tags = [(media_tag, media_type, '')
2000                       for media_tag, media_type
2001                       in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
2002         media_tags.extend(re.findall(
2003             # We only allow video|audio followed by a whitespace or '>'.
2004             # Allowing more characters may end up in significant slow down (see
2005             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2006             # http://www.porntrex.com/maps/videositemap.xml).
2007             r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2008         for media_tag, media_type, media_content in media_tags:
2009             media_info = {
2010                 'formats': [],
2011                 'subtitles': {},
2012             }
2013             media_attributes = extract_attributes(media_tag)
2014             src = media_attributes.get('src')
2015             if src:
2016                 _, formats = _media_formats(src, media_type)
2017                 media_info['formats'].extend(formats)
2018             media_info['thumbnail'] = media_attributes.get('poster')
2019             if media_content:
2020                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2021                     source_attributes = extract_attributes(source_tag)
2022                     src = source_attributes.get('src')
2023                     if not src:
2024                         continue
2025                     is_plain_url, formats = _media_formats(src, media_type)
2026                     if is_plain_url:
2027                         f = parse_content_type(source_attributes.get('type'))
2028                         f.update(formats[0])
2029                         media_info['formats'].append(f)
2030                     else:
2031                         media_info['formats'].extend(formats)
2032                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2033                     track_attributes = extract_attributes(track_tag)
2034                     kind = track_attributes.get('kind')
2035                     if not kind or kind in ('subtitles', 'captions'):
2036                         src = track_attributes.get('src')
2037                         if not src:
2038                             continue
2039                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2040                         media_info['subtitles'].setdefault(lang, []).append({
2041                             'url': absolute_url(src),
2042                         })
2043             if media_info['formats'] or media_info['subtitles']:
2044                 entries.append(media_info)
2045         return entries
2046
2047     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2048         formats = []
2049         hdcore_sign = 'hdcore=3.7.0'
2050         f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2051         hds_host = hosts.get('hds')
2052         if hds_host:
2053             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2054         if 'hdcore=' not in f4m_url:
2055             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2056         f4m_formats = self._extract_f4m_formats(
2057             f4m_url, video_id, f4m_id='hds', fatal=False)
2058         for entry in f4m_formats:
2059             entry.update({'extra_param_to_segment_url': hdcore_sign})
2060         formats.extend(f4m_formats)
2061         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2062         hls_host = hosts.get('hls')
2063         if hls_host:
2064             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2065         formats.extend(self._extract_m3u8_formats(
2066             m3u8_url, video_id, 'mp4', 'm3u8_native',
2067             m3u8_id='hls', fatal=False))
2068         return formats
2069
2070     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2071         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2072         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
2073         http_base_url = 'http' + url_base
2074         formats = []
2075         if 'm3u8' not in skip_protocols:
2076             formats.extend(self._extract_m3u8_formats(
2077                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2078                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2079         if 'f4m' not in skip_protocols:
2080             formats.extend(self._extract_f4m_formats(
2081                 http_base_url + '/manifest.f4m',
2082                 video_id, f4m_id='hds', fatal=False))
2083         if 'dash' not in skip_protocols:
2084             formats.extend(self._extract_mpd_formats(
2085                 http_base_url + '/manifest.mpd',
2086                 video_id, mpd_id='dash', fatal=False))
2087         if re.search(r'(?:/smil:|\.smil)', url_base):
2088             if 'smil' not in skip_protocols:
2089                 rtmp_formats = self._extract_smil_formats(
2090                     http_base_url + '/jwplayer.smil',
2091                     video_id, fatal=False)
2092                 for rtmp_format in rtmp_formats:
2093                     rtsp_format = rtmp_format.copy()
2094                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2095                     del rtsp_format['play_path']
2096                     del rtsp_format['ext']
2097                     rtsp_format.update({
2098                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2099                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2100                         'protocol': 'rtsp',
2101                     })
2102                     formats.extend([rtmp_format, rtsp_format])
2103         else:
2104             for protocol in ('rtmp', 'rtsp'):
2105                 if protocol not in skip_protocols:
2106                     formats.append({
2107                         'url': protocol + url_base,
2108                         'format_id': protocol,
2109                         'protocol': protocol,
2110                     })
2111         return formats
2112
2113     @staticmethod
2114     def _find_jwplayer_data(webpage):
2115         mobj = re.search(
2116             r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
2117             webpage)
2118         if mobj:
2119             return mobj.group('options')
2120
2121     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2122         jwplayer_data = self._parse_json(
2123             self._find_jwplayer_data(webpage), video_id,
2124             transform_source=js_to_json)
2125         return self._parse_jwplayer_data(
2126             jwplayer_data, video_id, *args, **kwargs)
2127
2128     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2129                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2130         # JWPlayer backward compatibility: flattened playlists
2131         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2132         if 'playlist' not in jwplayer_data:
2133             jwplayer_data = {'playlist': [jwplayer_data]}
2134
2135         entries = []
2136
2137         # JWPlayer backward compatibility: single playlist item
2138         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2139         if not isinstance(jwplayer_data['playlist'], list):
2140             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2141
2142         for video_data in jwplayer_data['playlist']:
2143             # JWPlayer backward compatibility: flattened sources
2144             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2145             if 'sources' not in video_data:
2146                 video_data['sources'] = [video_data]
2147
2148             this_video_id = video_id or video_data['mediaid']
2149
2150             formats = []
2151             for source in video_data['sources']:
2152                 source_url = self._proto_relative_url(source['file'])
2153                 if base_url:
2154                     source_url = compat_urlparse.urljoin(base_url, source_url)
2155                 source_type = source.get('type') or ''
2156                 ext = mimetype2ext(source_type) or determine_ext(source_url)
2157                 if source_type == 'hls' or ext == 'm3u8':
2158                     formats.extend(self._extract_m3u8_formats(
2159                         source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
2160                 elif ext == 'mpd':
2161                     formats.extend(self._extract_mpd_formats(
2162                         source_url, this_video_id, mpd_id=mpd_id, fatal=False))
2163                 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2164                 elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2165                     formats.append({
2166                         'url': source_url,
2167                         'vcodec': 'none',
2168                         'ext': ext,
2169                     })
2170                 else:
2171                     height = int_or_none(source.get('height'))
2172                     if height is None:
2173                         # Often no height is provided but there is a label in
2174                         # format like 1080p.
2175                         height = int_or_none(self._search_regex(
2176                             r'^(\d{3,})[pP]$', source.get('label') or '',
2177                             'height', default=None))
2178                     a_format = {
2179                         'url': source_url,
2180                         'width': int_or_none(source.get('width')),
2181                         'height': height,
2182                         'ext': ext,
2183                     }
2184                     if source_url.startswith('rtmp'):
2185                         a_format['ext'] = 'flv'
2186
2187                         # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2188                         # of jwplayer.flash.swf
2189                         rtmp_url_parts = re.split(
2190                             r'((?:mp4|mp3|flv):)', source_url, 1)
2191                         if len(rtmp_url_parts) == 3:
2192                             rtmp_url, prefix, play_path = rtmp_url_parts
2193                             a_format.update({
2194                                 'url': rtmp_url,
2195                                 'play_path': prefix + play_path,
2196                             })
2197                         if rtmp_params:
2198                             a_format.update(rtmp_params)
2199                     formats.append(a_format)
2200             self._sort_formats(formats)
2201
2202             subtitles = {}
2203             tracks = video_data.get('tracks')
2204             if tracks and isinstance(tracks, list):
2205                 for track in tracks:
2206                     if track.get('kind') != 'captions':
2207                         continue
2208                     track_url = urljoin(base_url, track.get('file'))
2209                     if not track_url:
2210                         continue
2211                     subtitles.setdefault(track.get('label') or 'en', []).append({
2212                         'url': self._proto_relative_url(track_url)
2213                     })
2214
2215             entries.append({
2216                 'id': this_video_id,
2217                 'title': video_data['title'] if require_title else video_data.get('title'),
2218                 'description': video_data.get('description'),
2219                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2220                 'timestamp': int_or_none(video_data.get('pubdate')),
2221                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2222                 'subtitles': subtitles,
2223                 'formats': formats,
2224             })
2225         if len(entries) == 1:
2226             return entries[0]
2227         else:
2228             return self.playlist_result(entries)
2229
2230     def _live_title(self, name):
2231         """ Generate the title for a live video """
2232         now = datetime.datetime.now()
2233         now_str = now.strftime('%Y-%m-%d %H:%M')
2234         return name + ' ' + now_str
2235
2236     def _int(self, v, name, fatal=False, **kwargs):
2237         res = int_or_none(v, **kwargs)
2238         if 'get_attr' in kwargs:
2239             print(getattr(v, kwargs['get_attr']))
2240         if res is None:
2241             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2242             if fatal:
2243                 raise ExtractorError(msg)
2244             else:
2245                 self._downloader.report_warning(msg)
2246         return res
2247
2248     def _float(self, v, name, fatal=False, **kwargs):
2249         res = float_or_none(v, **kwargs)
2250         if res is None:
2251             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2252             if fatal:
2253                 raise ExtractorError(msg)
2254             else:
2255                 self._downloader.report_warning(msg)
2256         return res
2257
2258     def _set_cookie(self, domain, name, value, expire_time=None):
2259         cookie = compat_cookiejar.Cookie(
2260             0, name, value, None, None, domain, None,
2261             None, '/', True, False, expire_time, '', None, None, None)
2262         self._downloader.cookiejar.set_cookie(cookie)
2263
2264     def _get_cookies(self, url):
2265         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2266         req = sanitized_Request(url)
2267         self._downloader.cookiejar.add_cookie_header(req)
2268         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2269
2270     def get_testcases(self, include_onlymatching=False):
2271         t = getattr(self, '_TEST', None)
2272         if t:
2273             assert not hasattr(self, '_TESTS'), \
2274                 '%s has _TEST and _TESTS' % type(self).__name__
2275             tests = [t]
2276         else:
2277             tests = getattr(self, '_TESTS', [])
2278         for t in tests:
2279             if not include_onlymatching and t.get('only_matching', False):
2280                 continue
2281             t['name'] = type(self).__name__[:-len('IE')]
2282             yield t
2283
2284     def is_suitable(self, age_limit):
2285         """ Test whether the extractor is generally suitable for the given
2286         age limit (i.e. pornographic sites are not, all others usually are) """
2287
2288         any_restricted = False
2289         for tc in self.get_testcases(include_onlymatching=False):
2290             if tc.get('playlist', []):
2291                 tc = tc['playlist'][0]
2292             is_restricted = age_restricted(
2293                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2294             if not is_restricted:
2295                 return True
2296             any_restricted = any_restricted or is_restricted
2297         return not any_restricted
2298
2299     def extract_subtitles(self, *args, **kwargs):
2300         if (self._downloader.params.get('writesubtitles', False) or
2301                 self._downloader.params.get('listsubtitles')):
2302             return self._get_subtitles(*args, **kwargs)
2303         return {}
2304
2305     def _get_subtitles(self, *args, **kwargs):
2306         raise NotImplementedError('This method must be implemented by subclasses')
2307
2308     @staticmethod
2309     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2310         """ Merge subtitle items for one language. Items with duplicated URLs
2311         will be dropped. """
2312         list1_urls = set([item['url'] for item in subtitle_list1])
2313         ret = list(subtitle_list1)
2314         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2315         return ret
2316
2317     @classmethod
2318     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2319         """ Merge two subtitle dictionaries, language by language. """
2320         ret = dict(subtitle_dict1)
2321         for lang in subtitle_dict2:
2322             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2323         return ret
2324
2325     def extract_automatic_captions(self, *args, **kwargs):
2326         if (self._downloader.params.get('writeautomaticsub', False) or
2327                 self._downloader.params.get('listsubtitles')):
2328             return self._get_automatic_captions(*args, **kwargs)
2329         return {}
2330
2331     def _get_automatic_captions(self, *args, **kwargs):
2332         raise NotImplementedError('This method must be implemented by subclasses')
2333
2334     def mark_watched(self, *args, **kwargs):
2335         if (self._downloader.params.get('mark_watched', False) and
2336                 (self._get_login_info()[0] is not None or
2337                     self._downloader.params.get('cookiefile') is not None)):
2338             self._mark_watched(*args, **kwargs)
2339
2340     def _mark_watched(self, *args, **kwargs):
2341         raise NotImplementedError('This method must be implemented by subclasses')
2342
2343     def geo_verification_headers(self):
2344         headers = {}
2345         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2346         if geo_verification_proxy:
2347             headers['Ytdl-request-proxy'] = geo_verification_proxy
2348         return headers
2349
2350     def _generic_id(self, url):
2351         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2352
2353     def _generic_title(self, url):
2354         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2355
2356
2357 class SearchInfoExtractor(InfoExtractor):
2358     """
2359     Base class for paged search queries extractors.
2360     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
2361     Instances should define _SEARCH_KEY and _MAX_RESULTS.
2362     """
2363
2364     @classmethod
2365     def _make_valid_url(cls):
2366         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2367
2368     @classmethod
2369     def suitable(cls, url):
2370         return re.match(cls._make_valid_url(), url) is not None
2371
2372     def _real_extract(self, query):
2373         mobj = re.match(self._make_valid_url(), query)
2374         if mobj is None:
2375             raise ExtractorError('Invalid search query "%s"' % query)
2376
2377         prefix = mobj.group('prefix')
2378         query = mobj.group('query')
2379         if prefix == '':
2380             return self._get_n_results(query, 1)
2381         elif prefix == 'all':
2382             return self._get_n_results(query, self._MAX_RESULTS)
2383         else:
2384             n = int(prefix)
2385             if n <= 0:
2386                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
2387             elif n > self._MAX_RESULTS:
2388                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
2389                 n = self._MAX_RESULTS
2390             return self._get_n_results(query, n)
2391
2392     def _get_n_results(self, query, n):
2393         """Get a specified number of results for a query"""
2394         raise NotImplementedError('This method must be implemented by subclasses')
2395
2396     @property
2397     def SEARCH_KEY(self):
2398         return self._SEARCH_KEY