1 from __future__ import unicode_literals
16 from ..compat import (
19 compat_etree_fromstring,
25 compat_urllib_parse_unquote,
26 compat_urllib_parse_urlencode,
27 compat_urllib_request,
30 from ..downloader.f4m import remove_encrypted_media
63 parse_m3u8_attributes,
70 class InfoExtractor(object):
71 """Information Extractor class.
73 Information extractors are the classes that, given a URL, extract
74 information about the video (or videos) the URL refers to. This
75 information includes the real video URL, the video title, author and
76 others. The information is stored in a dictionary which is then
77 passed to the YoutubeDL. The YoutubeDL processes this
78 information possibly downloading the video to the file system, among
79 other possible outcomes.
81 The type field determines the type of the result.
82 By far the most common value (and the default if _type is missing) is
83 "video", which indicates a single video.
85 For a video, the dictionaries must include the following fields:
88 title: Video title, unescaped.
90 Additionally, it must contain either a formats entry or a url one:
92 formats: A list of dictionaries for each format available, ordered
93 from worst to best quality.
96 * url Mandatory. The URL of the video file
98 The URL of the manifest file in case of
99 fragmented media (DASH, HLS, HDS)
100 * ext Will be calculated from URL if missing
101 * format A human-readable description of the format
102 ("mp4 container with h264/opus").
103 Calculated from the format_id, width, height.
104 and format_note fields if missing.
105 * format_id A short description of the format
106 ("mp4_h264_opus" or "19").
107 Technically optional, but strongly recommended.
108 * format_note Additional info about the format
109 ("3D" or "DASH video")
110 * width Width of the video, if known
111 * height Height of the video, if known
112 * resolution Textual description of width and height
113 * tbr Average bitrate of audio and video in KBit/s
114 * abr Average audio bitrate in KBit/s
115 * acodec Name of the audio codec in use
116 * asr Audio sampling rate in Hertz
117 * vbr Average video bitrate in KBit/s
119 * vcodec Name of the video codec in use
120 * container Name of the container format
121 * filesize The number of bytes, if known in advance
122 * filesize_approx An estimate for the number of bytes
123 * player_url SWF Player URL (used for rtmpdump).
124 * protocol The protocol that will be used for the actual
125 download, lower-case.
126 "http", "https", "rtsp", "rtmp", "rtmpe",
127 "m3u8", "m3u8_native" or "http_dash_segments".
129 Base URL for fragments. Each fragment's path
130 value (if present) will be relative to
132 * fragments A list of fragments of a fragmented media.
133 Each fragment entry must contain either an url
134 or a path. If an url is present it should be
135 considered by a client. Otherwise both path and
136 fragment_base_url must be present. Here is
137 the list of all potential fields:
138 * "url" - fragment's URL
139 * "path" - fragment's path relative to
141 * "duration" (optional, int or float)
142 * "filesize" (optional, int)
143 * preference Order number of this format. If this field is
144 present and not None, the formats get sorted
145 by this field, regardless of all other values.
146 -1 for default (order by other properties),
147 -2 or smaller for less than default.
148 < -1000 to hide the format (if there is
149 another one which is strictly better)
150 * language Language code, e.g. "de" or "en-US".
151 * language_preference Is this in the language mentioned in
153 10 if it's what the URL is about,
154 -1 for default (don't know),
155 -10 otherwise, other values reserved for now.
156 * quality Order number of the video quality of this
157 format, irrespective of the file format.
158 -1 for default (order by other properties),
159 -2 or smaller for less than default.
160 * source_preference Order number for this video source
161 (quality takes higher priority)
162 -1 for default (order by other properties),
163 -2 or smaller for less than default.
164 * http_headers A dictionary of additional HTTP headers
165 to add to the request.
166 * stretched_ratio If given and not 1, indicates that the
167 video's pixels are not square.
168 width : height ratio as float.
169 * no_resume The server does not support resuming the
170 (HTTP or RTMP) download. Boolean.
172 url: Final video URL.
173 ext: Video filename extension.
174 format: The video format, defaults to ext (used for --get-format)
175 player_url: SWF Player URL (used for rtmpdump).
177 The following fields are optional:
179 alt_title: A secondary title of the video.
180 display_id An alternative identifier for the video, not necessarily
181 unique, but available before title. Typically, id is
182 something like "4234987", title "Dancing naked mole rats",
183 and display_id "dancing-naked-mole-rats"
184 thumbnails: A list of dictionaries, with the following entries:
185 * "id" (optional, string) - Thumbnail format ID
187 * "preference" (optional, int) - quality of the image
188 * "width" (optional, int)
189 * "height" (optional, int)
190 * "resolution" (optional, string "{width}x{height}",
192 * "filesize" (optional, int)
193 thumbnail: Full URL to a video thumbnail image.
194 description: Full video description.
195 uploader: Full name of the video uploader.
196 license: License name the video is licensed under.
197 creator: The creator of the video.
198 release_date: The date (YYYYMMDD) when the video was released.
199 timestamp: UNIX timestamp of the moment the video became available.
200 upload_date: Video upload date (YYYYMMDD).
201 If not explicitly set, calculated from timestamp.
202 uploader_id: Nickname or id of the video uploader.
203 uploader_url: Full URL to a personal webpage of the video uploader.
204 location: Physical location where the video was filmed.
205 subtitles: The available subtitles as a dictionary in the format
206 {tag: subformats}. "tag" is usually a language code, and
207 "subformats" is a list sorted from lower to higher
208 preference, each element is a dictionary with the "ext"
210 * "data": The subtitles file contents
211 * "url": A URL pointing to the subtitles file
212 "ext" will be calculated from URL if missing
213 automatic_captions: Like 'subtitles', used by the YoutubeIE for
214 automatically generated captions
215 duration: Length of the video in seconds, as an integer or float.
216 view_count: How many users have watched the video on the platform.
217 like_count: Number of positive ratings of the video
218 dislike_count: Number of negative ratings of the video
219 repost_count: Number of reposts of the video
220 average_rating: Average rating given by users, the scale used depends on the webpage
221 comment_count: Number of comments on the video
222 comments: A list of comments, each with one or more of the following
223 properties (all but one of text or html optional):
224 * "author" - human-readable name of the comment author
225 * "author_id" - user ID of the comment author
227 * "html" - Comment as HTML
228 * "text" - Plain text of the comment
229 * "timestamp" - UNIX timestamp of comment
230 * "parent" - ID of the comment this one is replying to.
231 Set to "root" to indicate that this is a
232 comment to the original video.
233 age_limit: Age restriction for the video, as an integer (years)
234 webpage_url: The URL to the video webpage, if given to youtube-dl it
235 should allow to get the same result again. (It will be set
236 by YoutubeDL if it's missing)
237 categories: A list of categories that the video falls in, for example
239 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
240 is_live: True, False, or None (=unknown). Whether this video is a
241 live stream that goes on instead of a fixed-length video.
242 start_time: Time in seconds where the reproduction should start, as
243 specified in the URL.
244 end_time: Time in seconds where the reproduction should end, as
245 specified in the URL.
247 The following fields should only be used when the video belongs to some logical
250 chapter: Name or title of the chapter the video belongs to.
251 chapter_number: Number of the chapter the video belongs to, as an integer.
252 chapter_id: Id of the chapter the video belongs to, as a unicode string.
254 The following fields should only be used when the video is an episode of some
255 series, programme or podcast:
257 series: Title of the series or programme the video episode belongs to.
258 season: Title of the season the video episode belongs to.
259 season_number: Number of the season the video episode belongs to, as an integer.
260 season_id: Id of the season the video episode belongs to, as a unicode string.
261 episode: Title of the video episode. Unlike mandatory video title field,
262 this field should denote the exact title of the video episode
263 without any kind of decoration.
264 episode_number: Number of the video episode within a season, as an integer.
265 episode_id: Id of the video episode, as a unicode string.
267 The following fields should only be used when the media is a track or a part of
270 track: Title of the track.
271 track_number: Number of the track within an album or a disc, as an integer.
272 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
274 artist: Artist(s) of the track.
275 genre: Genre(s) of the track.
276 album: Title of the album the track belongs to.
277 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
278 album_artist: List of all artists appeared on the album (e.g.
279 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
281 disc_number: Number of the disc or other physical medium the track belongs to,
283 release_year: Year (YYYY) when the album was released.
285 Unless mentioned otherwise, the fields should be Unicode strings.
287 Unless mentioned otherwise, None is equivalent to absence of information.
290 _type "playlist" indicates multiple videos.
291 There must be a key "entries", which is a list, an iterable, or a PagedList
292 object, each element of which is a valid dictionary by this specification.
294 Additionally, playlists can have "title", "description" and "id" attributes
295 with the same semantics as videos (see above).
298 _type "multi_video" indicates that there are multiple videos that
299 form a single show, for example multiple acts of an opera or TV episode.
300 It must have an entries key like a playlist and contain all the keys
301 required for a video at the same time.
304 _type "url" indicates that the video must be extracted from another
305 location, possibly by a different extractor. Its only required key is:
306 "url" - the next URL to extract.
307 The key "ie_key" can be set to the class name (minus the trailing "IE",
308 e.g. "Youtube") if the extractor class is known in advance.
309 Additionally, the dictionary may have any properties of the resolved entity
310 known in advance, for example "title" if the title of the referred video is
314 _type "url_transparent" entities have the same specification as "url", but
315 indicate that the given additional information is more precise than the one
316 associated with the resolved URL.
317 This is useful when a site employs a video service that hosts the video and
318 its technical metadata, but that video service does not embed a useful
319 title, description etc.
322 Subclasses of this one should re-define the _real_initialize() and
323 _real_extract() methods and define a _VALID_URL regexp.
324 Probably, they should also be added to the list of extractors.
326 _GEO_BYPASS attribute may be set to False in order to disable
327 geo restriction bypass mechanisms for a particular extractor.
328 Though it won't disable explicit geo restriction bypass based on
329 country code provided with geo_bypass_country. (experimental)
331 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
332 countries for this extractor. One of these countries will be used by
333 geo restriction bypass mechanism right away in order to bypass
334 geo restriction, of course, if the mechanism is not disabled. (experimental)
336 Finally, the _WORKING attribute should be set to False for broken IEs
337 in order to warn the users and skip the tests.
# Fake source IP advertised via the X-Forwarded-For header for geo
# restriction bypass; stays None until geo bypass kicks in.
_x_forwarded_for_ip = None
# Presumably geo-unrestricted countries for this extractor (see the class
# docstring); one is picked at random to fake the X-Forwarded-For IP.
_GEO_COUNTRIES = None
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader (a YoutubeDL
    instance used for options, output and HTTP requests)."""
    # Reset the per-instance fake IP; the class-level default may have
    # been set by a previous geo-bypass attempt.
    self._x_forwarded_for_ip = None
    self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Deliberately probe cls.__dict__ rather than hasattr/getattr: the
    # cached compiled regexp must belong to *this* class, whereas
    # attribute lookup would also find one cached on a superclass with
    # a different _VALID_URL.
    cached_re = cls.__dict__.get('_VALID_URL_RE')
    if cached_re is None:
        cached_re = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cached_re.match(url) is not None
def _match_id(cls, url):
    # Compile and cache the pattern on *this* class; cls.__dict__ is
    # checked instead of hasattr so a superclass's cached regexp (with
    # a different _VALID_URL) is never reused.
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    m = cls._VALID_URL_RE.match(url)
    # NOTE(review): in this view the match object is computed but never
    # returned — presumably the 'id' group should be extracted and
    # returned here; confirm against the full source.
374 """Getter method for _WORKING."""
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # Set up geo-bypass (possibly faking an X-Forwarded-For IP) before
    # the extractor-specific initialization runs.
    self.__initialize_geo_bypass()
    self._real_initialize()
def __initialize_geo_bypass(self):
    # Pick (or accept an explicit) country and fake an X-Forwarded-For
    # IP from it, unless one is already set.
    if not self._x_forwarded_for_ip:
        country_code = self._downloader.params.get('geo_bypass_country', None)
        # If there is no explicit country for geo bypass specified and
        # the extractor is known to be geo restricted let's fake IP
        # as X-Forwarded-For right away.
        if (not country_code and
                self._downloader.params.get('geo_bypass', True) and
                self._GEO_COUNTRIES):
            country_code = random.choice(self._GEO_COUNTRIES)
        # NOTE(review): as shown here random_ipv4 is reached even when
        # country_code is still None — presumably a guard is missing in
        # this view; confirm against the full source.
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
        if self._downloader.params.get('verbose', False):
            self._downloader.to_stdout(
                '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
401 def extract(self, url):
402 """Extracts URL information and returns it in list of dicts."""
407 ie_result = self._real_extract(url)
408 if self._x_forwarded_for_ip:
409 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
411 except GeoRestrictedError as e:
412 if self.__maybe_fake_ip_and_retry(e.countries):
415 except ExtractorError:
417 except compat_http_client.IncompleteRead as e:
418 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
419 except (KeyError, StopIteration) as e:
420 raise ExtractorError('An extractor error has occurred.', cause=e)
422 def __maybe_fake_ip_and_retry(self, countries):
423 if (not self._downloader.params.get('geo_bypass_country', None) and
425 self._downloader.params.get('geo_bypass', True) and
426 not self._x_forwarded_for_ip and
428 self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries))
429 if self._x_forwarded_for_ip:
431 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
def set_downloader(self, downloader):
    """Sets the downloader for this IE.

    downloader may be None; other methods read it back via
    self._downloader.
    """
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses.

    Default implementation does nothing; subclasses override this to
    perform authentication or other one-time setup.
    """
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses.

    Must return an info dict (or playlist/url result) as described in
    the class docstring.
    """
449 """A string for getting the InfoExtractor with get_info_extractor"""
450 return compat_str(cls.__name__[:-2])
454 return compat_str(type(self).__name__[:-2])
456 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
457 """ Returns the response handle """
459 self.report_download_webpage(video_id)
460 elif note is not False:
462 self.to_screen('%s' % (note,))
464 self.to_screen('%s: %s' % (video_id, note))
465 if isinstance(url_or_request, compat_urllib_request.Request):
466 url_or_request = update_Request(
467 url_or_request, data=data, headers=headers, query=query)
470 url_or_request = update_url_query(url_or_request, query)
471 if data is not None or headers:
472 url_or_request = sanitized_Request(url_or_request, data, headers)
474 return self._downloader.urlopen(url_or_request)
475 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
479 errnote = 'Unable to download webpage'
481 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
483 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
485 self._downloader.report_warning(errmsg)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
    """ Returns a tuple (page content as string, URL handle) """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    # Work on a copy: the original code mutated `headers` in place when
    # injecting X-Forwarded-For below, which silently poisoned the shared
    # mutable default argument (and the caller's dict) across calls.
    headers = dict(headers)

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        if 'X-Forwarded-For' not in headers:
            headers['X-Forwarded-For'] = self._x_forwarded_for_ip

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (content, urlh)
511 def _guess_encoding_from_content(content_type, webpage_bytes):
512 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
514 encoding = m.group(1)
516 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
517 webpage_bytes[:1024])
519 encoding = m.group(1).decode('ascii')
520 elif webpage_bytes.startswith(b'\xff\xfe'):
527 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
528 content_type = urlh.headers.get('Content-Type', '')
529 webpage_bytes = urlh.read()
530 if prefix is not None:
531 webpage_bytes = prefix + webpage_bytes
533 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
534 if self._downloader.params.get('dump_intermediate_pages', False):
536 url = url_or_request.get_full_url()
537 except AttributeError:
539 self.to_screen('Dumping request to ' + url)
540 dump = base64.b64encode(webpage_bytes).decode('ascii')
541 self._downloader.to_screen(dump)
542 if self._downloader.params.get('write_pages', False):
544 url = url_or_request.get_full_url()
545 except AttributeError:
547 basen = '%s_%s' % (video_id, url)
549 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
550 basen = basen[:240 - len(h)] + h
551 raw_filename = basen + '.dump'
552 filename = sanitize_filename(raw_filename, restricted=True)
553 self.to_screen('Saving request to ' + filename)
554 # Working around MAX_PATH limitation on Windows (see
555 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
556 if compat_os_name == 'nt':
557 absfilepath = os.path.abspath(filename)
558 if len(absfilepath) > 259:
559 filename = '\\\\?\\' + absfilepath
560 with open(filename, 'wb') as outf:
561 outf.write(webpage_bytes)
564 content = webpage_bytes.decode(encoding, 'replace')
566 content = webpage_bytes.decode('utf-8', 'replace')
568 if ('<title>Access to this site is blocked</title>' in content and
569 'Websense' in content[:512]):
570 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
571 blocked_iframe = self._html_search_regex(
572 r'<iframe src="([^"]+)"', content,
573 'Websense information URL', default=None)
575 msg += ' Visit %s for more details' % blocked_iframe
576 raise ExtractorError(msg, expected=True)
577 if '<title>The URL you requested has been blocked</title>' in content[:512]:
579 'Access to this webpage has been blocked by Indian censorship. '
580 'Use a VPN or proxy server (with --proxy) to route around it.')
581 block_msg = self._html_search_regex(
582 r'</h1><p>(.*?)</p>',
583 content, 'block message', default=None)
585 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
586 raise ExtractorError(msg, expected=True)
590 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
591 """ Returns the data of the page as a string """
594 while success is False:
596 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
598 except compat_http_client.IncompleteRead as e:
600 if try_count >= tries:
602 self._sleep(timeout, video_id)
609 def _download_xml(self, url_or_request, video_id,
610 note='Downloading XML', errnote='Unable to download XML',
611 transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
612 """Return the xml as an xml.etree.ElementTree.Element"""
613 xml_string = self._download_webpage(
614 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
615 if xml_string is False:
618 xml_string = transform_source(xml_string)
619 return compat_etree_fromstring(xml_string.encode('utf-8'))
621 def _download_json(self, url_or_request, video_id,
622 note='Downloading JSON metadata',
623 errnote='Unable to download JSON metadata',
624 transform_source=None,
625 fatal=True, encoding=None, data=None, headers={}, query={}):
626 json_string = self._download_webpage(
627 url_or_request, video_id, note, errnote, fatal=fatal,
628 encoding=encoding, data=data, headers=headers, query=query)
629 if (not fatal) and json_string is False:
631 return self._parse_json(
632 json_string, video_id, transform_source=transform_source, fatal=fatal)
634 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
636 json_string = transform_source(json_string)
638 return json.loads(json_string)
639 except ValueError as ve:
640 errmsg = '%s: Failed to parse JSON ' % video_id
642 raise ExtractorError(errmsg, cause=ve)
644 self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward msg to the downloader's warning channel, prefixed with
    the extractor name and, when given, the video id."""
    id_prefix = '%s: ' % video_id if video_id is not None else ''
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, id_prefix, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Report webpage download (status line '<video_id>: Downloading webpage')."""
    self.to_screen('%s: Downloading webpage' % video_id)
def report_age_confirmation(self):
    """Report attempt to confirm age (for age-gated content)."""
    self.to_screen('Confirming age')
def report_login(self):
    """Report attempt to log in."""
    self.to_screen('Logging in')
672 def raise_login_required(msg='This video is only available for registered users'):
673 raise ExtractorError(
674 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
    # Abort with the dedicated geo-restriction error; `countries` (list
    # of country codes the video is available in, if known) lets the
    # caller retry with a faked X-Forwarded-For IP.
    raise GeoRestrictedError(msg, countries=countries)
681 # Methods for following #608
683 def url_result(url, ie=None, video_id=None, video_title=None):
684 """Returns a URL that points to a page that should be processed"""
685 # TODO: ie should be the class used for getting the info
686 video_info = {'_type': 'url',
689 if video_id is not None:
690 video_info['id'] = video_id
691 if video_title is not None:
692 video_info['title'] = video_title
696 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
697 """Returns a playlist"""
698 video_info = {'_type': 'playlist',
701 video_info['id'] = playlist_id
703 video_info['title'] = playlist_title
704 if playlist_description:
705 video_info['description'] = playlist_description
708 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
710 Perform a regex search on the given string, using a single or a list of
711 patterns returning the first matching group.
712 In case of failure return a default value or raise a WARNING or a
713 RegexNotFoundError, depending on fatal, specifying the field name.
715 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
716 mobj = re.search(pattern, string, flags)
719 mobj = re.search(p, string, flags)
723 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
724 _name = '\033[0;34m%s\033[0m' % name
730 # return the first matching group
731 return next(g for g in mobj.groups() if g is not None)
733 return mobj.group(group)
734 elif default is not NO_DEFAULT:
737 raise RegexNotFoundError('Unable to extract %s' % _name)
739 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
742 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
744 Like _search_regex, but strips HTML tags and unescapes entities.
746 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
748 return clean_html(res).strip()
752 def _get_netrc_login_info(self, netrc_machine=None):
755 netrc_machine = netrc_machine or self._NETRC_MACHINE
757 if self._downloader.params.get('usenetrc', False):
759 info = netrc.netrc().authenticators(netrc_machine)
764 raise netrc.NetrcParseError(
765 'No authenticators for %s' % netrc_machine)
766 except (IOError, netrc.NetrcParseError) as err:
767 self._downloader.report_warning(
768 'parsing .netrc: %s' % error_to_compat_str(err))
770 return username, password
772 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
774 Get the login info as (username, password)
775 First look for the manually specified credentials using username_option
776 and password_option as keys in params dictionary. If no such credentials
777 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
779 If there's no info available, return (None, None)
781 if self._downloader is None:
784 downloader_params = self._downloader.params
786 # Attempt to use provided username and password or .netrc data
787 if downloader_params.get(username_option) is not None:
788 username = downloader_params[username_option]
789 password = downloader_params[password_option]
791 username, password = self._get_netrc_login_info(netrc_machine)
793 return username, password
795 def _get_tfa_info(self, note='two-factor verification code'):
797 Get the two-factor authentication info
798 TODO - asking the user will be required for sms/phone verify
799 currently just uses the command line option
800 If there's no info available, return None
802 if self._downloader is None:
804 downloader_params = self._downloader.params
806 if downloader_params.get('twofactor') is not None:
807 return downloader_params['twofactor']
809 return compat_getpass('Type %s and press [Return]: ' % note)
811 # Helper functions for extracting OpenGraph info
813 def _og_regexes(prop):
814 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
815 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
816 % {'prop': re.escape(prop)})
817 template = r'<meta[^>]+?%s[^>]+?%s'
819 template % (property_re, content_re),
820 template % (content_re, property_re),
824 def _meta_regex(prop):
825 return r'''(?isx)<meta
826 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
827 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
829 def _og_search_property(self, prop, html, name=None, **kargs):
830 if not isinstance(prop, (list, tuple)):
833 name = 'OpenGraph %s' % prop[0]
836 og_regexes.extend(self._og_regexes(p))
837 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
840 return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    # Convenience wrapper: og:image, reported as 'thumbnail URL'; never
    # fatal since many pages lack it.
    return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
    # og:description; non-fatal since descriptions are optional metadata.
    return self._og_search_property('description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
    # og:title; fatal by default (title is a mandatory field).
    return self._og_search_property('title', html, **kargs)
851 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
852 regexes = self._og_regexes('video') + self._og_regexes('video:url')
854 regexes = self._og_regexes('video:secure_url') + regexes
855 return self._html_search_regex(regexes, html, name, **kargs)
def _og_search_url(self, html, **kargs):
    # og:url — canonical URL of the page as declared by the site.
    return self._og_search_property('url', html, **kargs)
860 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
861 if not isinstance(name, (list, tuple)):
863 if display_name is None:
864 display_name = name[0]
865 return self._html_search_regex(
866 [self._meta_regex(n) for n in name],
867 html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
    # Dublin Core 'dc.creator' meta tag, reported as 'uploader'.
    return self._html_search_meta('dc.creator', html, 'uploader')
872 def _rta_search(self, html):
873 # See http://www.rtalabel.org/index.php?content=howtofaq#single
874 if re.search(r'(?ix)<meta\s+name="rating"\s+'
875 r' content="RTA-5042-1996-1400-1577-RTA"',
880 def _media_rating_search(self, html):
881 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
882 rating = self._html_search_meta('rating', html)
894 return RATING_TABLE.get(rating.lower())
896 def _family_friendly_search(self, html):
897 # See http://schema.org/VideoObject
898 family_friendly = self._html_search_meta('isFamilyFriendly', html)
900 if not family_friendly:
909 return RATING_TABLE.get(family_friendly.lower())
def _twitter_search_player(self, html):
    # Twitter Card player URL (twitter:player meta tag).
    return self._html_search_meta('twitter:player', html,
                                  'twitter card player')
915 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
916 json_ld = self._search_regex(
917 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
918 html, 'JSON-LD', group='json_ld', **kwargs)
919 default = kwargs.get('default', NO_DEFAULT)
921 return default if default is not NO_DEFAULT else {}
922 # JSON-LD may be malformed and thus `fatal` should be respected.
923 # At the same time `default` may be passed that assumes `fatal=False`
924 # for _search_regex. Let's simulate the same behavior here as well.
925 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
926 return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
928 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
929 if isinstance(json_ld, compat_str):
930 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
934 if not isinstance(json_ld, (list, tuple, dict)):
936 if isinstance(json_ld, dict):
939 if e.get('@context') == 'http://schema.org':
940 item_type = e.get('@type')
941 if expected_type is not None and expected_type != item_type:
943 if item_type == 'TVEpisode':
945 'episode': unescapeHTML(e.get('name')),
946 'episode_number': int_or_none(e.get('episodeNumber')),
947 'description': unescapeHTML(e.get('description')),
949 part_of_season = e.get('partOfSeason')
950 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
951 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
952 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
953 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
954 info['series'] = unescapeHTML(part_of_series.get('name'))
955 elif item_type == 'Article':
957 'timestamp': parse_iso8601(e.get('datePublished')),
958 'title': unescapeHTML(e.get('headline')),
959 'description': unescapeHTML(e.get('articleBody')),
961 elif item_type == 'VideoObject':
963 'url': e.get('contentUrl'),
964 'title': unescapeHTML(e.get('name')),
965 'description': unescapeHTML(e.get('description')),
966 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
967 'duration': parse_duration(e.get('duration')),
968 'timestamp': unified_timestamp(e.get('uploadDate')),
969 'filesize': float_or_none(e.get('contentSize')),
970 'tbr': int_or_none(e.get('bitrate')),
971 'width': int_or_none(e.get('width')),
972 'height': int_or_none(e.get('height')),
975 return dict((k, v) for k, v in info.items() if v is not None)
978 def _hidden_inputs(html):
979 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
981 for input in re.findall(r'(?i)(<input[^>]+>)', html):
982 attrs = extract_attributes(input)
985 if attrs.get('type') not in ('hidden', 'submit'):
987 name = attrs.get('name') or attrs.get('id')
988 value = attrs.get('value')
989 if name and value is not None:
990 hidden_inputs[name] = value
993 def _form_hidden_inputs(self, form_id, html):
994 form = self._search_regex(
995 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
996 html, '%s form' % form_id, group='form')
997 return self._hidden_inputs(form)
999 def _sort_formats(self, formats, field_preference=None):
1001 raise ExtractorError('No video formats found')
1004 # Automatically determine tbr when missing based on abr and vbr (improves
1005 # formats sorting in some cases)
1006 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1007 f['tbr'] = f['abr'] + f['vbr']
1009 def _formats_key(f):
1010 # TODO remove the following workaround
1011 from ..utils import determine_ext
1012 if not f.get('ext') and 'url' in f:
1013 f['ext'] = determine_ext(f['url'])
1015 if isinstance(field_preference, (list, tuple)):
1018 if f.get(field) is not None
1019 else ('' if field == 'format_id' else -1)
1020 for field in field_preference)
1022 preference = f.get('preference')
1023 if preference is None:
1025 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
1028 protocol = f.get('protocol') or determine_protocol(f)
1029 proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1031 if f.get('vcodec') == 'none': # audio only
1033 if self._downloader.params.get('prefer_free_formats'):
1034 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1036 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1039 audio_ext_preference = ORDER.index(f['ext'])
1041 audio_ext_preference = -1
1043 if f.get('acodec') == 'none': # video only
1045 if self._downloader.params.get('prefer_free_formats'):
1046 ORDER = ['flv', 'mp4', 'webm']
1048 ORDER = ['webm', 'flv', 'mp4']
1050 ext_preference = ORDER.index(f['ext'])
1053 audio_ext_preference = 0
1057 f.get('language_preference') if f.get('language_preference') is not None else -1,
1058 f.get('quality') if f.get('quality') is not None else -1,
1059 f.get('tbr') if f.get('tbr') is not None else -1,
1060 f.get('filesize') if f.get('filesize') is not None else -1,
1061 f.get('vbr') if f.get('vbr') is not None else -1,
1062 f.get('height') if f.get('height') is not None else -1,
1063 f.get('width') if f.get('width') is not None else -1,
1066 f.get('abr') if f.get('abr') is not None else -1,
1067 audio_ext_preference,
1068 f.get('fps') if f.get('fps') is not None else -1,
1069 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1070 f.get('source_preference') if f.get('source_preference') is not None else -1,
1071 f.get('format_id') if f.get('format_id') is not None else '',
1073 formats.sort(key=_formats_key)
1075 def _check_formats(self, formats, video_id):
1077 formats[:] = filter(
1078 lambda f: self._is_valid_url(
1080 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1084 def _remove_duplicate_formats(formats):
1088 if f['url'] not in format_urls:
1089 format_urls.add(f['url'])
1090 unique_formats.append(f)
1091 formats[:] = unique_formats
1093 def _is_valid_url(self, url, video_id, item='video', headers={}):
1094 url = self._proto_relative_url(url, scheme='http:')
1095 # For now assume non HTTP(S) URLs always valid
1096 if not (url.startswith('http://') or url.startswith('https://')):
1099 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1101 except ExtractorError as e:
1102 if isinstance(e.cause, compat_urllib_error.URLError):
1104 '%s: %s URL is invalid, skipping' % (video_id, item))
1108 def http_scheme(self):
1109 """ Either "http:" or "https:", depending on the user's preferences """
1112 if self._downloader.params.get('prefer_insecure', False)
1115 def _proto_relative_url(self, url, scheme=None):
1118 if url.startswith('//'):
1120 scheme = self.http_scheme()
1125 def _sleep(self, timeout, video_id, msg_template=None):
1126 if msg_template is None:
1127 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1128 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1132 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1133 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1134 fatal=True, m3u8_id=None):
1135 manifest = self._download_xml(
1136 manifest_url, video_id, 'Downloading f4m manifest',
1137 'Unable to download f4m manifest',
1138 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1139 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1140 transform_source=transform_source,
1143 if manifest is False:
1146 return self._parse_f4m_formats(
1147 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1148 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1150 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1151 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1152 fatal=True, m3u8_id=None):
1153 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1154 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1155 if akamai_pv is not None and ';' in akamai_pv.text:
1156 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1157 if playerVerificationChallenge.strip() != '':
1161 manifest_version = '1.0'
1162 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1164 manifest_version = '2.0'
1165 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1166 # Remove unsupported DRM protected media from final formats
1167 # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1168 media_nodes = remove_encrypted_media(media_nodes)
1171 base_url = xpath_text(
1172 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1173 'base URL', default=None)
1175 base_url = base_url.strip()
1177 bootstrap_info = xpath_element(
1178 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1179 'bootstrap info', default=None)
1182 mime_type = xpath_text(
1183 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1184 'base URL', default=None)
1185 if mime_type and mime_type.startswith('audio/'):
1188 for i, media_el in enumerate(media_nodes):
1189 tbr = int_or_none(media_el.attrib.get('bitrate'))
1190 width = int_or_none(media_el.attrib.get('width'))
1191 height = int_or_none(media_el.attrib.get('height'))
1192 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1193 # If <bootstrapInfo> is present, the specified f4m is a
1194 # stream-level manifest, and only set-level manifests may refer to
1195 # external resources. See section 11.4 and section 4 of F4M spec
1196 if bootstrap_info is None:
1198 # @href is introduced in 2.0, see section 11.6 of F4M spec
1199 if manifest_version == '2.0':
1200 media_url = media_el.attrib.get('href')
1201 if media_url is None:
1202 media_url = media_el.attrib.get('url')
1206 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1207 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1208 # If media_url is itself a f4m manifest do the recursive extraction
1209 # since bitrates in parent manifest (this one) and media_url manifest
1210 # may differ leading to inability to resolve the format by requested
1211 # bitrate in f4m downloader
1212 ext = determine_ext(manifest_url)
1214 f4m_formats = self._extract_f4m_formats(
1215 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1216 transform_source=transform_source, fatal=fatal)
1217 # Sometimes stream-level manifest contains single media entry that
1218 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1219 # At the same time parent's media entry in set-level manifest may
1220 # contain it. We will copy it from parent in such cases.
1221 if len(f4m_formats) == 1:
1224 'tbr': f.get('tbr') or tbr,
1225 'width': f.get('width') or width,
1226 'height': f.get('height') or height,
1227 'format_id': f.get('format_id') if not tbr else format_id,
1230 formats.extend(f4m_formats)
1233 formats.extend(self._extract_m3u8_formats(
1234 manifest_url, video_id, 'mp4', preference=preference,
1235 m3u8_id=m3u8_id, fatal=fatal))
1238 'format_id': format_id,
1239 'url': manifest_url,
1240 'manifest_url': manifest_url,
1241 'ext': 'flv' if bootstrap_info is not None else None,
1246 'preference': preference,
1250 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1252 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1256 'preference': preference - 100 if preference else -100,
1257 'resolution': 'multiple',
1258 'format_note': 'Quality selection URL',
1261 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1262 entry_protocol='m3u8', preference=None,
1263 m3u8_id=None, note=None, errnote=None,
1264 fatal=True, live=False):
1266 res = self._download_webpage_handle(
1268 note=note or 'Downloading m3u8 information',
1269 errnote=errnote or 'Failed to download m3u8 information',
1273 m3u8_doc, urlh = res
1274 m3u8_url = urlh.geturl()
1276 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1279 formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1281 format_url = lambda u: (
1283 if re.match(r'^https?://', u)
1284 else compat_urlparse.urljoin(m3u8_url, u))
1286 # We should try extracting formats only from master playlists [1], i.e.
1287 # playlists that describe available qualities. On the other hand media
1288 # playlists [2] should be returned as is since they contain just the media
1289 # without qualities renditions.
1290 # Fortunately, master playlist can be easily distinguished from media
1291 # playlist based on particular tags availability. As of [1, 2] master
1292 # playlist tags MUST NOT appear in a media playist and vice versa.
1293 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1294 # and MUST NOT appear in master playlist thus we can clearly detect media
1295 # playlist with this criterion.
1296 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1297 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1298 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1299 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1302 'format_id': m3u8_id,
1304 'protocol': entry_protocol,
1305 'preference': preference,
1307 audio_in_video_stream = {}
1310 for line in m3u8_doc.splitlines():
1311 if line.startswith('#EXT-X-STREAM-INF:'):
1312 last_info = parse_m3u8_attributes(line)
1313 elif line.startswith('#EXT-X-MEDIA:'):
1314 media = parse_m3u8_attributes(line)
1315 media_type = media.get('TYPE')
1316 if media_type in ('VIDEO', 'AUDIO'):
1317 group_id = media.get('GROUP-ID')
1318 media_url = media.get('URI')
1321 for v in (group_id, media.get('NAME')):
1325 'format_id': '-'.join(format_id),
1326 'url': format_url(media_url),
1327 'language': media.get('LANGUAGE'),
1329 'protocol': entry_protocol,
1330 'preference': preference,
1332 if media_type == 'AUDIO':
1333 f['vcodec'] = 'none'
1334 if group_id and not audio_in_video_stream.get(group_id):
1335 audio_in_video_stream[group_id] = False
1338 # When there is no URI in EXT-X-MEDIA let this tag's
1339 # data be used by regular URI lines below
1341 if media_type == 'AUDIO' and group_id:
1342 audio_in_video_stream[group_id] = True
1343 elif line.startswith('#') or not line.strip():
1346 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1349 format_id.append(m3u8_id)
1350 # Despite specification does not mention NAME attribute for
1351 # EXT-X-STREAM-INF it still sometimes may be present
1352 stream_name = last_info.get('NAME') or last_media.get('NAME')
1353 # Bandwidth of live streams may differ over time thus making
1354 # format_id unpredictable. So it's better to keep provided
1357 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1358 manifest_url = format_url(line.strip())
1360 'format_id': '-'.join(format_id),
1361 'url': manifest_url,
1362 'manifest_url': manifest_url,
1365 'fps': float_or_none(last_info.get('FRAME-RATE')),
1366 'protocol': entry_protocol,
1367 'preference': preference,
1369 resolution = last_info.get('RESOLUTION')
1371 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1373 f['width'] = int(mobj.group('width'))
1374 f['height'] = int(mobj.group('height'))
1375 # Unified Streaming Platform
1377 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1379 abr, vbr = mobj.groups()
1380 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1385 f.update(parse_codecs(last_info.get('CODECS')))
1386 if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
1387 # TODO: update acodec for audio only formats with the same GROUP-ID
1388 f['acodec'] = 'none'
1395 def _xpath_ns(path, namespace=None):
1399 for c in path.split('/'):
1400 if not c or c == '.':
1403 out.append('{%s}%s' % (namespace, c))
1404 return '/'.join(out)
1406 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1407 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1413 namespace = self._parse_smil_namespace(smil)
1415 return self._parse_smil_formats(
1416 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1418 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1419 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1422 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1424 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1425 return self._download_xml(
1426 smil_url, video_id, 'Downloading SMIL file',
1427 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1429 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1430 namespace = self._parse_smil_namespace(smil)
1432 formats = self._parse_smil_formats(
1433 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1434 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1436 video_id = os.path.splitext(url_basename(smil_url))[0]
1440 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1441 name = meta.attrib.get('name')
1442 content = meta.attrib.get('content')
1443 if not name or not content:
1445 if not title and name == 'title':
1447 elif not description and name in ('description', 'abstract'):
1448 description = content
1449 elif not upload_date and name == 'date':
1450 upload_date = unified_strdate(content)
1453 'id': image.get('type'),
1454 'url': image.get('src'),
1455 'width': int_or_none(image.get('width')),
1456 'height': int_or_none(image.get('height')),
1457 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1461 'title': title or video_id,
1462 'description': description,
1463 'upload_date': upload_date,
1464 'thumbnails': thumbnails,
1466 'subtitles': subtitles,
1469 def _parse_smil_namespace(self, smil):
1470 return self._search_regex(
1471 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1473 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1475 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1476 b = meta.get('base') or meta.get('httpBase')
1487 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1488 for medium in media:
1489 src = medium.get('src')
1490 if not src or src in srcs:
1494 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1495 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1496 width = int_or_none(medium.get('width'))
1497 height = int_or_none(medium.get('height'))
1498 proto = medium.get('proto')
1499 ext = medium.get('ext')
1500 src_ext = determine_ext(src)
1501 streamer = medium.get('streamer') or base
1503 if proto == 'rtmp' or streamer.startswith('rtmp'):
1509 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1511 'filesize': filesize,
1515 if transform_rtmp_url:
1516 streamer, src = transform_rtmp_url(streamer, src)
1517 formats[-1].update({
1523 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1524 src_url = src_url.strip()
1526 if proto == 'm3u8' or src_ext == 'm3u8':
1527 m3u8_formats = self._extract_m3u8_formats(
1528 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1529 if len(m3u8_formats) == 1:
1531 m3u8_formats[0].update({
1532 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1537 formats.extend(m3u8_formats)
1540 if src_ext == 'f4m':
1545 'plugin': 'flowplayer-3.2.0.1',
1547 f4m_url += '&' if '?' in f4m_url else '?'
1548 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1549 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1552 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1556 'ext': ext or src_ext or 'flv',
1557 'format_id': 'http-%d' % (bitrate or http_count),
1559 'filesize': filesize,
1567 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1570 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1571 src = textstream.get('src')
1572 if not src or src in urls:
1575 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1576 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1577 subtitles.setdefault(lang, []).append({
1583 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1584 xspf = self._download_xml(
1585 playlist_url, playlist_id, 'Downloading xpsf playlist',
1586 'Unable to download xspf manifest', fatal=fatal)
1589 return self._parse_xspf(xspf, playlist_id)
1591 def _parse_xspf(self, playlist, playlist_id):
1593 'xspf': 'http://xspf.org/ns/0/',
1594 's1': 'http://static.streamone.nl/player/ns/0',
1598 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1600 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1601 description = xpath_text(
1602 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1603 thumbnail = xpath_text(
1604 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1605 duration = float_or_none(
1606 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1609 'url': location.text,
1610 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1611 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1612 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1613 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1614 self._sort_formats(formats)
1619 'description': description,
1620 'thumbnail': thumbnail,
1621 'duration': duration,
1626 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1627 res = self._download_webpage_handle(
1629 note=note or 'Downloading MPD manifest',
1630 errnote=errnote or 'Failed to download MPD manifest',
1635 mpd_base_url = base_url(urlh.geturl())
1637 return self._parse_mpd_formats(
1638 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1639 formats_dict=formats_dict, mpd_url=mpd_url)
1641 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1643 Parse formats from MPD manifest.
1645 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1646 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1647 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1649 if mpd_doc.get('type') == 'dynamic':
1652 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1655 return self._xpath_ns(path, namespace)
1657 def is_drm_protected(element):
1658 return element.find(_add_ns('ContentProtection')) is not None
1660 def extract_multisegment_info(element, ms_parent_info):
1661 ms_info = ms_parent_info.copy()
1663 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1664 # common attributes and elements. We will only extract relevant
1666 def extract_common(source):
1667 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1668 if segment_timeline is not None:
1669 s_e = segment_timeline.findall(_add_ns('S'))
1671 ms_info['total_number'] = 0
1674 r = int(s.get('r', 0))
1675 ms_info['total_number'] += 1 + r
1676 ms_info['s'].append({
1677 't': int(s.get('t', 0)),
1678 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1679 'd': int(s.attrib['d']),
1682 start_number = source.get('startNumber')
1684 ms_info['start_number'] = int(start_number)
1685 timescale = source.get('timescale')
1687 ms_info['timescale'] = int(timescale)
1688 segment_duration = source.get('duration')
1689 if segment_duration:
1690 ms_info['segment_duration'] = int(segment_duration)
1692 def extract_Initialization(source):
1693 initialization = source.find(_add_ns('Initialization'))
1694 if initialization is not None:
1695 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1697 segment_list = element.find(_add_ns('SegmentList'))
1698 if segment_list is not None:
1699 extract_common(segment_list)
1700 extract_Initialization(segment_list)
1701 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1703 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1705 segment_template = element.find(_add_ns('SegmentTemplate'))
1706 if segment_template is not None:
1707 extract_common(segment_template)
1708 media = segment_template.get('media')
1710 ms_info['media'] = media
1711 initialization = segment_template.get('initialization')
1713 ms_info['initialization'] = initialization
1715 extract_Initialization(segment_template)
1718 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1720 for period in mpd_doc.findall(_add_ns('Period')):
1721 period_duration = parse_duration(period.get('duration')) or mpd_duration
1722 period_ms_info = extract_multisegment_info(period, {
1726 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1727 if is_drm_protected(adaptation_set):
1729 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1730 for representation in adaptation_set.findall(_add_ns('Representation')):
1731 if is_drm_protected(representation):
1733 representation_attrib = adaptation_set.attrib.copy()
1734 representation_attrib.update(representation.attrib)
1735 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1736 mime_type = representation_attrib['mimeType']
1737 content_type = mime_type.split('/')[0]
1738 if content_type == 'text':
1739 # TODO implement WebVTT downloading
1741 elif content_type == 'video' or content_type == 'audio':
1743 for element in (representation, adaptation_set, period, mpd_doc):
1744 base_url_e = element.find(_add_ns('BaseURL'))
1745 if base_url_e is not None:
1746 base_url = base_url_e.text + base_url
1747 if re.match(r'^https?://', base_url):
1749 if mpd_base_url and not re.match(r'^https?://', base_url):
1750 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1752 base_url = mpd_base_url + base_url
1753 representation_id = representation_attrib.get('id')
1754 lang = representation_attrib.get('lang')
1755 url_el = representation.find(_add_ns('BaseURL'))
1756 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1757 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1759 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1761 'manifest_url': mpd_url,
1762 'ext': mimetype2ext(mime_type),
1763 'width': int_or_none(representation_attrib.get('width')),
1764 'height': int_or_none(representation_attrib.get('height')),
1765 'tbr': int_or_none(bandwidth, 1000),
1766 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1767 'fps': int_or_none(representation_attrib.get('frameRate')),
1768 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1769 'format_note': 'DASH %s' % content_type,
1770 'filesize': filesize,
1772 f.update(parse_codecs(representation_attrib.get('codecs')))
1773 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1775 def prepare_template(template_name, identifiers):
1776 t = representation_ms_info[template_name]
1777 t = t.replace('$RepresentationID$', representation_id)
1778 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1779 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1780 t.replace('$$', '$')
1783 # @initialization is a regular template like @media one
1784 # so it should be handled just the same way (see
1785 # https://github.com/rg3/youtube-dl/issues/11605)
1786 if 'initialization' in representation_ms_info:
1787 initialization_template = prepare_template(
1789 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1790 # $Time$ shall not be included for @initialization thus
1791 # only $Bandwidth$ remains
1793 representation_ms_info['initialization_url'] = initialization_template % {
1794 'Bandwidth': bandwidth,
1797 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1799 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1801 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1802 # can't be used at the same time
1803 if '%(Number' in media_template and 's' not in representation_ms_info:
1804 segment_duration = None
1805 if 'total_number' not in representation_ms_info and 'segment_duration':
1806 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1807 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1808 representation_ms_info['fragments'] = [{
1809 'url': media_template % {
1810 'Number': segment_number,
1811 'Bandwidth': bandwidth,
1813 'duration': segment_duration,
1814 } for segment_number in range(
1815 representation_ms_info['start_number'],
1816 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1818 # $Number*$ or $Time$ in media template with S list available
1819 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1820 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1821 representation_ms_info['fragments'] = []
1824 segment_number = representation_ms_info['start_number']
1826 def add_segment_url():
1827 segment_url = media_template % {
1828 'Time': segment_time,
1829 'Bandwidth': bandwidth,
1830 'Number': segment_number,
1832 representation_ms_info['fragments'].append({
1834 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1837 for num, s in enumerate(representation_ms_info['s']):
1838 segment_time = s.get('t') or segment_time
1842 for r in range(s.get('r', 0)):
1843 segment_time += segment_d
1846 segment_time += segment_d
1847 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1849 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1850 # or any YouTube dashsegments video
1853 timescale = representation_ms_info['timescale']
1854 for s in representation_ms_info['s']:
1855 duration = float_or_none(s['d'], timescale)
1856 for r in range(s.get('r', 0) + 1):
1858 'url': representation_ms_info['segment_urls'][segment_index],
1859 'duration': duration,
1862 representation_ms_info['fragments'] = fragments
1863 # NB: MPD manifest may contain direct URLs to unfragmented media.
1864 # No fragments key is present in this case.
1865 if 'fragments' in representation_ms_info:
1868 'protocol': 'http_dash_segments',
1870 if 'initialization_url' in representation_ms_info:
1871 initialization_url = representation_ms_info['initialization_url']
1872 if not f.get('url'):
1873 f['url'] = initialization_url
1874 f['fragments'].append({'url': initialization_url})
1875 f['fragments'].extend(representation_ms_info['fragments'])
1876 for fragment in f['fragments']:
1877 fragment['url'] = urljoin(base_url, fragment['url'])
1879 existing_format = next(
1880 fo for fo in formats
1881 if fo['format_id'] == representation_id)
1882 except StopIteration:
1883 full_info = formats_dict.get(representation_id, {}).copy()
1885 formats.append(full_info)
1887 existing_format.update(f)
1889 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1892 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1893 res = self._download_webpage_handle(
1895 note=note or 'Downloading ISM manifest',
1896 errnote=errnote or 'Failed to download ISM manifest',
1902 return self._parse_ism_formats(
1903 compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
# Parse a Smooth Streaming manifest document into youtube-dl format dicts.
# NOTE(review): this excerpt elides a number of original lines (e.g. the
# early return for live/DRM manifests, fragment bookkeeping setup, and the
# formats.append(...) scaffolding) — gaps are flagged inline.
1905 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
# Live streams and Protection (DRM) manifests are not supported; the
# (elided) body of this branch presumably bails out early — TODO confirm.
1906 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1909 duration = int(ism_doc.attrib['Duration'])
# Manifest-level timescale; 10,000,000 (100ns ticks) when absent.
1910 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1913 for stream in ism_doc.findall('StreamIndex'):
1914 stream_type = stream.get('Type')
# Only audio/video StreamIndex elements are handled (no text tracks).
1915 if stream_type not in ('video', 'audio'):
1917 url_pattern = stream.attrib['Url']
# A stream may override the manifest-level timescale.
1918 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1919 stream_name = stream.get('Name')
1920 for track in stream.findall('QualityLevel'):
1921 fourcc = track.get('FourCC')
1922 # TODO: add support for WVC1 and WMAP
1923 if fourcc not in ('H264', 'AVC1', 'AACL'):
1924 self.report_warning('%s is not a supported codec' % fourcc)
1926 tbr = int(track.attrib['Bitrate']) // 1000
1927 width = int_or_none(track.get('MaxWidth'))
1928 height = int_or_none(track.get('MaxHeight'))
1929 sampling_rate = int_or_none(track.get('SamplingRate'))
# Substitute the bitrate into the {Bitrate}/{bitrate} URL template and
# resolve it against the manifest URL.
1931 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1932 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
# Walk the <c> fragment timeline; fragment_ctx is initialized in lines
# elided from this excerpt — presumably {'time': 0} — TODO confirm.
1938 stream_fragments = stream.findall('c')
1939 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1940 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1941 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1942 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
# When 'd' is absent, derive the duration from the next fragment's start
# time (or the total duration for the last fragment), split over repeats.
1943 if not fragment_ctx['duration']:
1945 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1947 next_fragment_time = duration
1948 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1949 for _ in range(fragment_repeat):
# Each repeat yields one fragment URL with {start time} substituted.
1951 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1952 'duration': fragment_ctx['duration'] / stream_timescale,
1954 fragment_ctx['time'] += fragment_ctx['duration']
# format_id is assembled from ism_id / stream name / bitrate when present.
1958 format_id.append(ism_id)
1960 format_id.append(stream_name)
1961 format_id.append(compat_str(tbr))
1964 'format_id': '-'.join(format_id),
1966 'manifest_url': ism_url,
1967 'ext': 'ismv' if stream_type == 'video' else 'isma',
1971 'asr': sampling_rate,
# FourCC doubles as the codec id; the opposite codec field is 'none'.
1972 'vcodec': 'none' if stream_type == 'audio' else fourcc,
1973 'acodec': 'none' if stream_type == 'video' else fourcc,
1975 'fragments': fragments,
# Extra parameters consumed by the ISM downloader, not by callers.
1976 '_download_params': {
1977 'duration': duration,
1978 'timescale': stream_timescale,
1979 'width': width or 0,
1980 'height': height or 0,
1982 'codec_private_data': track.get('CodecPrivateData'),
1983 'sampling_rate': sampling_rate,
1984 'channels': int_or_none(track.get('Channels', 2)),
1985 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
1986 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
# Scrape <video>/<audio> tags (and their <source>/<track> children) out of a
# webpage and build media entries (formats + subtitles) for each.
# NOTE(review): this excerpt elides several original lines (entries/media_info
# initialization, some branch keywords); gaps are flagged inline.
1991 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
# Resolve possibly-relative media URLs against the page URL.
1992 def absolute_url(video_url):
1993 return compat_urlparse.urljoin(base_url, video_url)
# Turn a MIME type attribute (optionally with codecs=...) into a partial
# format dict carrying 'ext' and codec fields.
1995 def parse_content_type(content_type):
1996 if not content_type:
1998 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2000 mimetype, codecs = ctr.groups()
2001 f = parse_codecs(codecs)
2002 f['ext'] = mimetype2ext(mimetype)
# Classify a source URL: manifests (HLS/DASH) expand into multiple formats
# (is_plain_url False); otherwise a single plain-URL format is produced.
2006 def _media_formats(src, cur_media_type):
2007 full_url = absolute_url(src)
2008 ext = determine_ext(full_url)
2010 is_plain_url = False
2011 formats = self._extract_m3u8_formats(
2012 full_url, video_id, ext='mp4',
2013 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
2015 is_plain_url = False
2016 formats = self._extract_mpd_formats(
2017 full_url, video_id, mpd_id=mpd_id)
2022 'vcodec': 'none' if cur_media_type == 'audio' else None,
2024 return is_plain_url, formats
# Self-closing tags first (no inner content), then open/close pairs.
2027 media_tags = [(media_tag, media_type, '')
2028 for media_tag, media_type
2029 in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
2030 media_tags.extend(re.findall(
2031 # We only allow video|audio followed by a whitespace or '>'.
2032 # Allowing more characters may end up in significant slow down (see
2033 # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2034 # http://www.porntrex.com/maps/videositemap.xml).
2035 r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2036 for media_tag, media_type, media_content in media_tags:
2041 media_attributes = extract_attributes(media_tag)
2042 src = media_attributes.get('src')
# Direct src attribute on the media tag itself.
2044 _, formats = _media_formats(src, media_type)
2045 media_info['formats'].extend(formats)
2046 media_info['thumbnail'] = media_attributes.get('poster')
# Nested <source> tags inside the media element.
2048 for source_tag in re.findall(r'<source[^>]+>', media_content):
2049 source_attributes = extract_attributes(source_tag)
2050 src = source_attributes.get('src')
2053 is_plain_url, formats = _media_formats(src, media_type)
# For plain URLs, merge the type="..." info into the single format.
2055 f = parse_content_type(source_attributes.get('type'))
2056 f.update(formats[0])
2057 media_info['formats'].append(f)
2059 media_info['formats'].extend(formats)
# <track> tags carry subtitles/captions.
2060 for track_tag in re.findall(r'<track[^>]+>', media_content):
2061 track_attributes = extract_attributes(track_tag)
2062 kind = track_attributes.get('kind')
2063 if not kind or kind in ('subtitles', 'captions'):
2064 src = track_attributes.get('src')
2067 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2068 media_info['subtitles'].setdefault(lang, []).append({
2069 'url': absolute_url(src),
# Only emit an entry when something usable was found.
2071 if media_info['formats'] or media_info['subtitles']:
2072 entries.append(media_info)
2075 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2077 hdcore_sign = 'hdcore=3.7.0'
2078 f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2079 hds_host = hosts.get('hds')
2081 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2082 if 'hdcore=' not in f4m_url:
2083 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2084 f4m_formats = self._extract_f4m_formats(
2085 f4m_url, video_id, f4m_id='hds', fatal=False)
2086 for entry in f4m_formats:
2087 entry.update({'extra_param_to_segment_url': hdcore_sign})
2088 formats.extend(f4m_formats)
2089 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2090 hls_host = hosts.get('hls')
2092 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2093 formats.extend(self._extract_m3u8_formats(
2094 m3u8_url, video_id, 'mp4', 'm3u8_native',
2095 m3u8_id='hls', fatal=False))
2098 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2099 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2100 url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
2101 http_base_url = 'http' + url_base
2103 if 'm3u8' not in skip_protocols:
2104 formats.extend(self._extract_m3u8_formats(
2105 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2106 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2107 if 'f4m' not in skip_protocols:
2108 formats.extend(self._extract_f4m_formats(
2109 http_base_url + '/manifest.f4m',
2110 video_id, f4m_id='hds', fatal=False))
2111 if 'dash' not in skip_protocols:
2112 formats.extend(self._extract_mpd_formats(
2113 http_base_url + '/manifest.mpd',
2114 video_id, mpd_id='dash', fatal=False))
2115 if re.search(r'(?:/smil:|\.smil)', url_base):
2116 if 'smil' not in skip_protocols:
2117 rtmp_formats = self._extract_smil_formats(
2118 http_base_url + '/jwplayer.smil',
2119 video_id, fatal=False)
2120 for rtmp_format in rtmp_formats:
2121 rtsp_format = rtmp_format.copy()
2122 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2123 del rtsp_format['play_path']
2124 del rtsp_format['ext']
2125 rtsp_format.update({
2126 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2127 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2130 formats.extend([rtmp_format, rtsp_format])
2132 for protocol in ('rtmp', 'rtsp'):
2133 if protocol not in skip_protocols:
2135 'url': protocol + url_base,
2136 'format_id': protocol,
2137 'protocol': protocol,
# Locate the options blob of a jwplayer("...").setup({...}) call in a page.
# NOTE(review): this excerpt elides the re.search(...) call line and the
# mobj None-check; only the pattern and the success return are visible.
2142 def _find_jwplayer_data(webpage):
2144 r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
2147 return mobj.group('options')
2149 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2150 jwplayer_data = self._parse_json(
2151 self._find_jwplayer_data(webpage), video_id,
2152 transform_source=js_to_json)
2153 return self._parse_jwplayer_data(
2154 jwplayer_data, video_id, *args, **kwargs)
# Convert a (possibly legacy-shaped) JWPlayer config dict into youtube-dl
# entries: formats, subtitles and basic metadata per playlist item.
# NOTE(review): this excerpt elides several original lines (entries/formats
# initialization, some elif/else keywords, the entries.append scaffolding);
# gaps are flagged inline.
2156 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2157 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2158 # JWPlayer backward compatibility: flattened playlists
2159 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2160 if 'playlist' not in jwplayer_data:
2161 jwplayer_data = {'playlist': [jwplayer_data]}
2165 # JWPlayer backward compatibility: single playlist item
2166 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2167 if not isinstance(jwplayer_data['playlist'], list):
2168 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2170 for video_data in jwplayer_data['playlist']:
2171 # JWPlayer backward compatibility: flattened sources
2172 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2173 if 'sources' not in video_data:
2174 video_data['sources'] = [video_data]
2176 this_video_id = video_id or video_data['mediaid']
2179 for source in video_data['sources']:
2180 source_url = self._proto_relative_url(source['file'])
2182 source_url = compat_urlparse.urljoin(base_url, source_url)
2183 source_type = source.get('type') or ''
2184 ext = mimetype2ext(source_type) or determine_ext(source_url)
# Manifest sources expand into multiple formats; plain sources get one.
2185 if source_type == 'hls' or ext == 'm3u8':
2186 formats.extend(self._extract_m3u8_formats(
2187 source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
2189 formats.extend(self._extract_mpd_formats(
2190 source_url, this_video_id, mpd_id=mpd_id, fatal=False))
2191 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2192 elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2199 height = int_or_none(source.get('height'))
2201 # Often no height is provided but there is a label in
2202 # format like 1080p.
2203 height = int_or_none(self._search_regex(
2204 r'^(\d{3,})[pP]$', source.get('label') or '',
2205 'height', default=None))
2208 'width': int_or_none(source.get('width')),
# RTMP sources are split into base URL + play_path at the mp4:/mp3:/flv:
# prefix, as expected by the flash player's RTMPMediaProvider.
2212 if source_url.startswith('rtmp'):
2213 a_format['ext'] = 'flv'
2215 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2216 # of jwplayer.flash.swf
2217 rtmp_url_parts = re.split(
2218 r'((?:mp4|mp3|flv):)', source_url, 1)
2219 if len(rtmp_url_parts) == 3:
2220 rtmp_url, prefix, play_path = rtmp_url_parts
2223 'play_path': prefix + play_path,
2226 a_format.update(rtmp_params)
2227 formats.append(a_format)
2228 self._sort_formats(formats)
# Caption tracks become subtitles keyed by label (default 'en').
2231 tracks = video_data.get('tracks')
2232 if tracks and isinstance(tracks, list):
2233 for track in tracks:
2234 if track.get('kind') != 'captions':
2236 track_url = urljoin(base_url, track.get('file'))
2239 subtitles.setdefault(track.get('label') or 'en', []).append({
2240 'url': self._proto_relative_url(track_url)
# Per-item metadata; title is mandatory only when require_title is true.
2244 'id': this_video_id,
2245 'title': video_data['title'] if require_title else video_data.get('title'),
2246 'description': video_data.get('description'),
2247 'thumbnail': self._proto_relative_url(video_data.get('image')),
2248 'timestamp': int_or_none(video_data.get('pubdate')),
2249 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2250 'subtitles': subtitles,
# Single entry -> returned directly (elided); multiple -> playlist.
2253 if len(entries) == 1:
2256 return self.playlist_result(entries)
2258 def _live_title(self, name):
2259 """ Generate the title for a live video """
2260 now = datetime.datetime.now()
2261 now_str = now.strftime('%Y-%m-%d %H:%M')
2262 return name + ' ' + now_str
2264 def _int(self, v, name, fatal=False, **kwargs):
2265 res = int_or_none(v, **kwargs)
2266 if 'get_attr' in kwargs:
2267 print(getattr(v, kwargs['get_attr']))
2269 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2271 raise ExtractorError(msg)
2273 self._downloader.report_warning(msg)
# Parse *v* as a float via float_or_none(); raise (fatal) or warn on failure.
# NOTE(review): this excerpt elides the `if res is None:` / `if fatal:` /
# `else:` / `return res` wiring around the visible lines.
2276 def _float(self, v, name, fatal=False, **kwargs):
2277 res = float_or_none(v, **kwargs)
2279 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2281 raise ExtractorError(msg)
2283 self._downloader.report_warning(msg)
2286 def _set_cookie(self, domain, name, value, expire_time=None):
2287 cookie = compat_cookiejar.Cookie(
2288 0, name, value, None, None, domain, None,
2289 None, '/', True, False, expire_time, '', None, None, None)
2290 self._downloader.cookiejar.set_cookie(cookie)
2292 def _get_cookies(self, url):
2293 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2294 req = sanitized_Request(url)
2295 self._downloader.cookiejar.add_cookie_header(req)
2296 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
# Yield this extractor's test cases, from _TEST or _TESTS (mutually
# exclusive), each annotated with the extractor's name.
# NOTE(review): this excerpt elides the branch structure and the yield
# statements between the visible lines.
2298 def get_testcases(self, include_onlymatching=False):
2299 t = getattr(self, '_TEST', None)
# Having both _TEST and _TESTS is a programming error.
2301 assert not hasattr(self, '_TESTS'), \
2302 '%s has _TEST and _TESTS' % type(self).__name__
2305 tests = getattr(self, '_TESTS', [])
# only_matching test cases are skipped unless explicitly requested.
2307 if not include_onlymatching and t.get('only_matching', False):
# Tag each test with the extractor name (class name minus the 'IE' suffix).
2309 t['name'] = type(self).__name__[:-len('IE')]
# NOTE(review): this excerpt elides a couple of original lines (e.g. the
# `continue`/early-return inside the loop after the `if not is_restricted:`
# check).
2312 def is_suitable(self, age_limit):
2313 """ Test whether the extractor is generally suitable for the given
2314 age limit (i.e. pornographic sites are not, all others usually are) """
2316 any_restricted = False
# Inspect the declared test cases' age_limit metadata as a proxy for the
# site's content rating.
2317 for tc in self.get_testcases(include_onlymatching=False):
# For playlist tests, judge by the first playlist entry.
2318 if tc.get('playlist', []):
2319 tc = tc['playlist'][0]
2320 is_restricted = age_restricted(
2321 tc.get('info_dict', {}).get('age_limit'), age_limit)
2322 if not is_restricted:
2324 any_restricted = any_restricted or is_restricted
# Suitable only if no test case was age-restricted beyond the limit.
2325 return not any_restricted
# Dispatch to _get_subtitles() only when the user asked for subtitles
# (writesubtitles/listsubtitles); the fallback return (presumably an empty
# dict) is elided from this excerpt.
2327 def extract_subtitles(self, *args, **kwargs):
2328 if (self._downloader.params.get('writesubtitles', False) or
2329 self._downloader.params.get('listsubtitles')):
2330 return self._get_subtitles(*args, **kwargs)
2333 def _get_subtitles(self, *args, **kwargs):
2334 raise NotImplementedError('This method must be implemented by subclasses')
# NOTE(review): the final `return ret` appears elided from this excerpt;
# the @staticmethod decorator (no self/cls parameter) is also not visible.
2337 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2338 """ Merge subtitle items for one language. Items with duplicated URLs
2339 will be dropped. """
# Seed the result with list1, then append only list2 items whose URL was
# not already present in list1.
2340 list1_urls = set([item['url'] for item in subtitle_list1])
2341 ret = list(subtitle_list1)
2342 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
# NOTE(review): the @classmethod decorator and the trailing `return ret`
# appear elided from this excerpt.
2346 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2347 """ Merge two subtitle dictionaries, language by language. """
2348 ret = dict(subtitle_dict1)
# Languages only in dict2 merge against an empty list; shared languages
# are merged item-by-item with URL de-duplication.
2349 for lang in subtitle_dict2:
2350 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
# Dispatch to _get_automatic_captions() only when the user asked for them
# (writeautomaticsub/listsubtitles); the fallback return (presumably an
# empty dict) is elided from this excerpt.
2353 def extract_automatic_captions(self, *args, **kwargs):
2354 if (self._downloader.params.get('writeautomaticsub', False) or
2355 self._downloader.params.get('listsubtitles')):
2356 return self._get_automatic_captions(*args, **kwargs)
2359 def _get_automatic_captions(self, *args, **kwargs):
2360 raise NotImplementedError('This method must be implemented by subclasses')
2362 def mark_watched(self, *args, **kwargs):
2363 if (self._downloader.params.get('mark_watched', False) and
2364 (self._get_login_info()[0] is not None or
2365 self._downloader.params.get('cookiefile') is not None)):
2366 self._mark_watched(*args, **kwargs)
2368 def _mark_watched(self, *args, **kwargs):
2369 raise NotImplementedError('This method must be implemented by subclasses')
# Build extra request headers used for geo verification.
# NOTE(review): the `headers = {}` initialization and the `return headers`
# appear elided from this excerpt.
2371 def geo_verification_headers(self):
# When the user configured a geo verification proxy, advertise it via the
# Ytdl-request-proxy header.
2373 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2374 if geo_verification_proxy:
2375 headers['Ytdl-request-proxy'] = geo_verification_proxy
2378 def _generic_id(self, url):
2379 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2381 def _generic_title(self, url):
2382 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
# SearchInfoExtractor: shared plumbing for "<_SEARCH_KEY>[N|all]:query" URLs.
# NOTE(review): the docstring's opening/closing triple-quote lines appear
# elided from this excerpt.
2385 class SearchInfoExtractor(InfoExtractor):
2387 Base class for paged search queries extractors.
2388 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
2389 Instances should define _SEARCH_KEY and _MAX_RESULTS.
2393 def _make_valid_url(cls):
2394 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2397 def suitable(cls, url):
2398 return re.match(cls._make_valid_url(), url) is not None
# Parse a search "URL" (<key><prefix>:<query>) and run the search with the
# requested result count.
# NOTE(review): this excerpt elides the `if mobj is None:` check, the
# `if prefix == '':` branch keyword and the `else:`/`n = int(prefix)` lines.
2400 def _real_extract(self, query):
2401 mobj = re.match(self._make_valid_url(), query)
2403 raise ExtractorError('Invalid search query "%s"' % query)
2405 prefix = mobj.group('prefix')
2406 query = mobj.group('query')
# Empty prefix -> single result.
2408 return self._get_n_results(query, 1)
# 'all' -> the extractor's maximum.
2409 elif prefix == 'all':
2410 return self._get_n_results(query, self._MAX_RESULTS)
# Otherwise the prefix is a (presumably int-parsed, elided) count n;
# non-positive counts are rejected, oversized ones clamped with a warning.
2414 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
2415 elif n > self._MAX_RESULTS:
2416 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
2417 n = self._MAX_RESULTS
2418 return self._get_n_results(query, n)
2420 def _get_n_results(self, query, n):
2421 """Get a specified number of results for a query"""
2422 raise NotImplementedError('This method must be implemented by subclasses')
2425 def SEARCH_KEY(self):
2426 return self._SEARCH_KEY