_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_unquote,
  25     compat_urllib_parse_urlencode,
  26     compat_urllib_request,
  27     compat_urlparse,
  28 )
  29 from ..downloader.f4m import remove_encrypted_media
  30 from ..utils import (
  31     NO_DEFAULT,
  32     age_restricted,
  33     base_url,
  34     bug_reports_message,
  35     clean_html,
  36     compiled_regex_type,
  37     determine_ext,
  38     error_to_compat_str,
  39     ExtractorError,
  40     fix_xml_ampersands,
  41     float_or_none,
  42     int_or_none,
  43     parse_iso8601,
  44     RegexNotFoundError,
  45     sanitize_filename,
  46     sanitized_Request,
  47     unescapeHTML,
  48     unified_strdate,
  49     unified_timestamp,
  50     url_basename,
  51     xpath_element,
  52     xpath_text,
  53     xpath_with_ns,
  54     determine_protocol,
  55     parse_duration,
  56     mimetype2ext,
  57     update_Request,
  58     update_url_query,
  59     parse_m3u8_attributes,
  60     extract_attributes,
  61     parse_codecs,
  62     urljoin,
  63 )
  64
  65
  66 class InfoExtractor(object):
  67     """Information Extractor class.
  68
  69     Information extractors are the classes that, given a URL, extract
  70     information about the video (or videos) the URL refers to. This
  71     information includes the real video URL, the video title, author and
  72     others. The information is stored in a dictionary which is then
  73     passed to the YoutubeDL. The YoutubeDL processes this
  74     information possibly downloading the video to the file system, among
  75     other possible outcomes.
  76
  77     The type field determines the type of the result.
  78     By far the most common value (and the default if _type is missing) is
  79     "video", which indicates a single video.
  80
  81     For a video, the dictionaries must include the following fields:
  82
  83     id:             Video identifier.
  84     title:          Video title, unescaped.
  85
  86     Additionally, it must contain either a formats entry or a url one:
  87
  88     formats:        A list of dictionaries for each format available, ordered
  89                     from worst to best quality.
  90
  91                     Potential fields:
  92                     * url        Mandatory. The URL of the video file
  93                     * manifest_url
  94                                  The URL of the manifest file in case of
  95                                  fragmented media (DASH, hls, hds)
  96                     * ext        Will be calculated from URL if missing
  97                     * format     A human-readable description of the format
  98                                  ("mp4 container with h264/opus").
  99                                  Calculated from the format_id, width, height.
 100                                  and format_note fields if missing.
 101                     * format_id  A short description of the format
 102                                  ("mp4_h264_opus" or "19").
 103                                 Technically optional, but strongly recommended.
 104                     * format_note Additional info about the format
 105                                  ("3D" or "DASH video")
 106                     * width      Width of the video, if known
 107                     * height     Height of the video, if known
 108                     * resolution Textual description of width and height
 109                     * tbr        Average bitrate of audio and video in KBit/s
 110                     * abr        Average audio bitrate in KBit/s
 111                     * acodec     Name of the audio codec in use
 112                     * asr        Audio sampling rate in Hertz
 113                     * vbr        Average video bitrate in KBit/s
 114                     * fps        Frame rate
 115                     * vcodec     Name of the video codec in use
 116                     * container  Name of the container format
 117                     * filesize   The number of bytes, if known in advance
 118                     * filesize_approx  An estimate for the number of bytes
 119                     * player_url SWF Player URL (used for rtmpdump).
 120                     * protocol   The protocol that will be used for the actual
 121                                  download, lower-case.
 122                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 123                                  "m3u8", "m3u8_native" or "http_dash_segments".
 124                     * fragments  A list of fragments of the fragmented media,
 125                                  with the following entries:
 126                                  * "url" (mandatory) - fragment's URL
 127                                  * "duration" (optional, int or float)
 128                                  * "filesize" (optional, int)
 129                     * preference Order number of this format. If this field is
 130                                  present and not None, the formats get sorted
 131                                  by this field, regardless of all other values.
 132                                  -1 for default (order by other properties),
 133                                  -2 or smaller for less than default.
 134                                  < -1000 to hide the format (if there is
 135                                     another one which is strictly better)
 136                     * language   Language code, e.g. "de" or "en-US".
 137                     * language_preference  Is this in the language mentioned in
 138                                  the URL?
 139                                  10 if it's what the URL is about,
 140                                  -1 for default (don't know),
 141                                  -10 otherwise, other values reserved for now.
 142                     * quality    Order number of the video quality of this
 143                                  format, irrespective of the file format.
 144                                  -1 for default (order by other properties),
 145                                  -2 or smaller for less than default.
 146                     * source_preference  Order number for this video source
 147                                   (quality takes higher priority)
 148                                  -1 for default (order by other properties),
 149                                  -2 or smaller for less than default.
 150                     * http_headers  A dictionary of additional HTTP headers
 151                                  to add to the request.
 152                     * stretched_ratio  If given and not 1, indicates that the
 153                                  video's pixels are not square.
 154                                  width : height ratio as float.
 155                     * no_resume  The server does not support resuming the
 156                                  (HTTP or RTMP) download. Boolean.
 157
 158     url:            Final video URL.
 159     ext:            Video filename extension.
 160     format:         The video format, defaults to ext (used for --get-format)
 161     player_url:     SWF Player URL (used for rtmpdump).
 162
 163     The following fields are optional:
 164
 165     alt_title:      A secondary title of the video.
 166     display_id      An alternative identifier for the video, not necessarily
 167                     unique, but available before title. Typically, id is
 168                     something like "4234987", title "Dancing naked mole rats",
 169                     and display_id "dancing-naked-mole-rats"
 170     thumbnails:     A list of dictionaries, with the following entries:
 171                         * "id" (optional, string) - Thumbnail format ID
 172                         * "url"
 173                         * "preference" (optional, int) - quality of the image
 174                         * "width" (optional, int)
 175                         * "height" (optional, int)
  176                         * "resolution" (optional, string "{width}x{height}",
 177                                         deprecated)
 178                         * "filesize" (optional, int)
 179     thumbnail:      Full URL to a video thumbnail image.
 180     description:    Full video description.
 181     uploader:       Full name of the video uploader.
 182     license:        License name the video is licensed under.
 183     creator:        The creator of the video.
 184     release_date:   The date (YYYYMMDD) when the video was released.
 185     timestamp:      UNIX timestamp of the moment the video became available.
 186     upload_date:    Video upload date (YYYYMMDD).
 187                     If not explicitly set, calculated from timestamp.
 188     uploader_id:    Nickname or id of the video uploader.
 189     uploader_url:   Full URL to a personal webpage of the video uploader.
 190     location:       Physical location where the video was filmed.
 191     subtitles:      The available subtitles as a dictionary in the format
 192                     {tag: subformats}. "tag" is usually a language code, and
 193                     "subformats" is a list sorted from lower to higher
 194                     preference, each element is a dictionary with the "ext"
 195                     entry and one of:
 196                         * "data": The subtitles file contents
 197                         * "url": A URL pointing to the subtitles file
 198                     "ext" will be calculated from URL if missing
 199     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 200                     automatically generated captions
 201     duration:       Length of the video in seconds, as an integer or float.
 202     view_count:     How many users have watched the video on the platform.
 203     like_count:     Number of positive ratings of the video
 204     dislike_count:  Number of negative ratings of the video
 205     repost_count:   Number of reposts of the video
  206     average_rating: Average rating given by users, the scale used depends on the webpage
 207     comment_count:  Number of comments on the video
 208     comments:       A list of comments, each with one or more of the following
 209                     properties (all but one of text or html optional):
 210                         * "author" - human-readable name of the comment author
 211                         * "author_id" - user ID of the comment author
 212                         * "id" - Comment ID
 213                         * "html" - Comment as HTML
 214                         * "text" - Plain text of the comment
 215                         * "timestamp" - UNIX timestamp of comment
 216                         * "parent" - ID of the comment this one is replying to.
 217                                      Set to "root" to indicate that this is a
 218                                      comment to the original video.
 219     age_limit:      Age restriction for the video, as an integer (years)
 220     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 221                     should allow to get the same result again. (It will be set
 222                     by YoutubeDL if it's missing)
 223     categories:     A list of categories that the video falls in, for example
 224                     ["Sports", "Berlin"]
 225     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 226     is_live:        True, False, or None (=unknown). Whether this video is a
 227                     live stream that goes on instead of a fixed-length video.
 228     start_time:     Time in seconds where the reproduction should start, as
 229                     specified in the URL.
 230     end_time:       Time in seconds where the reproduction should end, as
 231                     specified in the URL.
 232
 233     The following fields should only be used when the video belongs to some logical
 234     chapter or section:
 235
 236     chapter:        Name or title of the chapter the video belongs to.
 237     chapter_number: Number of the chapter the video belongs to, as an integer.
 238     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 239
 240     The following fields should only be used when the video is an episode of some
 241     series, programme or podcast:
 242
 243     series:         Title of the series or programme the video episode belongs to.
 244     season:         Title of the season the video episode belongs to.
 245     season_number:  Number of the season the video episode belongs to, as an integer.
 246     season_id:      Id of the season the video episode belongs to, as a unicode string.
 247     episode:        Title of the video episode. Unlike mandatory video title field,
 248                     this field should denote the exact title of the video episode
 249                     without any kind of decoration.
 250     episode_number: Number of the video episode within a season, as an integer.
 251     episode_id:     Id of the video episode, as a unicode string.
 252
 253     The following fields should only be used when the media is a track or a part of
 254     a music album:
 255
 256     track:          Title of the track.
 257     track_number:   Number of the track within an album or a disc, as an integer.
 258     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 259                     as a unicode string.
 260     artist:         Artist(s) of the track.
 261     genre:          Genre(s) of the track.
 262     album:          Title of the album the track belongs to.
 263     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 264     album_artist:   List of all artists appeared on the album (e.g.
 265                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 266                     and compilations).
 267     disc_number:    Number of the disc or other physical medium the track belongs to,
 268                     as an integer.
 269     release_year:   Year (YYYY) when the album was released.
 270
 271     Unless mentioned otherwise, the fields should be Unicode strings.
 272
 273     Unless mentioned otherwise, None is equivalent to absence of information.
 274
 275
 276     _type "playlist" indicates multiple videos.
 277     There must be a key "entries", which is a list, an iterable, or a PagedList
 278     object, each element of which is a valid dictionary by this specification.
 279
 280     Additionally, playlists can have "title", "description" and "id" attributes
 281     with the same semantics as videos (see above).
 282
 283
 284     _type "multi_video" indicates that there are multiple videos that
  285     form a single show, for example multiple acts of an opera or TV episode.
 286     It must have an entries key like a playlist and contain all the keys
 287     required for a video at the same time.
 288
 289
 290     _type "url" indicates that the video must be extracted from another
 291     location, possibly by a different extractor. Its only required key is:
 292     "url" - the next URL to extract.
 293     The key "ie_key" can be set to the class name (minus the trailing "IE",
 294     e.g. "Youtube") if the extractor class is known in advance.
 295     Additionally, the dictionary may have any properties of the resolved entity
 296     known in advance, for example "title" if the title of the referred video is
 297     known ahead of time.
 298
 299
 300     _type "url_transparent" entities have the same specification as "url", but
 301     indicate that the given additional information is more precise than the one
 302     associated with the resolved URL.
 303     This is useful when a site employs a video service that hosts the video and
 304     its technical metadata, but that video service does not embed a useful
 305     title, description etc.
 306
 307
 308     Subclasses of this one should re-define the _real_initialize() and
 309     _real_extract() methods and define a _VALID_URL regexp.
 310     Probably, they should also be added to the list of extractors.
 311
 312     Finally, the _WORKING attribute should be set to False for broken IEs
 313     in order to warn the users and skip the tests.
 314     """
 315
 316     _ready = False
 317     _downloader = None
 318     _WORKING = True
 319
 320     def __init__(self, downloader=None):
 321         """Constructor. Receives an optional downloader."""
 322         self._ready = False
 323         self.set_downloader(downloader)
 324
 325     @classmethod
 326     def suitable(cls, url):
 327         """Receives a URL and returns True if suitable for this IE."""
 328
 329         # This does not use has/getattr intentionally - we want to know whether
 330         # we have cached the regexp for *this* class, whereas getattr would also
 331         # match the superclass
 332         if '_VALID_URL_RE' not in cls.__dict__:
 333             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 334         return cls._VALID_URL_RE.match(url) is not None
 335
 336     @classmethod
 337     def _match_id(cls, url):
 338         if '_VALID_URL_RE' not in cls.__dict__:
 339             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 340         m = cls._VALID_URL_RE.match(url)
 341         assert m
 342         return m.group('id')
 343
 344     @classmethod
 345     def working(cls):
 346         """Getter method for _WORKING."""
 347         return cls._WORKING
 348
 349     def initialize(self):
 350         """Initializes an instance (authentication, etc)."""
 351         if not self._ready:
 352             self._real_initialize()
 353             self._ready = True
 354
 355     def extract(self, url):
 356         """Extracts URL information and returns it in list of dicts."""
 357         try:
 358             self.initialize()
 359             return self._real_extract(url)
 360         except ExtractorError:
 361             raise
 362         except compat_http_client.IncompleteRead as e:
 363             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 364         except (KeyError, StopIteration) as e:
 365             raise ExtractorError('An extractor error has occurred.', cause=e)
 366
 367     def set_downloader(self, downloader):
 368         """Sets the downloader for this IE."""
 369         self._downloader = downloader
 370
 371     def _real_initialize(self):
 372         """Real initialization process. Redefine in subclasses."""
 373         pass
 374
 375     def _real_extract(self, url):
 376         """Real extraction process. Redefine in subclasses."""
 377         pass
 378
 379     @classmethod
 380     def ie_key(cls):
 381         """A string for getting the InfoExtractor with get_info_extractor"""
 382         return compat_str(cls.__name__[:-2])
 383
 384     @property
 385     def IE_NAME(self):
 386         return compat_str(type(self).__name__[:-2])
 387
 388     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 389         """ Returns the response handle """
 390         if note is None:
 391             self.report_download_webpage(video_id)
 392         elif note is not False:
 393             if video_id is None:
 394                 self.to_screen('%s' % (note,))
 395             else:
 396                 self.to_screen('%s: %s' % (video_id, note))
 397         if isinstance(url_or_request, compat_urllib_request.Request):
 398             url_or_request = update_Request(
 399                 url_or_request, data=data, headers=headers, query=query)
 400         else:
 401             if query:
 402                 url_or_request = update_url_query(url_or_request, query)
 403             if data is not None or headers:
 404                 url_or_request = sanitized_Request(url_or_request, data, headers)
 405         try:
 406             return self._downloader.urlopen(url_or_request)
 407         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 408             if errnote is False:
 409                 return False
 410             if errnote is None:
 411                 errnote = 'Unable to download webpage'
 412
 413             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 414             if fatal:
 415                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 416             else:
 417                 self._downloader.report_warning(errmsg)
 418                 return False
 419
 420     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 421         """ Returns a tuple (page content as string, URL handle) """
 422         # Strip hashes from the URL (#1038)
 423         if isinstance(url_or_request, (compat_str, str)):
 424             url_or_request = url_or_request.partition('#')[0]
 425
 426         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 427         if urlh is False:
 428             assert not fatal
 429             return False
 430         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 431         return (content, urlh)
 432
 433     @staticmethod
 434     def _guess_encoding_from_content(content_type, webpage_bytes):
 435         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 436         if m:
 437             encoding = m.group(1)
 438         else:
 439             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 440                           webpage_bytes[:1024])
 441             if m:
 442                 encoding = m.group(1).decode('ascii')
 443             elif webpage_bytes.startswith(b'\xff\xfe'):
 444                 encoding = 'utf-16'
 445             else:
 446                 encoding = 'utf-8'
 447
 448         return encoding
 449
 450     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 451         content_type = urlh.headers.get('Content-Type', '')
 452         webpage_bytes = urlh.read()
 453         if prefix is not None:
 454             webpage_bytes = prefix + webpage_bytes
 455         if not encoding:
 456             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 457         if self._downloader.params.get('dump_intermediate_pages', False):
 458             try:
 459                 url = url_or_request.get_full_url()
 460             except AttributeError:
 461                 url = url_or_request
 462             self.to_screen('Dumping request to ' + url)
 463             dump = base64.b64encode(webpage_bytes).decode('ascii')
 464             self._downloader.to_screen(dump)
 465         if self._downloader.params.get('write_pages', False):
 466             try:
 467                 url = url_or_request.get_full_url()
 468             except AttributeError:
 469                 url = url_or_request
 470             basen = '%s_%s' % (video_id, url)
 471             if len(basen) > 240:
 472                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 473                 basen = basen[:240 - len(h)] + h
 474             raw_filename = basen + '.dump'
 475             filename = sanitize_filename(raw_filename, restricted=True)
 476             self.to_screen('Saving request to ' + filename)
 477             # Working around MAX_PATH limitation on Windows (see
 478             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 479             if compat_os_name == 'nt':
 480                 absfilepath = os.path.abspath(filename)
 481                 if len(absfilepath) > 259:
 482                     filename = '\\\\?\\' + absfilepath
 483             with open(filename, 'wb') as outf:
 484                 outf.write(webpage_bytes)
 485
 486         try:
 487             content = webpage_bytes.decode(encoding, 'replace')
 488         except LookupError:
 489             content = webpage_bytes.decode('utf-8', 'replace')
 490
 491         if ('<title>Access to this site is blocked</title>' in content and
 492                 'Websense' in content[:512]):
 493             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 494             blocked_iframe = self._html_search_regex(
 495                 r'<iframe src="([^"]+)"', content,
 496                 'Websense information URL', default=None)
 497             if blocked_iframe:
 498                 msg += ' Visit %s for more details' % blocked_iframe
 499             raise ExtractorError(msg, expected=True)
 500         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 501             msg = (
 502                 'Access to this webpage has been blocked by Indian censorship. '
 503                 'Use a VPN or proxy server (with --proxy) to route around it.')
 504             block_msg = self._html_search_regex(
 505                 r'</h1><p>(.*?)</p>',
 506                 content, 'block message', default=None)
 507             if block_msg:
 508                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 509             raise ExtractorError(msg, expected=True)
 510
 511         return content
 512
 513     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 514         """ Returns the data of the page as a string """
 515         success = False
 516         try_count = 0
 517         while success is False:
 518             try:
 519                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 520                 success = True
 521             except compat_http_client.IncompleteRead as e:
 522                 try_count += 1
 523                 if try_count >= tries:
 524                     raise e
 525                 self._sleep(timeout, video_id)
 526         if res is False:
 527             return res
 528         else:
 529             content, _ = res
 530             return content
 531
 532     def _download_xml(self, url_or_request, video_id,
 533                       note='Downloading XML', errnote='Unable to download XML',
 534                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 535         """Return the xml as an xml.etree.ElementTree.Element"""
 536         xml_string = self._download_webpage(
 537             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 538         if xml_string is False:
 539             return xml_string
 540         if transform_source:
 541             xml_string = transform_source(xml_string)
 542         return compat_etree_fromstring(xml_string.encode('utf-8'))
 543
 544     def _download_json(self, url_or_request, video_id,
 545                        note='Downloading JSON metadata',
 546                        errnote='Unable to download JSON metadata',
 547                        transform_source=None,
 548                        fatal=True, encoding=None, data=None, headers={}, query={}):
 549         json_string = self._download_webpage(
 550             url_or_request, video_id, note, errnote, fatal=fatal,
 551             encoding=encoding, data=data, headers=headers, query=query)
 552         if (not fatal) and json_string is False:
 553             return None
 554         return self._parse_json(
 555             json_string, video_id, transform_source=transform_source, fatal=fatal)
 556
 557     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 558         if transform_source:
 559             json_string = transform_source(json_string)
 560         try:
 561             return json.loads(json_string)
 562         except ValueError as ve:
 563             errmsg = '%s: Failed to parse JSON ' % video_id
 564             if fatal:
 565                 raise ExtractorError(errmsg, cause=ve)
 566             else:
 567                 self.report_warning(errmsg + str(ve))
 568
 569     def report_warning(self, msg, video_id=None):
 570         idstr = '' if video_id is None else '%s: ' % video_id
 571         self._downloader.report_warning(
 572             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 573
 574     def to_screen(self, msg):
 575         """Print msg to screen, prefixing it with '[ie_name]'"""
 576         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 577
 578     def report_extraction(self, id_or_name):
 579         """Report information extraction."""
 580         self.to_screen('%s: Extracting information' % id_or_name)
 581
 582     def report_download_webpage(self, video_id):
 583         """Report webpage download."""
 584         self.to_screen('%s: Downloading webpage' % video_id)
 585
 586     def report_age_confirmation(self):
 587         """Report attempt to confirm age."""
 588         self.to_screen('Confirming age')
 589
 590     def report_login(self):
 591         """Report attempt to log in."""
 592         self.to_screen('Logging in')
 593
 594     @staticmethod
 595     def raise_login_required(msg='This video is only available for registered users'):
 596         raise ExtractorError(
 597             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 598             expected=True)
 599
 600     @staticmethod
 601     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 602         raise ExtractorError(
 603             '%s. You might want to use --proxy to workaround.' % msg,
 604             expected=True)
 605
 606     # Methods for following #608
 607     @staticmethod
 608     def url_result(url, ie=None, video_id=None, video_title=None):
 609         """Returns a URL that points to a page that should be processed"""
 610         # TODO: ie should be the class used for getting the info
 611         video_info = {'_type': 'url',
 612                       'url': url,
 613                       'ie_key': ie}
 614         if video_id is not None:
 615             video_info['id'] = video_id
 616         if video_title is not None:
 617             video_info['title'] = video_title
 618         return video_info
 619
 620     @staticmethod
 621     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 622         """Returns a playlist"""
 623         video_info = {'_type': 'playlist',
 624                       'entries': entries}
 625         if playlist_id:
 626             video_info['id'] = playlist_id
 627         if playlist_title:
 628             video_info['title'] = playlist_title
 629         if playlist_description:
 630             video_info['description'] = playlist_description
 631         return video_info
 632
 633     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 634         """
 635         Perform a regex search on the given string, using a single or a list of
 636         patterns returning the first matching group.
 637         In case of failure return a default value or raise a WARNING or a
 638         RegexNotFoundError, depending on fatal, specifying the field name.
 639         """
 640         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 641             mobj = re.search(pattern, string, flags)
 642         else:
 643             for p in pattern:
 644                 mobj = re.search(p, string, flags)
 645                 if mobj:
 646                     break
 647
 648         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 649             _name = '\033[0;34m%s\033[0m' % name
 650         else:
 651             _name = name
 652
 653         if mobj:
 654             if group is None:
 655                 # return the first matching group
 656                 return next(g for g in mobj.groups() if g is not None)
 657             else:
 658                 return mobj.group(group)
 659         elif default is not NO_DEFAULT:
 660             return default
 661         elif fatal:
 662             raise RegexNotFoundError('Unable to extract %s' % _name)
 663         else:
 664             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 665             return None
 666
 667     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 668         """
 669         Like _search_regex, but strips HTML tags and unescapes entities.
 670         """
 671         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 672         if res:
 673             return clean_html(res).strip()
 674         else:
 675             return res
 676
 677     def _get_netrc_login_info(self, netrc_machine=None):
 678         username = None
 679         password = None
 680         netrc_machine = netrc_machine or self._NETRC_MACHINE
 681
 682         if self._downloader.params.get('usenetrc', False):
 683             try:
 684                 info = netrc.netrc().authenticators(netrc_machine)
 685                 if info is not None:
 686                     username = info[0]
 687                     password = info[2]
 688                 else:
 689                     raise netrc.NetrcParseError(
 690                         'No authenticators for %s' % netrc_machine)
 691             except (IOError, netrc.NetrcParseError) as err:
 692                 self._downloader.report_warning(
 693                     'parsing .netrc: %s' % error_to_compat_str(err))
 694
 695         return username, password
 696
 697     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 698         """
 699         Get the login info as (username, password)
 700         First look for the manually specified credentials using username_option
 701         and password_option as keys in params dictionary. If no such credentials
 702         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 703         value.
 704         If there's no info available, return (None, None)
 705         """
 706         if self._downloader is None:
 707             return (None, None)
 708
 709         downloader_params = self._downloader.params
 710
 711         # Attempt to use provided username and password or .netrc data
 712         if downloader_params.get(username_option) is not None:
 713             username = downloader_params[username_option]
 714             password = downloader_params[password_option]
 715         else:
 716             username, password = self._get_netrc_login_info(netrc_machine)
 717
 718         return username, password
 719
 720     def _get_tfa_info(self, note='two-factor verification code'):
 721         """
 722         Get the two-factor authentication info
 723         TODO - asking the user will be required for sms/phone verify
 724         currently just uses the command line option
 725         If there's no info available, return None
 726         """
 727         if self._downloader is None:
 728             return None
 729         downloader_params = self._downloader.params
 730
 731         if downloader_params.get('twofactor') is not None:
 732             return downloader_params['twofactor']
 733
 734         return compat_getpass('Type %s and press [Return]: ' % note)
 735
 736     # Helper functions for extracting OpenGraph info
 737     @staticmethod
 738     def _og_regexes(prop):
 739         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 740         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 741                        % {'prop': re.escape(prop)})
 742         template = r'<meta[^>]+?%s[^>]+?%s'
 743         return [
 744             template % (property_re, content_re),
 745             template % (content_re, property_re),
 746         ]
 747
 748     @staticmethod
 749     def _meta_regex(prop):
 750         return r'''(?isx)<meta
 751                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 752                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 753
 754     def _og_search_property(self, prop, html, name=None, **kargs):
 755         if not isinstance(prop, (list, tuple)):
 756             prop = [prop]
 757         if name is None:
 758             name = 'OpenGraph %s' % prop[0]
 759         og_regexes = []
 760         for p in prop:
 761             og_regexes.extend(self._og_regexes(p))
 762         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 763         if escaped is None:
 764             return None
 765         return unescapeHTML(escaped)
 766
 767     def _og_search_thumbnail(self, html, **kargs):
 768         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 769
 770     def _og_search_description(self, html, **kargs):
 771         return self._og_search_property('description', html, fatal=False, **kargs)
 772
 773     def _og_search_title(self, html, **kargs):
 774         return self._og_search_property('title', html, **kargs)
 775
 776     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 777         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 778         if secure:
 779             regexes = self._og_regexes('video:secure_url') + regexes
 780         return self._html_search_regex(regexes, html, name, **kargs)
 781
 782     def _og_search_url(self, html, **kargs):
 783         return self._og_search_property('url', html, **kargs)
 784
 785     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 786         if not isinstance(name, (list, tuple)):
 787             name = [name]
 788         if display_name is None:
 789             display_name = name[0]
 790         return self._html_search_regex(
 791             [self._meta_regex(n) for n in name],
 792             html, display_name, fatal=fatal, group='content', **kwargs)
 793
 794     def _dc_search_uploader(self, html):
 795         return self._html_search_meta('dc.creator', html, 'uploader')
 796
 797     def _rta_search(self, html):
 798         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 799         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 800                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 801                      html):
 802             return 18
 803         return 0
 804
 805     def _media_rating_search(self, html):
 806         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 807         rating = self._html_search_meta('rating', html)
 808
 809         if not rating:
 810             return None
 811
 812         RATING_TABLE = {
 813             'safe for kids': 0,
 814             'general': 8,
 815             '14 years': 14,
 816             'mature': 17,
 817             'restricted': 19,
 818         }
 819         return RATING_TABLE.get(rating.lower())
 820
 821     def _family_friendly_search(self, html):
 822         # See http://schema.org/VideoObject
 823         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 824
 825         if not family_friendly:
 826             return None
 827
 828         RATING_TABLE = {
 829             '1': 0,
 830             'true': 0,
 831             '0': 18,
 832             'false': 18,
 833         }
 834         return RATING_TABLE.get(family_friendly.lower())
 835
 836     def _twitter_search_player(self, html):
 837         return self._html_search_meta('twitter:player', html,
 838                                       'twitter card player')
 839
 840     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 841         json_ld = self._search_regex(
 842             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 843             html, 'JSON-LD', group='json_ld', **kwargs)
 844         default = kwargs.get('default', NO_DEFAULT)
 845         if not json_ld:
 846             return default if default is not NO_DEFAULT else {}
 847         # JSON-LD may be malformed and thus `fatal` should be respected.
 848         # At the same time `default` may be passed that assumes `fatal=False`
 849         # for _search_regex. Let's simulate the same behavior here as well.
 850         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 851         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 852
 853     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 854         if isinstance(json_ld, compat_str):
 855             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 856         if not json_ld:
 857             return {}
 858         info = {}
 859         if not isinstance(json_ld, (list, tuple, dict)):
 860             return info
 861         if isinstance(json_ld, dict):
 862             json_ld = [json_ld]
 863         for e in json_ld:
 864             if e.get('@context') == 'http://schema.org':
 865                 item_type = e.get('@type')
 866                 if expected_type is not None and expected_type != item_type:
 867                     return info
 868                 if item_type == 'TVEpisode':
 869                     info.update({
 870                         'episode': unescapeHTML(e.get('name')),
 871                         'episode_number': int_or_none(e.get('episodeNumber')),
 872                         'description': unescapeHTML(e.get('description')),
 873                     })
 874                     part_of_season = e.get('partOfSeason')
 875                     if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 876                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 877                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
 878                     if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 879                         info['series'] = unescapeHTML(part_of_series.get('name'))
 880                 elif item_type == 'Article':
 881                     info.update({
 882                         'timestamp': parse_iso8601(e.get('datePublished')),
 883                         'title': unescapeHTML(e.get('headline')),
 884                         'description': unescapeHTML(e.get('articleBody')),
 885                     })
 886                 elif item_type == 'VideoObject':
 887                     info.update({
 888                         'url': e.get('contentUrl'),
 889                         'title': unescapeHTML(e.get('name')),
 890                         'description': unescapeHTML(e.get('description')),
 891                         'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
 892                         'duration': parse_duration(e.get('duration')),
 893                         'timestamp': unified_timestamp(e.get('uploadDate')),
 894                         'filesize': float_or_none(e.get('contentSize')),
 895                         'tbr': int_or_none(e.get('bitrate')),
 896                         'width': int_or_none(e.get('width')),
 897                         'height': int_or_none(e.get('height')),
 898                     })
 899                 break
 900         return dict((k, v) for k, v in info.items() if v is not None)
 901
 902     @staticmethod
 903     def _hidden_inputs(html):
 904         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 905         hidden_inputs = {}
 906         for input in re.findall(r'(?i)(<input[^>]+>)', html):
 907             attrs = extract_attributes(input)
 908             if not input:
 909                 continue
 910             if attrs.get('type') not in ('hidden', 'submit'):
 911                 continue
 912             name = attrs.get('name') or attrs.get('id')
 913             value = attrs.get('value')
 914             if name and value is not None:
 915                 hidden_inputs[name] = value
 916         return hidden_inputs
 917
 918     def _form_hidden_inputs(self, form_id, html):
 919         form = self._search_regex(
 920             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 921             html, '%s form' % form_id, group='form')
 922         return self._hidden_inputs(form)
 923
    def _sort_formats(self, formats, field_preference=None):
        """Sort the formats list in place, in ascending order of the key below.

        Formats with larger preference/quality/bitrate/... values end up last.
        A missing 'tbr' is filled in as abr + vbr when both are known, and a
        missing 'ext' is derived from the format's 'url'.

        When field_preference is a list or tuple of field names, the sort key
        is built from exactly these fields (in this order) instead of the
        default key.

        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied ordering: missing values sort first ('' for
            # format_id, -1 for everything else)
            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # http(s) gets 0, rtsp -0.5 and every other protocol -0.1
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            # In the ORDER lists below a later position yields a larger index
            # and therefore a later place in the sorted result; extensions not
            # listed get -1
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare element by element, so earlier entries dominate.
            # Unknown numeric fields are replaced by -1 (and format_id by '')
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 999
1000     def _check_formats(self, formats, video_id):
1001         if formats:
1002             formats[:] = filter(
1003                 lambda f: self._is_valid_url(
1004                     f['url'], video_id,
1005                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1006                 formats)
1007
1008     @staticmethod
1009     def _remove_duplicate_formats(formats):
1010         format_urls = set()
1011         unique_formats = []
1012         for f in formats:
1013             if f['url'] not in format_urls:
1014                 format_urls.add(f['url'])
1015                 unique_formats.append(f)
1016         formats[:] = unique_formats
1017
1018     def _is_valid_url(self, url, video_id, item='video'):
1019         url = self._proto_relative_url(url, scheme='http:')
1020         # For now assume non HTTP(S) URLs always valid
1021         if not (url.startswith('http://') or url.startswith('https://')):
1022             return True
1023         try:
1024             self._request_webpage(url, video_id, 'Checking %s URL' % item)
1025             return True
1026         except ExtractorError as e:
1027             if isinstance(e.cause, compat_urllib_error.URLError):
1028                 self.to_screen(
1029                     '%s: %s URL is invalid, skipping' % (video_id, item))
1030                 return False
1031             raise
1032
1033     def http_scheme(self):
1034         """ Either "http:" or "https:", depending on the user's preferences """
1035         return (
1036             'http:'
1037             if self._downloader.params.get('prefer_insecure', False)
1038             else 'https:')
1039
1040     def _proto_relative_url(self, url, scheme=None):
1041         if url is None:
1042             return url
1043         if url.startswith('//'):
1044             if scheme is None:
1045                 scheme = self.http_scheme()
1046             return scheme + url
1047         else:
1048             return url
1049
1050     def _sleep(self, timeout, video_id, msg_template=None):
1051         if msg_template is None:
1052             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1053         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1054         self.to_screen(msg)
1055         time.sleep(timeout)
1056
1057     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1058                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1059                              fatal=True, m3u8_id=None):
1060         manifest = self._download_xml(
1061             manifest_url, video_id, 'Downloading f4m manifest',
1062             'Unable to download f4m manifest',
1063             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1064             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1065             transform_source=transform_source,
1066             fatal=fatal)
1067
1068         if manifest is False:
1069             return []
1070
1071         return self._parse_f4m_formats(
1072             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1073             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1074
1075     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1076                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1077                            fatal=True, m3u8_id=None):
1078         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1079         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1080         if akamai_pv is not None and ';' in akamai_pv.text:
1081             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1082             if playerVerificationChallenge.strip() != '':
1083                 return []
1084
1085         formats = []
1086         manifest_version = '1.0'
1087         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1088         if not media_nodes:
1089             manifest_version = '2.0'
1090             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1091         # Remove unsupported DRM protected media from final formats
1092         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1093         media_nodes = remove_encrypted_media(media_nodes)
1094         if not media_nodes:
1095             return formats
1096         base_url = xpath_text(
1097             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1098             'base URL', default=None)
1099         if base_url:
1100             base_url = base_url.strip()
1101
1102         bootstrap_info = xpath_element(
1103             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1104             'bootstrap info', default=None)
1105
1106         vcodec = None
1107         mime_type = xpath_text(
1108             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1109             'base URL', default=None)
1110         if mime_type and mime_type.startswith('audio/'):
1111             vcodec = 'none'
1112
1113         for i, media_el in enumerate(media_nodes):
1114             tbr = int_or_none(media_el.attrib.get('bitrate'))
1115             width = int_or_none(media_el.attrib.get('width'))
1116             height = int_or_none(media_el.attrib.get('height'))
1117             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1118             # If <bootstrapInfo> is present, the specified f4m is a
1119             # stream-level manifest, and only set-level manifests may refer to
1120             # external resources.  See section 11.4 and section 4 of F4M spec
1121             if bootstrap_info is None:
1122                 media_url = None
1123                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1124                 if manifest_version == '2.0':
1125                     media_url = media_el.attrib.get('href')
1126                 if media_url is None:
1127                     media_url = media_el.attrib.get('url')
1128                 if not media_url:
1129                     continue
1130                 manifest_url = (
1131                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1132                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1133                 # If media_url is itself a f4m manifest do the recursive extraction
1134                 # since bitrates in parent manifest (this one) and media_url manifest
1135                 # may differ leading to inability to resolve the format by requested
1136                 # bitrate in f4m downloader
1137                 ext = determine_ext(manifest_url)
1138                 if ext == 'f4m':
1139                     f4m_formats = self._extract_f4m_formats(
1140                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1141                         transform_source=transform_source, fatal=fatal)
1142                     # Sometimes stream-level manifest contains single media entry that
1143                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1144                     # At the same time parent's media entry in set-level manifest may
1145                     # contain it. We will copy it from parent in such cases.
1146                     if len(f4m_formats) == 1:
1147                         f = f4m_formats[0]
1148                         f.update({
1149                             'tbr': f.get('tbr') or tbr,
1150                             'width': f.get('width') or width,
1151                             'height': f.get('height') or height,
1152                             'format_id': f.get('format_id') if not tbr else format_id,
1153                             'vcodec': vcodec,
1154                         })
1155                     formats.extend(f4m_formats)
1156                     continue
1157                 elif ext == 'm3u8':
1158                     formats.extend(self._extract_m3u8_formats(
1159                         manifest_url, video_id, 'mp4', preference=preference,
1160                         m3u8_id=m3u8_id, fatal=fatal))
1161                     continue
1162             formats.append({
1163                 'format_id': format_id,
1164                 'url': manifest_url,
1165                 'manifest_url': manifest_url,
1166                 'ext': 'flv' if bootstrap_info is not None else None,
1167                 'tbr': tbr,
1168                 'width': width,
1169                 'height': height,
1170                 'vcodec': vcodec,
1171                 'preference': preference,
1172             })
1173         return formats
1174
1175     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1176         return {
1177             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1178             'url': m3u8_url,
1179             'ext': ext,
1180             'protocol': 'm3u8',
1181             'preference': preference - 100 if preference else -100,
1182             'resolution': 'multiple',
1183             'format_note': 'Quality selection URL',
1184         }
1185
1186     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1187                               entry_protocol='m3u8', preference=None,
1188                               m3u8_id=None, note=None, errnote=None,
1189                               fatal=True, live=False):
1190
1191         res = self._download_webpage_handle(
1192             m3u8_url, video_id,
1193             note=note or 'Downloading m3u8 information',
1194             errnote=errnote or 'Failed to download m3u8 information',
1195             fatal=fatal)
1196         if res is False:
1197             return []
1198         m3u8_doc, urlh = res
1199         m3u8_url = urlh.geturl()
1200
1201         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1202
1203         format_url = lambda u: (
1204             u
1205             if re.match(r'^https?://', u)
1206             else compat_urlparse.urljoin(m3u8_url, u))
1207
1208         # We should try extracting formats only from master playlists [1], i.e.
1209         # playlists that describe available qualities. On the other hand media
1210         # playlists [2] should be returned as is since they contain just the media
1211         # without qualities renditions.
1212         # Fortunately, master playlist can be easily distinguished from media
1213         # playlist based on particular tags availability. As of [1, 2] master
1214         # playlist tags MUST NOT appear in a media playist and vice versa.
1215         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1216         # and MUST NOT appear in master playlist thus we can clearly detect media
1217         # playlist with this criterion.
1218         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1219         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1220         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1221         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1222             return [{
1223                 'url': m3u8_url,
1224                 'format_id': m3u8_id,
1225                 'ext': ext,
1226                 'protocol': entry_protocol,
1227                 'preference': preference,
1228             }]
1229         audio_in_video_stream = {}
1230         last_info = {}
1231         last_media = {}
1232         for line in m3u8_doc.splitlines():
1233             if line.startswith('#EXT-X-STREAM-INF:'):
1234                 last_info = parse_m3u8_attributes(line)
1235             elif line.startswith('#EXT-X-MEDIA:'):
1236                 media = parse_m3u8_attributes(line)
1237                 media_type = media.get('TYPE')
1238                 if media_type in ('VIDEO', 'AUDIO'):
1239                     group_id = media.get('GROUP-ID')
1240                     media_url = media.get('URI')
1241                     if media_url:
1242                         format_id = []
1243                         for v in (group_id, media.get('NAME')):
1244                             if v:
1245                                 format_id.append(v)
1246                         f = {
1247                             'format_id': '-'.join(format_id),
1248                             'url': format_url(media_url),
1249                             'language': media.get('LANGUAGE'),
1250                             'ext': ext,
1251                             'protocol': entry_protocol,
1252                             'preference': preference,
1253                         }
1254                         if media_type == 'AUDIO':
1255                             f['vcodec'] = 'none'
1256                             if group_id and not audio_in_video_stream.get(group_id):
1257                                 audio_in_video_stream[group_id] = False
1258                         formats.append(f)
1259                     else:
1260                         # When there is no URI in EXT-X-MEDIA let this tag's
1261                         # data be used by regular URI lines below
1262                         last_media = media
1263                         if media_type == 'AUDIO' and group_id:
1264                             audio_in_video_stream[group_id] = True
1265             elif line.startswith('#') or not line.strip():
1266                 continue
1267             else:
1268                 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1269                 format_id = []
1270                 if m3u8_id:
1271                     format_id.append(m3u8_id)
1272                 # Despite specification does not mention NAME attribute for
1273                 # EXT-X-STREAM-INF it still sometimes may be present
1274                 stream_name = last_info.get('NAME') or last_media.get('NAME')
1275                 # Bandwidth of live streams may differ over time thus making
1276                 # format_id unpredictable. So it's better to keep provided
1277                 # format_id intact.
1278                 if not live:
1279                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1280                 manifest_url = format_url(line.strip())
1281                 f = {
1282                     'format_id': '-'.join(format_id),
1283                     'url': manifest_url,
1284                     'manifest_url': manifest_url,
1285                     'tbr': tbr,
1286                     'ext': ext,
1287                     'fps': float_or_none(last_info.get('FRAME-RATE')),
1288                     'protocol': entry_protocol,
1289                     'preference': preference,
1290                 }
1291                 resolution = last_info.get('RESOLUTION')
1292                 if resolution:
1293                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1294                     if mobj:
1295                         f['width'] = int(mobj.group('width'))
1296                         f['height'] = int(mobj.group('height'))
1297                 # Unified Streaming Platform
1298                 mobj = re.search(
1299                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1300                 if mobj:
1301                     abr, vbr = mobj.groups()
1302                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1303                     f.update({
1304                         'vbr': vbr,
1305                         'abr': abr,
1306                     })
1307                 f.update(parse_codecs(last_info.get('CODECS')))
1308                 if audio_in_video_stream.get(last_info.get('AUDIO')) is False:
1309                     # TODO: update acodec for for audio only formats with the same GROUP-ID
1310                     f['acodec'] = 'none'
1311                 formats.append(f)
1312                 last_info = {}
1313                 last_media = {}
1314         return formats
1315
1316     @staticmethod
1317     def _xpath_ns(path, namespace=None):
1318         if not namespace:
1319             return path
1320         out = []
1321         for c in path.split('/'):
1322             if not c or c == '.':
1323                 out.append(c)
1324             else:
1325                 out.append('{%s}%s' % (namespace, c))
1326         return '/'.join(out)
1327
1328     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1329         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1330
1331         if smil is False:
1332             assert not fatal
1333             return []
1334
1335         namespace = self._parse_smil_namespace(smil)
1336
1337         return self._parse_smil_formats(
1338             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1339
1340     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1341         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1342         if smil is False:
1343             return {}
1344         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1345
1346     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1347         return self._download_xml(
1348             smil_url, video_id, 'Downloading SMIL file',
1349             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1350
1351     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1352         namespace = self._parse_smil_namespace(smil)
1353
1354         formats = self._parse_smil_formats(
1355             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1356         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1357
1358         video_id = os.path.splitext(url_basename(smil_url))[0]
1359         title = None
1360         description = None
1361         upload_date = None
1362         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1363             name = meta.attrib.get('name')
1364             content = meta.attrib.get('content')
1365             if not name or not content:
1366                 continue
1367             if not title and name == 'title':
1368                 title = content
1369             elif not description and name in ('description', 'abstract'):
1370                 description = content
1371             elif not upload_date and name == 'date':
1372                 upload_date = unified_strdate(content)
1373
1374         thumbnails = [{
1375             'id': image.get('type'),
1376             'url': image.get('src'),
1377             'width': int_or_none(image.get('width')),
1378             'height': int_or_none(image.get('height')),
1379         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1380
1381         return {
1382             'id': video_id,
1383             'title': title or video_id,
1384             'description': description,
1385             'upload_date': upload_date,
1386             'thumbnails': thumbnails,
1387             'formats': formats,
1388             'subtitles': subtitles,
1389         }
1390
1391     def _parse_smil_namespace(self, smil):
1392         return self._search_regex(
1393             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1394
1395     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1396         base = smil_url
1397         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1398             b = meta.get('base') or meta.get('httpBase')
1399             if b:
1400                 base = b
1401                 break
1402
1403         formats = []
1404         rtmp_count = 0
1405         http_count = 0
1406         m3u8_count = 0
1407
1408         srcs = []
1409         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1410         for medium in media:
1411             src = medium.get('src')
1412             if not src or src in srcs:
1413                 continue
1414             srcs.append(src)
1415
1416             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1417             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1418             width = int_or_none(medium.get('width'))
1419             height = int_or_none(medium.get('height'))
1420             proto = medium.get('proto')
1421             ext = medium.get('ext')
1422             src_ext = determine_ext(src)
1423             streamer = medium.get('streamer') or base
1424
1425             if proto == 'rtmp' or streamer.startswith('rtmp'):
1426                 rtmp_count += 1
1427                 formats.append({
1428                     'url': streamer,
1429                     'play_path': src,
1430                     'ext': 'flv',
1431                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1432                     'tbr': bitrate,
1433                     'filesize': filesize,
1434                     'width': width,
1435                     'height': height,
1436                 })
1437                 if transform_rtmp_url:
1438                     streamer, src = transform_rtmp_url(streamer, src)
1439                     formats[-1].update({
1440                         'url': streamer,
1441                         'play_path': src,
1442                     })
1443                 continue
1444
1445             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1446             src_url = src_url.strip()
1447
1448             if proto == 'm3u8' or src_ext == 'm3u8':
1449                 m3u8_formats = self._extract_m3u8_formats(
1450                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1451                 if len(m3u8_formats) == 1:
1452                     m3u8_count += 1
1453                     m3u8_formats[0].update({
1454                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1455                         'tbr': bitrate,
1456                         'width': width,
1457                         'height': height,
1458                     })
1459                 formats.extend(m3u8_formats)
1460                 continue
1461
1462             if src_ext == 'f4m':
1463                 f4m_url = src_url
1464                 if not f4m_params:
1465                     f4m_params = {
1466                         'hdcore': '3.2.0',
1467                         'plugin': 'flowplayer-3.2.0.1',
1468                     }
1469                 f4m_url += '&' if '?' in f4m_url else '?'
1470                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1471                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1472                 continue
1473
1474             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1475                 http_count += 1
1476                 formats.append({
1477                     'url': src_url,
1478                     'ext': ext or src_ext or 'flv',
1479                     'format_id': 'http-%d' % (bitrate or http_count),
1480                     'tbr': bitrate,
1481                     'filesize': filesize,
1482                     'width': width,
1483                     'height': height,
1484                 })
1485                 continue
1486
1487         return formats
1488
1489     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1490         urls = []
1491         subtitles = {}
1492         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1493             src = textstream.get('src')
1494             if not src or src in urls:
1495                 continue
1496             urls.append(src)
1497             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1498             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1499             subtitles.setdefault(lang, []).append({
1500                 'url': src,
1501                 'ext': ext,
1502             })
1503         return subtitles
1504
1505     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1506         xspf = self._download_xml(
1507             playlist_url, playlist_id, 'Downloading xpsf playlist',
1508             'Unable to download xspf manifest', fatal=fatal)
1509         if xspf is False:
1510             return []
1511         return self._parse_xspf(xspf, playlist_id)
1512
1513     def _parse_xspf(self, playlist, playlist_id):
1514         NS_MAP = {
1515             'xspf': 'http://xspf.org/ns/0/',
1516             's1': 'http://static.streamone.nl/player/ns/0',
1517         }
1518
1519         entries = []
1520         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1521             title = xpath_text(
1522                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1523             description = xpath_text(
1524                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1525             thumbnail = xpath_text(
1526                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1527             duration = float_or_none(
1528                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1529
1530             formats = [{
1531                 'url': location.text,
1532                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1533                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1534                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1535             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1536             self._sort_formats(formats)
1537
1538             entries.append({
1539                 'id': playlist_id,
1540                 'title': title,
1541                 'description': description,
1542                 'thumbnail': thumbnail,
1543                 'duration': duration,
1544                 'formats': formats,
1545             })
1546         return entries
1547
1548     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1549         res = self._download_webpage_handle(
1550             mpd_url, video_id,
1551             note=note or 'Downloading MPD manifest',
1552             errnote=errnote or 'Failed to download MPD manifest',
1553             fatal=fatal)
1554         if res is False:
1555             return []
1556         mpd, urlh = res
1557         mpd_base_url = base_url(urlh.geturl())
1558
1559         return self._parse_mpd_formats(
1560             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1561             formats_dict=formats_dict, mpd_url=mpd_url)
1562
1563     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1564         """
1565         Parse formats from MPD manifest.
1566         References:
1567          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1568             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1569          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1570         """
1571         if mpd_doc.get('type') == 'dynamic':
1572             return []
1573
1574         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1575
1576         def _add_ns(path):
1577             return self._xpath_ns(path, namespace)
1578
1579         def is_drm_protected(element):
1580             return element.find(_add_ns('ContentProtection')) is not None
1581
1582         def extract_multisegment_info(element, ms_parent_info):
1583             ms_info = ms_parent_info.copy()
1584
1585             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1586             # common attributes and elements.  We will only extract relevant
1587             # for us.
1588             def extract_common(source):
1589                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1590                 if segment_timeline is not None:
1591                     s_e = segment_timeline.findall(_add_ns('S'))
1592                     if s_e:
1593                         ms_info['total_number'] = 0
1594                         ms_info['s'] = []
1595                         for s in s_e:
1596                             r = int(s.get('r', 0))
1597                             ms_info['total_number'] += 1 + r
1598                             ms_info['s'].append({
1599                                 't': int(s.get('t', 0)),
1600                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1601                                 'd': int(s.attrib['d']),
1602                                 'r': r,
1603                             })
1604                 start_number = source.get('startNumber')
1605                 if start_number:
1606                     ms_info['start_number'] = int(start_number)
1607                 timescale = source.get('timescale')
1608                 if timescale:
1609                     ms_info['timescale'] = int(timescale)
1610                 segment_duration = source.get('duration')
1611                 if segment_duration:
1612                     ms_info['segment_duration'] = int(segment_duration)
1613
1614             def extract_Initialization(source):
1615                 initialization = source.find(_add_ns('Initialization'))
1616                 if initialization is not None:
1617                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1618
1619             segment_list = element.find(_add_ns('SegmentList'))
1620             if segment_list is not None:
1621                 extract_common(segment_list)
1622                 extract_Initialization(segment_list)
1623                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1624                 if segment_urls_e:
1625                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1626             else:
1627                 segment_template = element.find(_add_ns('SegmentTemplate'))
1628                 if segment_template is not None:
1629                     extract_common(segment_template)
1630                     media_template = segment_template.get('media')
1631                     if media_template:
1632                         ms_info['media_template'] = media_template
1633                     initialization = segment_template.get('initialization')
1634                     if initialization:
1635                         ms_info['initialization_url'] = initialization
1636                     else:
1637                         extract_Initialization(segment_template)
1638             return ms_info
1639
1640         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1641         formats = []
1642         for period in mpd_doc.findall(_add_ns('Period')):
1643             period_duration = parse_duration(period.get('duration')) or mpd_duration
1644             period_ms_info = extract_multisegment_info(period, {
1645                 'start_number': 1,
1646                 'timescale': 1,
1647             })
1648             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1649                 if is_drm_protected(adaptation_set):
1650                     continue
1651                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1652                 for representation in adaptation_set.findall(_add_ns('Representation')):
1653                     if is_drm_protected(representation):
1654                         continue
1655                     representation_attrib = adaptation_set.attrib.copy()
1656                     representation_attrib.update(representation.attrib)
1657                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1658                     mime_type = representation_attrib['mimeType']
1659                     content_type = mime_type.split('/')[0]
1660                     if content_type == 'text':
1661                         # TODO implement WebVTT downloading
1662                         pass
1663                     elif content_type == 'video' or content_type == 'audio':
1664                         base_url = ''
1665                         for element in (representation, adaptation_set, period, mpd_doc):
1666                             base_url_e = element.find(_add_ns('BaseURL'))
1667                             if base_url_e is not None:
1668                                 base_url = base_url_e.text + base_url
1669                                 if re.match(r'^https?://', base_url):
1670                                     break
1671                         if mpd_base_url and not re.match(r'^https?://', base_url):
1672                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1673                                 mpd_base_url += '/'
1674                             base_url = mpd_base_url + base_url
1675                         representation_id = representation_attrib.get('id')
1676                         lang = representation_attrib.get('lang')
1677                         url_el = representation.find(_add_ns('BaseURL'))
1678                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1679                         f = {
1680                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1681                             'url': base_url,
1682                             'manifest_url': mpd_url,
1683                             'ext': mimetype2ext(mime_type),
1684                             'width': int_or_none(representation_attrib.get('width')),
1685                             'height': int_or_none(representation_attrib.get('height')),
1686                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1687                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1688                             'fps': int_or_none(representation_attrib.get('frameRate')),
1689                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1690                             'format_note': 'DASH %s' % content_type,
1691                             'filesize': filesize,
1692                         }
1693                         f.update(parse_codecs(representation_attrib.get('codecs')))
1694                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1695                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1696
1697                             media_template = representation_ms_info['media_template']
1698                             media_template = media_template.replace('$RepresentationID$', representation_id)
1699                             media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
1700                             media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
1701                             media_template.replace('$$', '$')
1702
1703                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1704                             # can't be used at the same time
1705                             if '%(Number' in media_template and 's' not in representation_ms_info:
1706                                 segment_duration = None
1707                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1708                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1709                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1710                                 representation_ms_info['fragments'] = [{
1711                                     'url': media_template % {
1712                                         'Number': segment_number,
1713                                         'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
1714                                     },
1715                                     'duration': segment_duration,
1716                                 } for segment_number in range(
1717                                     representation_ms_info['start_number'],
1718                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1719                             else:
1720                                 # $Number*$ or $Time$ in media template with S list available
1721                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1722                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1723                                 representation_ms_info['fragments'] = []
1724                                 segment_time = 0
1725                                 segment_d = None
1726                                 segment_number = representation_ms_info['start_number']
1727
1728                                 def add_segment_url():
1729                                     segment_url = media_template % {
1730                                         'Time': segment_time,
1731                                         'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
1732                                         'Number': segment_number,
1733                                     }
1734                                     representation_ms_info['fragments'].append({
1735                                         'url': segment_url,
1736                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1737                                     })
1738
1739                                 for num, s in enumerate(representation_ms_info['s']):
1740                                     segment_time = s.get('t') or segment_time
1741                                     segment_d = s['d']
1742                                     add_segment_url()
1743                                     segment_number += 1
1744                                     for r in range(s.get('r', 0)):
1745                                         segment_time += segment_d
1746                                         add_segment_url()
1747                                         segment_number += 1
1748                                     segment_time += segment_d
1749                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1750                             # No media template
1751                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1752                             # or any YouTube dashsegments video
1753                             fragments = []
1754                             segment_index = 0
1755                             timescale = representation_ms_info['timescale']
1756                             for s in representation_ms_info['s']:
1757                                 duration = float_or_none(s['d'], timescale)
1758                                 for r in range(s.get('r', 0) + 1):
1759                                     fragments.append({
1760                                         'url': representation_ms_info['segment_urls'][segment_index],
1761                                         'duration': duration,
1762                                     })
1763                                     segment_index += 1
1764                             representation_ms_info['fragments'] = fragments
1765                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1766                         # No fragments key is present in this case.
1767                         if 'fragments' in representation_ms_info:
1768                             f.update({
1769                                 'fragments': [],
1770                                 'protocol': 'http_dash_segments',
1771                             })
1772                             if 'initialization_url' in representation_ms_info:
1773                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1774                                 if not f.get('url'):
1775                                     f['url'] = initialization_url
1776                                 f['fragments'].append({'url': initialization_url})
1777                             f['fragments'].extend(representation_ms_info['fragments'])
1778                             for fragment in f['fragments']:
1779                                 fragment['url'] = urljoin(base_url, fragment['url'])
1780                         try:
1781                             existing_format = next(
1782                                 fo for fo in formats
1783                                 if fo['format_id'] == representation_id)
1784                         except StopIteration:
1785                             full_info = formats_dict.get(representation_id, {}).copy()
1786                             full_info.update(f)
1787                             formats.append(full_info)
1788                         else:
1789                             existing_format.update(f)
1790                     else:
1791                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1792         return formats
1793
1794     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1795         res = self._download_webpage_handle(
1796             ism_url, video_id,
1797             note=note or 'Downloading ISM manifest',
1798             errnote=errnote or 'Failed to download ISM manifest',
1799             fatal=fatal)
1800         if res is False:
1801             return []
1802         ism, urlh = res
1803
1804         return self._parse_ism_formats(
1805             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
1806
1807     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
1808         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1809             return []
1810
1811         duration = int(ism_doc.attrib['Duration'])
1812         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1813
1814         formats = []
1815         for stream in ism_doc.findall('StreamIndex'):
1816             stream_type = stream.get('Type')
1817             if stream_type not in ('video', 'audio'):
1818                 continue
1819             url_pattern = stream.attrib['Url']
1820             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1821             stream_name = stream.get('Name')
1822             for track in stream.findall('QualityLevel'):
1823                 fourcc = track.get('FourCC')
1824                 # TODO: add support for WVC1 and WMAP
1825                 if fourcc not in ('H264', 'AVC1', 'AACL'):
1826                     self.report_warning('%s is not a supported codec' % fourcc)
1827                     continue
1828                 tbr = int(track.attrib['Bitrate']) // 1000
1829                 width = int_or_none(track.get('MaxWidth'))
1830                 height = int_or_none(track.get('MaxHeight'))
1831                 sampling_rate = int_or_none(track.get('SamplingRate'))
1832
1833                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1834                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
1835
1836                 fragments = []
1837                 fragment_ctx = {
1838                     'time': 0,
1839                 }
1840                 stream_fragments = stream.findall('c')
1841                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1842                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1843                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1844                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
1845                     if not fragment_ctx['duration']:
1846                         try:
1847                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1848                         except IndexError:
1849                             next_fragment_time = duration
1850                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1851                     for _ in range(fragment_repeat):
1852                         fragments.append({
1853                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1854                             'duration': fragment_ctx['duration'] / stream_timescale,
1855                         })
1856                         fragment_ctx['time'] += fragment_ctx['duration']
1857
1858                 format_id = []
1859                 if ism_id:
1860                     format_id.append(ism_id)
1861                 if stream_name:
1862                     format_id.append(stream_name)
1863                 format_id.append(compat_str(tbr))
1864
1865                 formats.append({
1866                     'format_id': '-'.join(format_id),
1867                     'url': ism_url,
1868                     'manifest_url': ism_url,
1869                     'ext': 'ismv' if stream_type == 'video' else 'isma',
1870                     'width': width,
1871                     'height': height,
1872                     'tbr': tbr,
1873                     'asr': sampling_rate,
1874                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
1875                     'acodec': 'none' if stream_type == 'video' else fourcc,
1876                     'protocol': 'ism',
1877                     'fragments': fragments,
1878                     '_download_params': {
1879                         'duration': duration,
1880                         'timescale': stream_timescale,
1881                         'width': width or 0,
1882                         'height': height or 0,
1883                         'fourcc': fourcc,
1884                         'codec_private_data': track.get('CodecPrivateData'),
1885                         'sampling_rate': sampling_rate,
1886                         'channels': int_or_none(track.get('Channels', 2)),
1887                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
1888                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
1889                     },
1890                 })
1891         return formats
1892
1893     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
1894         def absolute_url(video_url):
1895             return compat_urlparse.urljoin(base_url, video_url)
1896
1897         def parse_content_type(content_type):
1898             if not content_type:
1899                 return {}
1900             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
1901             if ctr:
1902                 mimetype, codecs = ctr.groups()
1903                 f = parse_codecs(codecs)
1904                 f['ext'] = mimetype2ext(mimetype)
1905                 return f
1906             return {}
1907
1908         def _media_formats(src, cur_media_type):
1909             full_url = absolute_url(src)
1910             ext = determine_ext(full_url)
1911             if ext == 'm3u8':
1912                 is_plain_url = False
1913                 formats = self._extract_m3u8_formats(
1914                     full_url, video_id, ext='mp4',
1915                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
1916             elif ext == 'mpd':
1917                 is_plain_url = False
1918                 formats = self._extract_mpd_formats(
1919                     full_url, video_id, mpd_id=mpd_id)
1920             else:
1921                 is_plain_url = True
1922                 formats = [{
1923                     'url': full_url,
1924                     'vcodec': 'none' if cur_media_type == 'audio' else None,
1925                 }]
1926             return is_plain_url, formats
1927
1928         entries = []
1929         media_tags = [(media_tag, media_type, '')
1930                       for media_tag, media_type
1931                       in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
1932         media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
1933         for media_tag, media_type, media_content in media_tags:
1934             media_info = {
1935                 'formats': [],
1936                 'subtitles': {},
1937             }
1938             media_attributes = extract_attributes(media_tag)
1939             src = media_attributes.get('src')
1940             if src:
1941                 _, formats = _media_formats(src, media_type)
1942                 media_info['formats'].extend(formats)
1943             media_info['thumbnail'] = media_attributes.get('poster')
1944             if media_content:
1945                 for source_tag in re.findall(r'<source[^>]+>', media_content):
1946                     source_attributes = extract_attributes(source_tag)
1947                     src = source_attributes.get('src')
1948                     if not src:
1949                         continue
1950                     is_plain_url, formats = _media_formats(src, media_type)
1951                     if is_plain_url:
1952                         f = parse_content_type(source_attributes.get('type'))
1953                         f.update(formats[0])
1954                         media_info['formats'].append(f)
1955                     else:
1956                         media_info['formats'].extend(formats)
1957                 for track_tag in re.findall(r'<track[^>]+>', media_content):
1958                     track_attributes = extract_attributes(track_tag)
1959                     kind = track_attributes.get('kind')
1960                     if not kind or kind in ('subtitles', 'captions'):
1961                         src = track_attributes.get('src')
1962                         if not src:
1963                             continue
1964                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
1965                         media_info['subtitles'].setdefault(lang, []).append({
1966                             'url': absolute_url(src),
1967                         })
1968             if media_info['formats'] or media_info['subtitles']:
1969                 entries.append(media_info)
1970         return entries
1971
1972     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
1973         formats = []
1974         hdcore_sign = 'hdcore=3.7.0'
1975         f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
1976         hds_host = hosts.get('hds')
1977         if hds_host:
1978             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
1979         if 'hdcore=' not in f4m_url:
1980             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
1981         f4m_formats = self._extract_f4m_formats(
1982             f4m_url, video_id, f4m_id='hds', fatal=False)
1983         for entry in f4m_formats:
1984             entry.update({'extra_param_to_segment_url': hdcore_sign})
1985         formats.extend(f4m_formats)
1986         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
1987         hls_host = hosts.get('hls')
1988         if hls_host:
1989             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
1990         formats.extend(self._extract_m3u8_formats(
1991             m3u8_url, video_id, 'mp4', 'm3u8_native',
1992             m3u8_id='hls', fatal=False))
1993         return formats
1994
1995     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
1996         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
1997         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
1998         http_base_url = 'http' + url_base
1999         formats = []
2000         if 'm3u8' not in skip_protocols:
2001             formats.extend(self._extract_m3u8_formats(
2002                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2003                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2004         if 'f4m' not in skip_protocols:
2005             formats.extend(self._extract_f4m_formats(
2006                 http_base_url + '/manifest.f4m',
2007                 video_id, f4m_id='hds', fatal=False))
2008         if 'dash' not in skip_protocols:
2009             formats.extend(self._extract_mpd_formats(
2010                 http_base_url + '/manifest.mpd',
2011                 video_id, mpd_id='dash', fatal=False))
2012         if re.search(r'(?:/smil:|\.smil)', url_base):
2013             if 'smil' not in skip_protocols:
2014                 rtmp_formats = self._extract_smil_formats(
2015                     http_base_url + '/jwplayer.smil',
2016                     video_id, fatal=False)
2017                 for rtmp_format in rtmp_formats:
2018                     rtsp_format = rtmp_format.copy()
2019                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2020                     del rtsp_format['play_path']
2021                     del rtsp_format['ext']
2022                     rtsp_format.update({
2023                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2024                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2025                         'protocol': 'rtsp',
2026                     })
2027                     formats.extend([rtmp_format, rtsp_format])
2028         else:
2029             for protocol in ('rtmp', 'rtsp'):
2030                 if protocol not in skip_protocols:
2031                     formats.append({
2032                         'url': protocol + url_base,
2033                         'format_id': protocol,
2034                         'protocol': protocol,
2035                     })
2036         return formats
2037
2038     def _live_title(self, name):
2039         """ Generate the title for a live video """
2040         now = datetime.datetime.now()
2041         now_str = now.strftime('%Y-%m-%d %H:%M')
2042         return name + ' ' + now_str
2043
2044     def _int(self, v, name, fatal=False, **kwargs):
2045         res = int_or_none(v, **kwargs)
2046         if 'get_attr' in kwargs:
2047             print(getattr(v, kwargs['get_attr']))
2048         if res is None:
2049             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2050             if fatal:
2051                 raise ExtractorError(msg)
2052             else:
2053                 self._downloader.report_warning(msg)
2054         return res
2055
2056     def _float(self, v, name, fatal=False, **kwargs):
2057         res = float_or_none(v, **kwargs)
2058         if res is None:
2059             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2060             if fatal:
2061                 raise ExtractorError(msg)
2062             else:
2063                 self._downloader.report_warning(msg)
2064         return res
2065
2066     def _set_cookie(self, domain, name, value, expire_time=None):
2067         cookie = compat_cookiejar.Cookie(
2068             0, name, value, None, None, domain, None,
2069             None, '/', True, False, expire_time, '', None, None, None)
2070         self._downloader.cookiejar.set_cookie(cookie)
2071
2072     def _get_cookies(self, url):
2073         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2074         req = sanitized_Request(url)
2075         self._downloader.cookiejar.add_cookie_header(req)
2076         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2077
2078     def get_testcases(self, include_onlymatching=False):
2079         t = getattr(self, '_TEST', None)
2080         if t:
2081             assert not hasattr(self, '_TESTS'), \
2082                 '%s has _TEST and _TESTS' % type(self).__name__
2083             tests = [t]
2084         else:
2085             tests = getattr(self, '_TESTS', [])
2086         for t in tests:
2087             if not include_onlymatching and t.get('only_matching', False):
2088                 continue
2089             t['name'] = type(self).__name__[:-len('IE')]
2090             yield t
2091
2092     def is_suitable(self, age_limit):
2093         """ Test whether the extractor is generally suitable for the given
2094         age limit (i.e. pornographic sites are not, all others usually are) """
2095
2096         any_restricted = False
2097         for tc in self.get_testcases(include_onlymatching=False):
2098             if tc.get('playlist', []):
2099                 tc = tc['playlist'][0]
2100             is_restricted = age_restricted(
2101                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2102             if not is_restricted:
2103                 return True
2104             any_restricted = any_restricted or is_restricted
2105         return not any_restricted
2106
2107     def extract_subtitles(self, *args, **kwargs):
2108         if (self._downloader.params.get('writesubtitles', False) or
2109                 self._downloader.params.get('listsubtitles')):
2110             return self._get_subtitles(*args, **kwargs)
2111         return {}
2112
2113     def _get_subtitles(self, *args, **kwargs):
2114         raise NotImplementedError('This method must be implemented by subclasses')
2115
2116     @staticmethod
2117     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2118         """ Merge subtitle items for one language. Items with duplicated URLs
2119         will be dropped. """
2120         list1_urls = set([item['url'] for item in subtitle_list1])
2121         ret = list(subtitle_list1)
2122         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2123         return ret
2124
2125     @classmethod
2126     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2127         """ Merge two subtitle dictionaries, language by language. """
2128         ret = dict(subtitle_dict1)
2129         for lang in subtitle_dict2:
2130             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2131         return ret
2132
2133     def extract_automatic_captions(self, *args, **kwargs):
2134         if (self._downloader.params.get('writeautomaticsub', False) or
2135                 self._downloader.params.get('listsubtitles')):
2136             return self._get_automatic_captions(*args, **kwargs)
2137         return {}
2138
2139     def _get_automatic_captions(self, *args, **kwargs):
2140         raise NotImplementedError('This method must be implemented by subclasses')
2141
2142     def mark_watched(self, *args, **kwargs):
2143         if (self._downloader.params.get('mark_watched', False) and
2144                 (self._get_login_info()[0] is not None or
2145                     self._downloader.params.get('cookiefile') is not None)):
2146             self._mark_watched(*args, **kwargs)
2147
2148     def _mark_watched(self, *args, **kwargs):
2149         raise NotImplementedError('This method must be implemented by subclasses')
2150
2151     def geo_verification_headers(self):
2152         headers = {}
2153         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2154         if geo_verification_proxy:
2155             headers['Ytdl-request-proxy'] = geo_verification_proxy
2156         return headers
2157
2158     def _generic_id(self, url):
2159         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2160
2161     def _generic_title(self, url):
2162         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2163
2164
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors handling paged search queries.

    Accepted queries have the form _SEARCH_KEY<prefix>:<query>, where the
    prefix is empty (one result), a positive number (that many results) or
    "all" (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching queries of this extractor"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query this extractor understands"""
        return bool(re.match(cls._make_valid_url(), url))

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results"""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        wanted = int(prefix)
        if wanted <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
        if wanted > self._MAX_RESULTS:
            # More than available was asked for: warn and clamp
            self._downloader.report_warning(
                '%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY"""
        return self._SEARCH_KEY