_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_urlencode,
  25     compat_urllib_request,
  26     compat_urlparse,
  27 )
  28 from ..downloader.f4m import remove_encrypted_media
  29 from ..utils import (
  30     NO_DEFAULT,
  31     age_restricted,
  32     bug_reports_message,
  33     clean_html,
  34     compiled_regex_type,
  35     determine_ext,
  36     error_to_compat_str,
  37     ExtractorError,
  38     fix_xml_ampersands,
  39     float_or_none,
  40     int_or_none,
  41     parse_iso8601,
  42     RegexNotFoundError,
  43     sanitize_filename,
  44     sanitized_Request,
  45     unescapeHTML,
  46     unified_strdate,
  47     unified_timestamp,
  48     url_basename,
  49     xpath_element,
  50     xpath_text,
  51     xpath_with_ns,
  52     determine_protocol,
  53     parse_duration,
  54     mimetype2ext,
  55     update_Request,
  56     update_url_query,
  57     parse_m3u8_attributes,
  58     extract_attributes,
  59     parse_codecs,
  60 )
  61
  62
  63 class InfoExtractor(object):
  64     """Information Extractor class.
  65
  66     Information extractors are the classes that, given a URL, extract
  67     information about the video (or videos) the URL refers to. This
  68     information includes the real video URL, the video title, author and
  69     others. The information is stored in a dictionary which is then
  70     passed to the YoutubeDL. The YoutubeDL processes this
  71     information possibly downloading the video to the file system, among
  72     other possible outcomes.
  73
  74     The type field determines the type of the result.
  75     By far the most common value (and the default if _type is missing) is
  76     "video", which indicates a single video.
  77
  78     For a video, the dictionaries must include the following fields:
  79
  80     id:             Video identifier.
  81     title:          Video title, unescaped.
  82
  83     Additionally, it must contain either a formats entry or a url one:
  84
  85     formats:        A list of dictionaries for each format available, ordered
  86                     from worst to best quality.
  87
  88                     Potential fields:
  89                     * url        Mandatory. The URL of the video file
  90                     * ext        Will be calculated from URL if missing
  91                     * format     A human-readable description of the format
  92                                  ("mp4 container with h264/opus").
  93                                  Calculated from the format_id, width, height.
  94                                  and format_note fields if missing.
  95                     * format_id  A short description of the format
  96                                  ("mp4_h264_opus" or "19").
  97                                 Technically optional, but strongly recommended.
  98                     * format_note Additional info about the format
  99                                  ("3D" or "DASH video")
 100                     * width      Width of the video, if known
 101                     * height     Height of the video, if known
 102                     * resolution Textual description of width and height
 103                     * tbr        Average bitrate of audio and video in KBit/s
 104                     * abr        Average audio bitrate in KBit/s
 105                     * acodec     Name of the audio codec in use
 106                     * asr        Audio sampling rate in Hertz
 107                     * vbr        Average video bitrate in KBit/s
 108                     * fps        Frame rate
 109                     * vcodec     Name of the video codec in use
 110                     * container  Name of the container format
 111                     * filesize   The number of bytes, if known in advance
 112                     * filesize_approx  An estimate for the number of bytes
 113                     * player_url SWF Player URL (used for rtmpdump).
 114                     * protocol   The protocol that will be used for the actual
 115                                  download, lower-case.
 116                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 117                                  "m3u8", "m3u8_native" or "http_dash_segments".
 118                     * preference Order number of this format. If this field is
 119                                  present and not None, the formats get sorted
 120                                  by this field, regardless of all other values.
 121                                  -1 for default (order by other properties),
 122                                  -2 or smaller for less than default.
 123                                  < -1000 to hide the format (if there is
 124                                     another one which is strictly better)
 125                     * language   Language code, e.g. "de" or "en-US".
 126                     * language_preference  Is this in the language mentioned in
 127                                  the URL?
 128                                  10 if it's what the URL is about,
 129                                  -1 for default (don't know),
 130                                  -10 otherwise, other values reserved for now.
 131                     * quality    Order number of the video quality of this
 132                                  format, irrespective of the file format.
 133                                  -1 for default (order by other properties),
 134                                  -2 or smaller for less than default.
 135                     * source_preference  Order number for this video source
 136                                   (quality takes higher priority)
 137                                  -1 for default (order by other properties),
 138                                  -2 or smaller for less than default.
 139                     * http_headers  A dictionary of additional HTTP headers
 140                                  to add to the request.
 141                     * stretched_ratio  If given and not 1, indicates that the
 142                                  video's pixels are not square.
 143                                  width : height ratio as float.
 144                     * no_resume  The server does not support resuming the
 145                                  (HTTP or RTMP) download. Boolean.
 146
 147     url:            Final video URL.
 148     ext:            Video filename extension.
 149     format:         The video format, defaults to ext (used for --get-format)
 150     player_url:     SWF Player URL (used for rtmpdump).
 151
 152     The following fields are optional:
 153
 154     alt_title:      A secondary title of the video.
 155     display_id      An alternative identifier for the video, not necessarily
 156                     unique, but available before title. Typically, id is
 157                     something like "4234987", title "Dancing naked mole rats",
 158                     and display_id "dancing-naked-mole-rats"
 159     thumbnails:     A list of dictionaries, with the following entries:
 160                         * "id" (optional, string) - Thumbnail format ID
 161                         * "url"
 162                         * "preference" (optional, int) - quality of the image
 163                         * "width" (optional, int)
 164                         * "height" (optional, int)
 165                         * "resolution" (optional, string "{width}x{height"},
 166                                         deprecated)
 167                         * "filesize" (optional, int)
 168     thumbnail:      Full URL to a video thumbnail image.
 169     description:    Full video description.
 170     uploader:       Full name of the video uploader.
 171     license:        License name the video is licensed under.
 172     creator:        The creator of the video.
 173     release_date:   The date (YYYYMMDD) when the video was released.
 174     timestamp:      UNIX timestamp of the moment the video became available.
 175     upload_date:    Video upload date (YYYYMMDD).
 176                     If not explicitly set, calculated from timestamp.
 177     uploader_id:    Nickname or id of the video uploader.
 178     uploader_url:   Full URL to a personal webpage of the video uploader.
 179     location:       Physical location where the video was filmed.
 180     subtitles:      The available subtitles as a dictionary in the format
 181                     {language: subformats}. "subformats" is a list sorted from
 182                     lower to higher preference, each element is a dictionary
 183                     with the "ext" entry and one of:
 184                         * "data": The subtitles file contents
 185                         * "url": A URL pointing to the subtitles file
 186                     "ext" will be calculated from URL if missing
 187     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 188                     automatically generated captions
 189     duration:       Length of the video in seconds, as an integer or float.
 190     view_count:     How many users have watched the video on the platform.
 191     like_count:     Number of positive ratings of the video
 192     dislike_count:  Number of negative ratings of the video
 193     repost_count:   Number of reposts of the video
 194     average_rating: Average rating give by users, the scale used depends on the webpage
 195     comment_count:  Number of comments on the video
 196     comments:       A list of comments, each with one or more of the following
 197                     properties (all but one of text or html optional):
 198                         * "author" - human-readable name of the comment author
 199                         * "author_id" - user ID of the comment author
 200                         * "id" - Comment ID
 201                         * "html" - Comment as HTML
 202                         * "text" - Plain text of the comment
 203                         * "timestamp" - UNIX timestamp of comment
 204                         * "parent" - ID of the comment this one is replying to.
 205                                      Set to "root" to indicate that this is a
 206                                      comment to the original video.
 207     age_limit:      Age restriction for the video, as an integer (years)
 208     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 209                     should allow to get the same result again. (It will be set
 210                     by YoutubeDL if it's missing)
 211     categories:     A list of categories that the video falls in, for example
 212                     ["Sports", "Berlin"]
 213     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 214     is_live:        True, False, or None (=unknown). Whether this video is a
 215                     live stream that goes on instead of a fixed-length video.
 216     start_time:     Time in seconds where the reproduction should start, as
 217                     specified in the URL.
 218     end_time:       Time in seconds where the reproduction should end, as
 219                     specified in the URL.
 220
 221     The following fields should only be used when the video belongs to some logical
 222     chapter or section:
 223
 224     chapter:        Name or title of the chapter the video belongs to.
 225     chapter_number: Number of the chapter the video belongs to, as an integer.
 226     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 227
 228     The following fields should only be used when the video is an episode of some
 229     series or programme:
 230
 231     series:         Title of the series or programme the video episode belongs to.
 232     season:         Title of the season the video episode belongs to.
 233     season_number:  Number of the season the video episode belongs to, as an integer.
 234     season_id:      Id of the season the video episode belongs to, as a unicode string.
 235     episode:        Title of the video episode. Unlike mandatory video title field,
 236                     this field should denote the exact title of the video episode
 237                     without any kind of decoration.
 238     episode_number: Number of the video episode within a season, as an integer.
 239     episode_id:     Id of the video episode, as a unicode string.
 240
 241     The following fields should only be used when the media is a track or a part of
 242     a music album:
 243
 244     track:          Title of the track.
 245     track_number:   Number of the track within an album or a disc, as an integer.
 246     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 247                     as a unicode string.
 248     artist:         Artist(s) of the track.
 249     genre:          Genre(s) of the track.
 250     album:          Title of the album the track belongs to.
 251     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 252     album_artist:   List of all artists appeared on the album (e.g.
 253                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 254                     and compilations).
 255     disc_number:    Number of the disc or other physical medium the track belongs to,
 256                     as an integer.
 257     release_year:   Year (YYYY) when the album was released.
 258
 259     Unless mentioned otherwise, the fields should be Unicode strings.
 260
 261     Unless mentioned otherwise, None is equivalent to absence of information.
 262
 263
 264     _type "playlist" indicates multiple videos.
 265     There must be a key "entries", which is a list, an iterable, or a PagedList
 266     object, each element of which is a valid dictionary by this specification.
 267
 268     Additionally, playlists can have "title", "description" and "id" attributes
 269     with the same semantics as videos (see above).
 270
 271
 272     _type "multi_video" indicates that there are multiple videos that
 273     form a single show, for examples multiple acts of an opera or TV episode.
 274     It must have an entries key like a playlist and contain all the keys
 275     required for a video at the same time.
 276
 277
 278     _type "url" indicates that the video must be extracted from another
 279     location, possibly by a different extractor. Its only required key is:
 280     "url" - the next URL to extract.
 281     The key "ie_key" can be set to the class name (minus the trailing "IE",
 282     e.g. "Youtube") if the extractor class is known in advance.
 283     Additionally, the dictionary may have any properties of the resolved entity
 284     known in advance, for example "title" if the title of the referred video is
 285     known ahead of time.
 286
 287
 288     _type "url_transparent" entities have the same specification as "url", but
 289     indicate that the given additional information is more precise than the one
 290     associated with the resolved URL.
 291     This is useful when a site employs a video service that hosts the video and
 292     its technical metadata, but that video service does not embed a useful
 293     title, description etc.
 294
 295
 296     Subclasses of this one should re-define the _real_initialize() and
 297     _real_extract() methods and define a _VALID_URL regexp.
 298     Probably, they should also be added to the list of extractors.
 299
 300     Finally, the _WORKING attribute should be set to False for broken IEs
 301     in order to warn the users and skip the tests.
 302     """
 303
 304     _ready = False
 305     _downloader = None
 306     _WORKING = True
 307
 308     def __init__(self, downloader=None):
 309         """Constructor. Receives an optional downloader."""
 310         self._ready = False
 311         self.set_downloader(downloader)
 312
 313     @classmethod
 314     def suitable(cls, url):
 315         """Receives a URL and returns True if suitable for this IE."""
 316
 317         # This does not use has/getattr intentionally - we want to know whether
 318         # we have cached the regexp for *this* class, whereas getattr would also
 319         # match the superclass
 320         if '_VALID_URL_RE' not in cls.__dict__:
 321             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 322         return cls._VALID_URL_RE.match(url) is not None
 323
 324     @classmethod
 325     def _match_id(cls, url):
 326         if '_VALID_URL_RE' not in cls.__dict__:
 327             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 328         m = cls._VALID_URL_RE.match(url)
 329         assert m
 330         return m.group('id')
 331
 332     @classmethod
 333     def working(cls):
 334         """Getter method for _WORKING."""
 335         return cls._WORKING
 336
 337     def initialize(self):
 338         """Initializes an instance (authentication, etc)."""
 339         if not self._ready:
 340             self._real_initialize()
 341             self._ready = True
 342
 343     def extract(self, url):
 344         """Extracts URL information and returns it in list of dicts."""
 345         try:
 346             self.initialize()
 347             return self._real_extract(url)
 348         except ExtractorError:
 349             raise
 350         except compat_http_client.IncompleteRead as e:
 351             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 352         except (KeyError, StopIteration) as e:
 353             raise ExtractorError('An extractor error has occurred.', cause=e)
 354
 355     def set_downloader(self, downloader):
 356         """Sets the downloader for this IE."""
 357         self._downloader = downloader
 358
 359     def _real_initialize(self):
 360         """Real initialization process. Redefine in subclasses."""
 361         pass
 362
 363     def _real_extract(self, url):
 364         """Real extraction process. Redefine in subclasses."""
 365         pass
 366
 367     @classmethod
 368     def ie_key(cls):
 369         """A string for getting the InfoExtractor with get_info_extractor"""
 370         return compat_str(cls.__name__[:-2])
 371
 372     @property
 373     def IE_NAME(self):
 374         return compat_str(type(self).__name__[:-2])
 375
 376     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 377         """ Returns the response handle """
 378         if note is None:
 379             self.report_download_webpage(video_id)
 380         elif note is not False:
 381             if video_id is None:
 382                 self.to_screen('%s' % (note,))
 383             else:
 384                 self.to_screen('%s: %s' % (video_id, note))
 385         if isinstance(url_or_request, compat_urllib_request.Request):
 386             url_or_request = update_Request(
 387                 url_or_request, data=data, headers=headers, query=query)
 388         else:
 389             if query:
 390                 url_or_request = update_url_query(url_or_request, query)
 391             if data is not None or headers:
 392                 url_or_request = sanitized_Request(url_or_request, data, headers)
 393         try:
 394             return self._downloader.urlopen(url_or_request)
 395         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 396             if errnote is False:
 397                 return False
 398             if errnote is None:
 399                 errnote = 'Unable to download webpage'
 400
 401             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 402             if fatal:
 403                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 404             else:
 405                 self._downloader.report_warning(errmsg)
 406                 return False
 407
 408     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 409         """ Returns a tuple (page content as string, URL handle) """
 410         # Strip hashes from the URL (#1038)
 411         if isinstance(url_or_request, (compat_str, str)):
 412             url_or_request = url_or_request.partition('#')[0]
 413
 414         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 415         if urlh is False:
 416             assert not fatal
 417             return False
 418         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 419         return (content, urlh)
 420
 421     @staticmethod
 422     def _guess_encoding_from_content(content_type, webpage_bytes):
 423         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 424         if m:
 425             encoding = m.group(1)
 426         else:
 427             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 428                           webpage_bytes[:1024])
 429             if m:
 430                 encoding = m.group(1).decode('ascii')
 431             elif webpage_bytes.startswith(b'\xff\xfe'):
 432                 encoding = 'utf-16'
 433             else:
 434                 encoding = 'utf-8'
 435
 436         return encoding
 437
 438     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 439         content_type = urlh.headers.get('Content-Type', '')
 440         webpage_bytes = urlh.read()
 441         if prefix is not None:
 442             webpage_bytes = prefix + webpage_bytes
 443         if not encoding:
 444             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 445         if self._downloader.params.get('dump_intermediate_pages', False):
 446             try:
 447                 url = url_or_request.get_full_url()
 448             except AttributeError:
 449                 url = url_or_request
 450             self.to_screen('Dumping request to ' + url)
 451             dump = base64.b64encode(webpage_bytes).decode('ascii')
 452             self._downloader.to_screen(dump)
 453         if self._downloader.params.get('write_pages', False):
 454             try:
 455                 url = url_or_request.get_full_url()
 456             except AttributeError:
 457                 url = url_or_request
 458             basen = '%s_%s' % (video_id, url)
 459             if len(basen) > 240:
 460                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 461                 basen = basen[:240 - len(h)] + h
 462             raw_filename = basen + '.dump'
 463             filename = sanitize_filename(raw_filename, restricted=True)
 464             self.to_screen('Saving request to ' + filename)
 465             # Working around MAX_PATH limitation on Windows (see
 466             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 467             if compat_os_name == 'nt':
 468                 absfilepath = os.path.abspath(filename)
 469                 if len(absfilepath) > 259:
 470                     filename = '\\\\?\\' + absfilepath
 471             with open(filename, 'wb') as outf:
 472                 outf.write(webpage_bytes)
 473
 474         try:
 475             content = webpage_bytes.decode(encoding, 'replace')
 476         except LookupError:
 477             content = webpage_bytes.decode('utf-8', 'replace')
 478
 479         if ('<title>Access to this site is blocked</title>' in content and
 480                 'Websense' in content[:512]):
 481             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 482             blocked_iframe = self._html_search_regex(
 483                 r'<iframe src="([^"]+)"', content,
 484                 'Websense information URL', default=None)
 485             if blocked_iframe:
 486                 msg += ' Visit %s for more details' % blocked_iframe
 487             raise ExtractorError(msg, expected=True)
 488         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 489             msg = (
 490                 'Access to this webpage has been blocked by Indian censorship. '
 491                 'Use a VPN or proxy server (with --proxy) to route around it.')
 492             block_msg = self._html_search_regex(
 493                 r'</h1><p>(.*?)</p>',
 494                 content, 'block message', default=None)
 495             if block_msg:
 496                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 497             raise ExtractorError(msg, expected=True)
 498
 499         return content
 500
 501     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 502         """ Returns the data of the page as a string """
 503         success = False
 504         try_count = 0
 505         while success is False:
 506             try:
 507                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 508                 success = True
 509             except compat_http_client.IncompleteRead as e:
 510                 try_count += 1
 511                 if try_count >= tries:
 512                     raise e
 513                 self._sleep(timeout, video_id)
 514         if res is False:
 515             return res
 516         else:
 517             content, _ = res
 518             return content
 519
 520     def _download_xml(self, url_or_request, video_id,
 521                       note='Downloading XML', errnote='Unable to download XML',
 522                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 523         """Return the xml as an xml.etree.ElementTree.Element"""
 524         xml_string = self._download_webpage(
 525             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 526         if xml_string is False:
 527             return xml_string
 528         if transform_source:
 529             xml_string = transform_source(xml_string)
 530         return compat_etree_fromstring(xml_string.encode('utf-8'))
 531
 532     def _download_json(self, url_or_request, video_id,
 533                        note='Downloading JSON metadata',
 534                        errnote='Unable to download JSON metadata',
 535                        transform_source=None,
 536                        fatal=True, encoding=None, data=None, headers={}, query={}):
 537         json_string = self._download_webpage(
 538             url_or_request, video_id, note, errnote, fatal=fatal,
 539             encoding=encoding, data=data, headers=headers, query=query)
 540         if (not fatal) and json_string is False:
 541             return None
 542         return self._parse_json(
 543             json_string, video_id, transform_source=transform_source, fatal=fatal)
 544
 545     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 546         if transform_source:
 547             json_string = transform_source(json_string)
 548         try:
 549             return json.loads(json_string)
 550         except ValueError as ve:
 551             errmsg = '%s: Failed to parse JSON ' % video_id
 552             if fatal:
 553                 raise ExtractorError(errmsg, cause=ve)
 554             else:
 555                 self.report_warning(errmsg + str(ve))
 556
 557     def report_warning(self, msg, video_id=None):
 558         idstr = '' if video_id is None else '%s: ' % video_id
 559         self._downloader.report_warning(
 560             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 561
 562     def to_screen(self, msg):
 563         """Print msg to screen, prefixing it with '[ie_name]'"""
 564         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 565
 566     def report_extraction(self, id_or_name):
 567         """Report information extraction."""
 568         self.to_screen('%s: Extracting information' % id_or_name)
 569
 570     def report_download_webpage(self, video_id):
 571         """Report webpage download."""
 572         self.to_screen('%s: Downloading webpage' % video_id)
 573
 574     def report_age_confirmation(self):
 575         """Report attempt to confirm age."""
 576         self.to_screen('Confirming age')
 577
 578     def report_login(self):
 579         """Report attempt to log in."""
 580         self.to_screen('Logging in')
 581
 582     @staticmethod
 583     def raise_login_required(msg='This video is only available for registered users'):
 584         raise ExtractorError(
 585             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 586             expected=True)
 587
 588     @staticmethod
 589     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 590         raise ExtractorError(
 591             '%s. You might want to use --proxy to workaround.' % msg,
 592             expected=True)
 593
 594     # Methods for following #608
 595     @staticmethod
 596     def url_result(url, ie=None, video_id=None, video_title=None):
 597         """Returns a URL that points to a page that should be processed"""
 598         # TODO: ie should be the class used for getting the info
 599         video_info = {'_type': 'url',
 600                       'url': url,
 601                       'ie_key': ie}
 602         if video_id is not None:
 603             video_info['id'] = video_id
 604         if video_title is not None:
 605             video_info['title'] = video_title
 606         return video_info
 607
 608     @staticmethod
 609     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 610         """Returns a playlist"""
 611         video_info = {'_type': 'playlist',
 612                       'entries': entries}
 613         if playlist_id:
 614             video_info['id'] = playlist_id
 615         if playlist_title:
 616             video_info['title'] = playlist_title
 617         if playlist_description:
 618             video_info['description'] = playlist_description
 619         return video_info
 620
 621     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 622         """
 623         Perform a regex search on the given string, using a single or a list of
 624         patterns returning the first matching group.
 625         In case of failure return a default value or raise a WARNING or a
 626         RegexNotFoundError, depending on fatal, specifying the field name.
 627         """
 628         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 629             mobj = re.search(pattern, string, flags)
 630         else:
 631             for p in pattern:
 632                 mobj = re.search(p, string, flags)
 633                 if mobj:
 634                     break
 635
 636         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 637             _name = '\033[0;34m%s\033[0m' % name
 638         else:
 639             _name = name
 640
 641         if mobj:
 642             if group is None:
 643                 # return the first matching group
 644                 return next(g for g in mobj.groups() if g is not None)
 645             else:
 646                 return mobj.group(group)
 647         elif default is not NO_DEFAULT:
 648             return default
 649         elif fatal:
 650             raise RegexNotFoundError('Unable to extract %s' % _name)
 651         else:
 652             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 653             return None
 654
 655     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 656         """
 657         Like _search_regex, but strips HTML tags and unescapes entities.
 658         """
 659         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 660         if res:
 661             return clean_html(res).strip()
 662         else:
 663             return res
 664
 665     def _get_netrc_login_info(self, netrc_machine=None):
 666         username = None
 667         password = None
 668         netrc_machine = netrc_machine or self._NETRC_MACHINE
 669
 670         if self._downloader.params.get('usenetrc', False):
 671             try:
 672                 info = netrc.netrc().authenticators(netrc_machine)
 673                 if info is not None:
 674                     username = info[0]
 675                     password = info[2]
 676                 else:
 677                     raise netrc.NetrcParseError(
 678                         'No authenticators for %s' % netrc_machine)
 679             except (IOError, netrc.NetrcParseError) as err:
 680                 self._downloader.report_warning(
 681                     'parsing .netrc: %s' % error_to_compat_str(err))
 682
 683         return username, password
 684
 685     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 686         """
 687         Get the login info as (username, password)
 688         First look for the manually specified credentials using username_option
 689         and password_option as keys in params dictionary. If no such credentials
 690         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 691         value.
 692         If there's no info available, return (None, None)
 693         """
 694         if self._downloader is None:
 695             return (None, None)
 696
 697         downloader_params = self._downloader.params
 698
 699         # Attempt to use provided username and password or .netrc data
 700         if downloader_params.get(username_option) is not None:
 701             username = downloader_params[username_option]
 702             password = downloader_params[password_option]
 703         else:
 704             username, password = self._get_netrc_login_info(netrc_machine)
 705
 706         return username, password
 707
 708     def _get_tfa_info(self, note='two-factor verification code'):
 709         """
 710         Get the two-factor authentication info
 711         TODO - asking the user will be required for sms/phone verify
 712         currently just uses the command line option
 713         If there's no info available, return None
 714         """
 715         if self._downloader is None:
 716             return None
 717         downloader_params = self._downloader.params
 718
 719         if downloader_params.get('twofactor') is not None:
 720             return downloader_params['twofactor']
 721
 722         return compat_getpass('Type %s and press [Return]: ' % note)
 723
 724     # Helper functions for extracting OpenGraph info
 725     @staticmethod
 726     def _og_regexes(prop):
 727         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 728         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 729                        % {'prop': re.escape(prop)})
 730         template = r'<meta[^>]+?%s[^>]+?%s'
 731         return [
 732             template % (property_re, content_re),
 733             template % (content_re, property_re),
 734         ]
 735
 736     @staticmethod
 737     def _meta_regex(prop):
 738         return r'''(?isx)<meta
 739                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 740                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 741
 742     def _og_search_property(self, prop, html, name=None, **kargs):
 743         if not isinstance(prop, (list, tuple)):
 744             prop = [prop]
 745         if name is None:
 746             name = 'OpenGraph %s' % prop[0]
 747         og_regexes = []
 748         for p in prop:
 749             og_regexes.extend(self._og_regexes(p))
 750         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 751         if escaped is None:
 752             return None
 753         return unescapeHTML(escaped)
 754
 755     def _og_search_thumbnail(self, html, **kargs):
 756         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 757
 758     def _og_search_description(self, html, **kargs):
 759         return self._og_search_property('description', html, fatal=False, **kargs)
 760
 761     def _og_search_title(self, html, **kargs):
 762         return self._og_search_property('title', html, **kargs)
 763
 764     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 765         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 766         if secure:
 767             regexes = self._og_regexes('video:secure_url') + regexes
 768         return self._html_search_regex(regexes, html, name, **kargs)
 769
 770     def _og_search_url(self, html, **kargs):
 771         return self._og_search_property('url', html, **kargs)
 772
 773     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 774         if not isinstance(name, (list, tuple)):
 775             name = [name]
 776         if display_name is None:
 777             display_name = name[0]
 778         return self._html_search_regex(
 779             [self._meta_regex(n) for n in name],
 780             html, display_name, fatal=fatal, group='content', **kwargs)
 781
 782     def _dc_search_uploader(self, html):
 783         return self._html_search_meta('dc.creator', html, 'uploader')
 784
 785     def _rta_search(self, html):
 786         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 787         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 788                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 789                      html):
 790             return 18
 791         return 0
 792
 793     def _media_rating_search(self, html):
 794         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 795         rating = self._html_search_meta('rating', html)
 796
 797         if not rating:
 798             return None
 799
 800         RATING_TABLE = {
 801             'safe for kids': 0,
 802             'general': 8,
 803             '14 years': 14,
 804             'mature': 17,
 805             'restricted': 19,
 806         }
 807         return RATING_TABLE.get(rating.lower())
 808
 809     def _family_friendly_search(self, html):
 810         # See http://schema.org/VideoObject
 811         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 812
 813         if not family_friendly:
 814             return None
 815
 816         RATING_TABLE = {
 817             '1': 0,
 818             'true': 0,
 819             '0': 18,
 820             'false': 18,
 821         }
 822         return RATING_TABLE.get(family_friendly.lower())
 823
 824     def _twitter_search_player(self, html):
 825         return self._html_search_meta('twitter:player', html,
 826                                       'twitter card player')
 827
 828     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 829         json_ld = self._search_regex(
 830             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 831             html, 'JSON-LD', group='json_ld', **kwargs)
 832         default = kwargs.get('default', NO_DEFAULT)
 833         if not json_ld:
 834             return default if default is not NO_DEFAULT else {}
 835         # JSON-LD may be malformed and thus `fatal` should be respected.
 836         # At the same time `default` may be passed that assumes `fatal=False`
 837         # for _search_regex. Let's simulate the same behavior here as well.
 838         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 839         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 840
 841     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 842         if isinstance(json_ld, compat_str):
 843             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 844         if not json_ld:
 845             return {}
 846         info = {}
 847         if not isinstance(json_ld, (list, tuple, dict)):
 848             return info
 849         if isinstance(json_ld, dict):
 850             json_ld = [json_ld]
 851         for e in json_ld:
 852             if e.get('@context') == 'http://schema.org':
 853                 item_type = e.get('@type')
 854                 if expected_type is not None and expected_type != item_type:
 855                     return info
 856                 if item_type == 'TVEpisode':
 857                     info.update({
 858                         'episode': unescapeHTML(e.get('name')),
 859                         'episode_number': int_or_none(e.get('episodeNumber')),
 860                         'description': unescapeHTML(e.get('description')),
 861                     })
 862                     part_of_season = e.get('partOfSeason')
 863                     if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 864                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 865                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
 866                     if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 867                         info['series'] = unescapeHTML(part_of_series.get('name'))
 868                 elif item_type == 'Article':
 869                     info.update({
 870                         'timestamp': parse_iso8601(e.get('datePublished')),
 871                         'title': unescapeHTML(e.get('headline')),
 872                         'description': unescapeHTML(e.get('articleBody')),
 873                     })
 874                 elif item_type == 'VideoObject':
 875                     info.update({
 876                         'url': e.get('contentUrl'),
 877                         'title': unescapeHTML(e.get('name')),
 878                         'description': unescapeHTML(e.get('description')),
 879                         'thumbnail': e.get('thumbnailUrl'),
 880                         'duration': parse_duration(e.get('duration')),
 881                         'timestamp': unified_timestamp(e.get('uploadDate')),
 882                         'filesize': float_or_none(e.get('contentSize')),
 883                         'tbr': int_or_none(e.get('bitrate')),
 884                         'width': int_or_none(e.get('width')),
 885                         'height': int_or_none(e.get('height')),
 886                     })
 887                 break
 888         return dict((k, v) for k, v in info.items() if v is not None)
 889
 890     @staticmethod
 891     def _hidden_inputs(html):
 892         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 893         hidden_inputs = {}
 894         for input in re.findall(r'(?i)(<input[^>]+>)', html):
 895             attrs = extract_attributes(input)
 896             if not input:
 897                 continue
 898             if attrs.get('type') not in ('hidden', 'submit'):
 899                 continue
 900             name = attrs.get('name') or attrs.get('id')
 901             value = attrs.get('value')
 902             if name and value is not None:
 903                 hidden_inputs[name] = value
 904         return hidden_inputs
 905
 906     def _form_hidden_inputs(self, form_id, html):
 907         form = self._search_regex(
 908             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 909             html, '%s form' % form_id, group='form')
 910         return self._hidden_inputs(form)
 911
 912     def _sort_formats(self, formats, field_preference=None):
 913         if not formats:
 914             raise ExtractorError('No video formats found')
 915
 916         for f in formats:
 917             # Automatically determine tbr when missing based on abr and vbr (improves
 918             # formats sorting in some cases)
 919             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 920                 f['tbr'] = f['abr'] + f['vbr']
 921
 922         def _formats_key(f):
 923             # TODO remove the following workaround
 924             from ..utils import determine_ext
 925             if not f.get('ext') and 'url' in f:
 926                 f['ext'] = determine_ext(f['url'])
 927
 928             if isinstance(field_preference, (list, tuple)):
 929                 return tuple(
 930                     f.get(field)
 931                     if f.get(field) is not None
 932                     else ('' if field == 'format_id' else -1)
 933                     for field in field_preference)
 934
 935             preference = f.get('preference')
 936             if preference is None:
 937                 preference = 0
 938                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 939                     preference -= 0.5
 940
 941             protocol = f.get('protocol') or determine_protocol(f)
 942             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
 943
 944             if f.get('vcodec') == 'none':  # audio only
 945                 preference -= 50
 946                 if self._downloader.params.get('prefer_free_formats'):
 947                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 948                 else:
 949                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 950                 ext_preference = 0
 951                 try:
 952                     audio_ext_preference = ORDER.index(f['ext'])
 953                 except ValueError:
 954                     audio_ext_preference = -1
 955             else:
 956                 if f.get('acodec') == 'none':  # video only
 957                     preference -= 40
 958                 if self._downloader.params.get('prefer_free_formats'):
 959                     ORDER = ['flv', 'mp4', 'webm']
 960                 else:
 961                     ORDER = ['webm', 'flv', 'mp4']
 962                 try:
 963                     ext_preference = ORDER.index(f['ext'])
 964                 except ValueError:
 965                     ext_preference = -1
 966                 audio_ext_preference = 0
 967
 968             return (
 969                 preference,
 970                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 971                 f.get('quality') if f.get('quality') is not None else -1,
 972                 f.get('tbr') if f.get('tbr') is not None else -1,
 973                 f.get('filesize') if f.get('filesize') is not None else -1,
 974                 f.get('vbr') if f.get('vbr') is not None else -1,
 975                 f.get('height') if f.get('height') is not None else -1,
 976                 f.get('width') if f.get('width') is not None else -1,
 977                 proto_preference,
 978                 ext_preference,
 979                 f.get('abr') if f.get('abr') is not None else -1,
 980                 audio_ext_preference,
 981                 f.get('fps') if f.get('fps') is not None else -1,
 982                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 983                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 984                 f.get('format_id') if f.get('format_id') is not None else '',
 985             )
 986         formats.sort(key=_formats_key)
 987
 988     def _check_formats(self, formats, video_id):
 989         if formats:
 990             formats[:] = filter(
 991                 lambda f: self._is_valid_url(
 992                     f['url'], video_id,
 993                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 994                 formats)
 995
 996     @staticmethod
 997     def _remove_duplicate_formats(formats):
 998         format_urls = set()
 999         unique_formats = []
1000         for f in formats:
1001             if f['url'] not in format_urls:
1002                 format_urls.add(f['url'])
1003                 unique_formats.append(f)
1004         formats[:] = unique_formats
1005
1006     def _is_valid_url(self, url, video_id, item='video'):
1007         url = self._proto_relative_url(url, scheme='http:')
1008         # For now assume non HTTP(S) URLs always valid
1009         if not (url.startswith('http://') or url.startswith('https://')):
1010             return True
1011         try:
1012             self._request_webpage(url, video_id, 'Checking %s URL' % item)
1013             return True
1014         except ExtractorError as e:
1015             if isinstance(e.cause, compat_urllib_error.URLError):
1016                 self.to_screen(
1017                     '%s: %s URL is invalid, skipping' % (video_id, item))
1018                 return False
1019             raise
1020
1021     def http_scheme(self):
1022         """ Either "http:" or "https:", depending on the user's preferences """
1023         return (
1024             'http:'
1025             if self._downloader.params.get('prefer_insecure', False)
1026             else 'https:')
1027
1028     def _proto_relative_url(self, url, scheme=None):
1029         if url is None:
1030             return url
1031         if url.startswith('//'):
1032             if scheme is None:
1033                 scheme = self.http_scheme()
1034             return scheme + url
1035         else:
1036             return url
1037
1038     def _sleep(self, timeout, video_id, msg_template=None):
1039         if msg_template is None:
1040             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1041         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1042         self.to_screen(msg)
1043         time.sleep(timeout)
1044
1045     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1046                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1047                              fatal=True, m3u8_id=None):
1048         manifest = self._download_xml(
1049             manifest_url, video_id, 'Downloading f4m manifest',
1050             'Unable to download f4m manifest',
1051             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1052             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1053             transform_source=transform_source,
1054             fatal=fatal)
1055
1056         if manifest is False:
1057             return []
1058
1059         return self._parse_f4m_formats(
1060             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1061             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1062
1063     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1064                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1065                            fatal=True, m3u8_id=None):
1066         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1067         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1068         if akamai_pv is not None and ';' in akamai_pv.text:
1069             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1070             if playerVerificationChallenge.strip() != '':
1071                 return []
1072
1073         formats = []
1074         manifest_version = '1.0'
1075         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1076         if not media_nodes:
1077             manifest_version = '2.0'
1078             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1079         # Remove unsupported DRM protected media from final formats
1080         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1081         media_nodes = remove_encrypted_media(media_nodes)
1082         if not media_nodes:
1083             return formats
1084         base_url = xpath_text(
1085             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1086             'base URL', default=None)
1087         if base_url:
1088             base_url = base_url.strip()
1089
1090         bootstrap_info = xpath_element(
1091             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1092             'bootstrap info', default=None)
1093
1094         for i, media_el in enumerate(media_nodes):
1095             tbr = int_or_none(media_el.attrib.get('bitrate'))
1096             width = int_or_none(media_el.attrib.get('width'))
1097             height = int_or_none(media_el.attrib.get('height'))
1098             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1099             # If <bootstrapInfo> is present, the specified f4m is a
1100             # stream-level manifest, and only set-level manifests may refer to
1101             # external resources.  See section 11.4 and section 4 of F4M spec
1102             if bootstrap_info is None:
1103                 media_url = None
1104                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1105                 if manifest_version == '2.0':
1106                     media_url = media_el.attrib.get('href')
1107                 if media_url is None:
1108                     media_url = media_el.attrib.get('url')
1109                 if not media_url:
1110                     continue
1111                 manifest_url = (
1112                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1113                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1114                 # If media_url is itself a f4m manifest do the recursive extraction
1115                 # since bitrates in parent manifest (this one) and media_url manifest
1116                 # may differ leading to inability to resolve the format by requested
1117                 # bitrate in f4m downloader
1118                 ext = determine_ext(manifest_url)
1119                 if ext == 'f4m':
1120                     f4m_formats = self._extract_f4m_formats(
1121                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1122                         transform_source=transform_source, fatal=fatal)
1123                     # Sometimes stream-level manifest contains single media entry that
1124                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1125                     # At the same time parent's media entry in set-level manifest may
1126                     # contain it. We will copy it from parent in such cases.
1127                     if len(f4m_formats) == 1:
1128                         f = f4m_formats[0]
1129                         f.update({
1130                             'tbr': f.get('tbr') or tbr,
1131                             'width': f.get('width') or width,
1132                             'height': f.get('height') or height,
1133                             'format_id': f.get('format_id') if not tbr else format_id,
1134                         })
1135                     formats.extend(f4m_formats)
1136                     continue
1137                 elif ext == 'm3u8':
1138                     formats.extend(self._extract_m3u8_formats(
1139                         manifest_url, video_id, 'mp4', preference=preference,
1140                         m3u8_id=m3u8_id, fatal=fatal))
1141                     continue
1142             formats.append({
1143                 'format_id': format_id,
1144                 'url': manifest_url,
1145                 'ext': 'flv' if bootstrap_info is not None else None,
1146                 'tbr': tbr,
1147                 'width': width,
1148                 'height': height,
1149                 'preference': preference,
1150             })
1151         return formats
1152
1153     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1154         return {
1155             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1156             'url': m3u8_url,
1157             'ext': ext,
1158             'protocol': 'm3u8',
1159             'preference': preference - 100 if preference else -100,
1160             'resolution': 'multiple',
1161             'format_note': 'Quality selection URL',
1162         }
1163
1164     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1165                               entry_protocol='m3u8', preference=None,
1166                               m3u8_id=None, note=None, errnote=None,
1167                               fatal=True, live=False):
1168
1169         res = self._download_webpage_handle(
1170             m3u8_url, video_id,
1171             note=note or 'Downloading m3u8 information',
1172             errnote=errnote or 'Failed to download m3u8 information',
1173             fatal=fatal)
1174         if res is False:
1175             return []
1176         m3u8_doc, urlh = res
1177         m3u8_url = urlh.geturl()
1178
1179         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1180
1181         format_url = lambda u: (
1182             u
1183             if re.match(r'^https?://', u)
1184             else compat_urlparse.urljoin(m3u8_url, u))
1185
1186         # We should try extracting formats only from master playlists [1], i.e.
1187         # playlists that describe available qualities. On the other hand media
1188         # playlists [2] should be returned as is since they contain just the media
1189         # without qualities renditions.
1190         # Fortunately, master playlist can be easily distinguished from media
1191         # playlist based on particular tags availability. As of [1, 2] master
1192         # playlist tags MUST NOT appear in a media playist and vice versa.
1193         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1194         # and MUST NOT appear in master playlist thus we can clearly detect media
1195         # playlist with this criterion.
1196         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1197         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1198         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1199         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1200             return [{
1201                 'url': m3u8_url,
1202                 'format_id': m3u8_id,
1203                 'ext': ext,
1204                 'protocol': entry_protocol,
1205                 'preference': preference,
1206             }]
1207         last_info = {}
1208         last_media = {}
1209         for line in m3u8_doc.splitlines():
1210             if line.startswith('#EXT-X-STREAM-INF:'):
1211                 last_info = parse_m3u8_attributes(line)
1212             elif line.startswith('#EXT-X-MEDIA:'):
1213                 media = parse_m3u8_attributes(line)
1214                 media_type = media.get('TYPE')
1215                 if media_type in ('VIDEO', 'AUDIO'):
1216                     media_url = media.get('URI')
1217                     if media_url:
1218                         format_id = []
1219                         for v in (media.get('GROUP-ID'), media.get('NAME')):
1220                             if v:
1221                                 format_id.append(v)
1222                         formats.append({
1223                             'format_id': '-'.join(format_id),
1224                             'url': format_url(media_url),
1225                             'language': media.get('LANGUAGE'),
1226                             'vcodec': 'none' if media_type == 'AUDIO' else None,
1227                             'ext': ext,
1228                             'protocol': entry_protocol,
1229                             'preference': preference,
1230                         })
1231                     else:
1232                         # When there is no URI in EXT-X-MEDIA let this tag's
1233                         # data be used by regular URI lines below
1234                         last_media = media
1235             elif line.startswith('#') or not line.strip():
1236                 continue
1237             else:
1238                 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1239                 format_id = []
1240                 if m3u8_id:
1241                     format_id.append(m3u8_id)
1242                 # Despite specification does not mention NAME attribute for
1243                 # EXT-X-STREAM-INF it still sometimes may be present
1244                 stream_name = last_info.get('NAME') or last_media.get('NAME')
1245                 # Bandwidth of live streams may differ over time thus making
1246                 # format_id unpredictable. So it's better to keep provided
1247                 # format_id intact.
1248                 if not live:
1249                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1250                 f = {
1251                     'format_id': '-'.join(format_id),
1252                     'url': format_url(line.strip()),
1253                     'tbr': tbr,
1254                     'ext': ext,
1255                     'fps': float_or_none(last_info.get('FRAME-RATE')),
1256                     'protocol': entry_protocol,
1257                     'preference': preference,
1258                 }
1259                 resolution = last_info.get('RESOLUTION')
1260                 if resolution:
1261                     width_str, height_str = resolution.split('x')
1262                     f['width'] = int(width_str)
1263                     f['height'] = int(height_str)
1264                 # Unified Streaming Platform
1265                 mobj = re.search(
1266                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1267                 if mobj:
1268                     abr, vbr = mobj.groups()
1269                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1270                     f.update({
1271                         'vbr': vbr,
1272                         'abr': abr,
1273                     })
1274                 f.update(parse_codecs(last_info.get('CODECS')))
1275                 formats.append(f)
1276                 last_info = {}
1277                 last_media = {}
1278         return formats
1279
1280     @staticmethod
1281     def _xpath_ns(path, namespace=None):
1282         if not namespace:
1283             return path
1284         out = []
1285         for c in path.split('/'):
1286             if not c or c == '.':
1287                 out.append(c)
1288             else:
1289                 out.append('{%s}%s' % (namespace, c))
1290         return '/'.join(out)
1291
1292     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1293         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1294
1295         if smil is False:
1296             assert not fatal
1297             return []
1298
1299         namespace = self._parse_smil_namespace(smil)
1300
1301         return self._parse_smil_formats(
1302             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1303
1304     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1305         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1306         if smil is False:
1307             return {}
1308         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1309
1310     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1311         return self._download_xml(
1312             smil_url, video_id, 'Downloading SMIL file',
1313             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1314
1315     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1316         namespace = self._parse_smil_namespace(smil)
1317
1318         formats = self._parse_smil_formats(
1319             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1320         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1321
1322         video_id = os.path.splitext(url_basename(smil_url))[0]
1323         title = None
1324         description = None
1325         upload_date = None
1326         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1327             name = meta.attrib.get('name')
1328             content = meta.attrib.get('content')
1329             if not name or not content:
1330                 continue
1331             if not title and name == 'title':
1332                 title = content
1333             elif not description and name in ('description', 'abstract'):
1334                 description = content
1335             elif not upload_date and name == 'date':
1336                 upload_date = unified_strdate(content)
1337
1338         thumbnails = [{
1339             'id': image.get('type'),
1340             'url': image.get('src'),
1341             'width': int_or_none(image.get('width')),
1342             'height': int_or_none(image.get('height')),
1343         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1344
1345         return {
1346             'id': video_id,
1347             'title': title or video_id,
1348             'description': description,
1349             'upload_date': upload_date,
1350             'thumbnails': thumbnails,
1351             'formats': formats,
1352             'subtitles': subtitles,
1353         }
1354
1355     def _parse_smil_namespace(self, smil):
1356         return self._search_regex(
1357             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1358
1359     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1360         base = smil_url
1361         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1362             b = meta.get('base') or meta.get('httpBase')
1363             if b:
1364                 base = b
1365                 break
1366
1367         formats = []
1368         rtmp_count = 0
1369         http_count = 0
1370         m3u8_count = 0
1371
1372         srcs = []
1373         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1374         for medium in media:
1375             src = medium.get('src')
1376             if not src or src in srcs:
1377                 continue
1378             srcs.append(src)
1379
1380             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1381             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1382             width = int_or_none(medium.get('width'))
1383             height = int_or_none(medium.get('height'))
1384             proto = medium.get('proto')
1385             ext = medium.get('ext')
1386             src_ext = determine_ext(src)
1387             streamer = medium.get('streamer') or base
1388
1389             if proto == 'rtmp' or streamer.startswith('rtmp'):
1390                 rtmp_count += 1
1391                 formats.append({
1392                     'url': streamer,
1393                     'play_path': src,
1394                     'ext': 'flv',
1395                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1396                     'tbr': bitrate,
1397                     'filesize': filesize,
1398                     'width': width,
1399                     'height': height,
1400                 })
1401                 if transform_rtmp_url:
1402                     streamer, src = transform_rtmp_url(streamer, src)
1403                     formats[-1].update({
1404                         'url': streamer,
1405                         'play_path': src,
1406                     })
1407                 continue
1408
1409             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1410             src_url = src_url.strip()
1411
1412             if proto == 'm3u8' or src_ext == 'm3u8':
1413                 m3u8_formats = self._extract_m3u8_formats(
1414                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1415                 if len(m3u8_formats) == 1:
1416                     m3u8_count += 1
1417                     m3u8_formats[0].update({
1418                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1419                         'tbr': bitrate,
1420                         'width': width,
1421                         'height': height,
1422                     })
1423                 formats.extend(m3u8_formats)
1424                 continue
1425
1426             if src_ext == 'f4m':
1427                 f4m_url = src_url
1428                 if not f4m_params:
1429                     f4m_params = {
1430                         'hdcore': '3.2.0',
1431                         'plugin': 'flowplayer-3.2.0.1',
1432                     }
1433                 f4m_url += '&' if '?' in f4m_url else '?'
1434                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1435                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1436                 continue
1437
1438             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1439                 http_count += 1
1440                 formats.append({
1441                     'url': src_url,
1442                     'ext': ext or src_ext or 'flv',
1443                     'format_id': 'http-%d' % (bitrate or http_count),
1444                     'tbr': bitrate,
1445                     'filesize': filesize,
1446                     'width': width,
1447                     'height': height,
1448                 })
1449                 continue
1450
1451         return formats
1452
1453     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1454         urls = []
1455         subtitles = {}
1456         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1457             src = textstream.get('src')
1458             if not src or src in urls:
1459                 continue
1460             urls.append(src)
1461             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1462             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1463             subtitles.setdefault(lang, []).append({
1464                 'url': src,
1465                 'ext': ext,
1466             })
1467         return subtitles
1468
1469     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1470         xspf = self._download_xml(
1471             playlist_url, playlist_id, 'Downloading xpsf playlist',
1472             'Unable to download xspf manifest', fatal=fatal)
1473         if xspf is False:
1474             return []
1475         return self._parse_xspf(xspf, playlist_id)
1476
1477     def _parse_xspf(self, playlist, playlist_id):
1478         NS_MAP = {
1479             'xspf': 'http://xspf.org/ns/0/',
1480             's1': 'http://static.streamone.nl/player/ns/0',
1481         }
1482
1483         entries = []
1484         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1485             title = xpath_text(
1486                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1487             description = xpath_text(
1488                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1489             thumbnail = xpath_text(
1490                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1491             duration = float_or_none(
1492                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1493
1494             formats = [{
1495                 'url': location.text,
1496                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1497                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1498                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1499             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1500             self._sort_formats(formats)
1501
1502             entries.append({
1503                 'id': playlist_id,
1504                 'title': title,
1505                 'description': description,
1506                 'thumbnail': thumbnail,
1507                 'duration': duration,
1508                 'formats': formats,
1509             })
1510         return entries
1511
1512     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1513         res = self._download_webpage_handle(
1514             mpd_url, video_id,
1515             note=note or 'Downloading MPD manifest',
1516             errnote=errnote or 'Failed to download MPD manifest',
1517             fatal=fatal)
1518         if res is False:
1519             return []
1520         mpd, urlh = res
1521         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1522
1523         return self._parse_mpd_formats(
1524             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1525
1526     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1527         """
1528         Parse formats from MPD manifest.
1529         References:
1530          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1531             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1532          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1533         """
1534         if mpd_doc.get('type') == 'dynamic':
1535             return []
1536
1537         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1538
1539         def _add_ns(path):
1540             return self._xpath_ns(path, namespace)
1541
1542         def is_drm_protected(element):
1543             return element.find(_add_ns('ContentProtection')) is not None
1544
1545         def extract_multisegment_info(element, ms_parent_info):
1546             ms_info = ms_parent_info.copy()
1547             segment_list = element.find(_add_ns('SegmentList'))
1548             if segment_list is not None:
1549                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1550                 if segment_urls_e:
1551                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1552                 initialization = segment_list.find(_add_ns('Initialization'))
1553                 if initialization is not None:
1554                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1555             else:
1556                 segment_template = element.find(_add_ns('SegmentTemplate'))
1557                 if segment_template is not None:
1558                     start_number = segment_template.get('startNumber')
1559                     if start_number:
1560                         ms_info['start_number'] = int(start_number)
1561                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1562                     if segment_timeline is not None:
1563                         s_e = segment_timeline.findall(_add_ns('S'))
1564                         if s_e:
1565                             ms_info['total_number'] = 0
1566                             ms_info['s'] = []
1567                             for s in s_e:
1568                                 r = int(s.get('r', 0))
1569                                 ms_info['total_number'] += 1 + r
1570                                 ms_info['s'].append({
1571                                     't': int(s.get('t', 0)),
1572                                     # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1573                                     'd': int(s.attrib['d']),
1574                                     'r': r,
1575                                 })
1576                     else:
1577                         timescale = segment_template.get('timescale')
1578                         if timescale:
1579                             ms_info['timescale'] = int(timescale)
1580                         segment_duration = segment_template.get('duration')
1581                         if segment_duration:
1582                             ms_info['segment_duration'] = int(segment_duration)
1583                     media_template = segment_template.get('media')
1584                     if media_template:
1585                         ms_info['media_template'] = media_template
1586                     initialization = segment_template.get('initialization')
1587                     if initialization:
1588                         ms_info['initialization_url'] = initialization
1589                     else:
1590                         initialization = segment_template.find(_add_ns('Initialization'))
1591                         if initialization is not None:
1592                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1593             return ms_info
1594
1595         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1596         formats = []
1597         for period in mpd_doc.findall(_add_ns('Period')):
1598             period_duration = parse_duration(period.get('duration')) or mpd_duration
1599             period_ms_info = extract_multisegment_info(period, {
1600                 'start_number': 1,
1601                 'timescale': 1,
1602             })
1603             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1604                 if is_drm_protected(adaptation_set):
1605                     continue
1606                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1607                 for representation in adaptation_set.findall(_add_ns('Representation')):
1608                     if is_drm_protected(representation):
1609                         continue
1610                     representation_attrib = adaptation_set.attrib.copy()
1611                     representation_attrib.update(representation.attrib)
1612                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1613                     mime_type = representation_attrib['mimeType']
1614                     content_type = mime_type.split('/')[0]
1615                     if content_type == 'text':
1616                         # TODO implement WebVTT downloading
1617                         pass
1618                     elif content_type == 'video' or content_type == 'audio':
1619                         base_url = ''
1620                         for element in (representation, adaptation_set, period, mpd_doc):
1621                             base_url_e = element.find(_add_ns('BaseURL'))
1622                             if base_url_e is not None:
1623                                 base_url = base_url_e.text + base_url
1624                                 if re.match(r'^https?://', base_url):
1625                                     break
1626                         if mpd_base_url and not re.match(r'^https?://', base_url):
1627                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1628                                 mpd_base_url += '/'
1629                             base_url = mpd_base_url + base_url
1630                         representation_id = representation_attrib.get('id')
1631                         lang = representation_attrib.get('lang')
1632                         url_el = representation.find(_add_ns('BaseURL'))
1633                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1634                         f = {
1635                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1636                             'url': base_url,
1637                             'ext': mimetype2ext(mime_type),
1638                             'width': int_or_none(representation_attrib.get('width')),
1639                             'height': int_or_none(representation_attrib.get('height')),
1640                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1641                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1642                             'fps': int_or_none(representation_attrib.get('frameRate')),
1643                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1644                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1645                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1646                             'format_note': 'DASH %s' % content_type,
1647                             'filesize': filesize,
1648                         }
1649                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1650                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1651                             if 'total_number' not in representation_ms_info and 'segment_duration':
1652                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1653                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1654                             media_template = representation_ms_info['media_template']
1655                             media_template = media_template.replace('$RepresentationID$', representation_id)
1656                             media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
1657                             media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
1658                             media_template.replace('$$', '$')
1659
1660                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1661                             # can't be used at the same time
1662                             if '%(Number' in media_template:
1663                                 representation_ms_info['segment_urls'] = [
1664                                     media_template % {
1665                                         'Number': segment_number,
1666                                         'Bandwidth': representation_attrib.get('bandwidth'),
1667                                     }
1668                                     for segment_number in range(
1669                                         representation_ms_info['start_number'],
1670                                         representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1671                             else:
1672                                 representation_ms_info['segment_urls'] = []
1673                                 segment_time = 0
1674
1675                                 def add_segment_url():
1676                                     representation_ms_info['segment_urls'].append(
1677                                         media_template % {
1678                                             'Time': segment_time,
1679                                             'Bandwidth': representation_attrib.get('bandwidth'),
1680                                         }
1681                                     )
1682
1683                                 for num, s in enumerate(representation_ms_info['s']):
1684                                     segment_time = s.get('t') or segment_time
1685                                     add_segment_url()
1686                                     for r in range(s.get('r', 0)):
1687                                         segment_time += s['d']
1688                                         add_segment_url()
1689                                     segment_time += s['d']
1690                         if 'segment_urls' in representation_ms_info:
1691                             f.update({
1692                                 'segment_urls': representation_ms_info['segment_urls'],
1693                                 'protocol': 'http_dash_segments',
1694                             })
1695                             if 'initialization_url' in representation_ms_info:
1696                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1697                                 f.update({
1698                                     'initialization_url': initialization_url,
1699                                 })
1700                                 if not f.get('url'):
1701                                     f['url'] = initialization_url
1702                         try:
1703                             existing_format = next(
1704                                 fo for fo in formats
1705                                 if fo['format_id'] == representation_id)
1706                         except StopIteration:
1707                             full_info = formats_dict.get(representation_id, {}).copy()
1708                             full_info.update(f)
1709                             formats.append(full_info)
1710                         else:
1711                             existing_format.update(f)
1712                     else:
1713                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1714         return formats
1715
1716     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'):
1717         def absolute_url(video_url):
1718             return compat_urlparse.urljoin(base_url, video_url)
1719
1720         def parse_content_type(content_type):
1721             if not content_type:
1722                 return {}
1723             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
1724             if ctr:
1725                 mimetype, codecs = ctr.groups()
1726                 f = parse_codecs(codecs)
1727                 f['ext'] = mimetype2ext(mimetype)
1728                 return f
1729             return {}
1730
1731         def _media_formats(src, cur_media_type):
1732             full_url = absolute_url(src)
1733             if determine_ext(full_url) == 'm3u8':
1734                 is_plain_url = False
1735                 formats = self._extract_m3u8_formats(
1736                     full_url, video_id, ext='mp4',
1737                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
1738             else:
1739                 is_plain_url = True
1740                 formats = [{
1741                     'url': full_url,
1742                     'vcodec': 'none' if cur_media_type == 'audio' else None,
1743                 }]
1744             return is_plain_url, formats
1745
1746         entries = []
1747         for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
1748             media_info = {
1749                 'formats': [],
1750                 'subtitles': {},
1751             }
1752             media_attributes = extract_attributes(media_tag)
1753             src = media_attributes.get('src')
1754             if src:
1755                 _, formats = _media_formats(src, media_type)
1756                 media_info['formats'].extend(formats)
1757             media_info['thumbnail'] = media_attributes.get('poster')
1758             if media_content:
1759                 for source_tag in re.findall(r'<source[^>]+>', media_content):
1760                     source_attributes = extract_attributes(source_tag)
1761                     src = source_attributes.get('src')
1762                     if not src:
1763                         continue
1764                     is_plain_url, formats = _media_formats(src, media_type)
1765                     if is_plain_url:
1766                         f = parse_content_type(source_attributes.get('type'))
1767                         f.update(formats[0])
1768                         media_info['formats'].append(f)
1769                     else:
1770                         media_info['formats'].extend(formats)
1771                 for track_tag in re.findall(r'<track[^>]+>', media_content):
1772                     track_attributes = extract_attributes(track_tag)
1773                     kind = track_attributes.get('kind')
1774                     if not kind or kind == 'subtitles':
1775                         src = track_attributes.get('src')
1776                         if not src:
1777                             continue
1778                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
1779                         media_info['subtitles'].setdefault(lang, []).append({
1780                             'url': absolute_url(src),
1781                         })
1782             if media_info['formats']:
1783                 entries.append(media_info)
1784         return entries
1785
1786     def _extract_akamai_formats(self, manifest_url, video_id):
1787         formats = []
1788         f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
1789         formats.extend(self._extract_f4m_formats(
1790             update_url_query(f4m_url, {'hdcore': '3.7.0'}),
1791             video_id, f4m_id='hds', fatal=False))
1792         m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
1793         formats.extend(self._extract_m3u8_formats(
1794             m3u8_url, video_id, 'mp4', 'm3u8_native',
1795             m3u8_id='hls', fatal=False))
1796         return formats
1797
1798     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
1799         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
1800         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
1801         http_base_url = 'http' + url_base
1802         formats = []
1803         if 'm3u8' not in skip_protocols:
1804             formats.extend(self._extract_m3u8_formats(
1805                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
1806                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
1807         if 'f4m' not in skip_protocols:
1808             formats.extend(self._extract_f4m_formats(
1809                 http_base_url + '/manifest.f4m',
1810                 video_id, f4m_id='hds', fatal=False))
1811         if re.search(r'(?:/smil:|\.smil)', url_base):
1812             if 'dash' not in skip_protocols:
1813                 formats.extend(self._extract_mpd_formats(
1814                     http_base_url + '/manifest.mpd',
1815                     video_id, mpd_id='dash', fatal=False))
1816             if 'smil' not in skip_protocols:
1817                 rtmp_formats = self._extract_smil_formats(
1818                     http_base_url + '/jwplayer.smil',
1819                     video_id, fatal=False)
1820                 for rtmp_format in rtmp_formats:
1821                     rtsp_format = rtmp_format.copy()
1822                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
1823                     del rtsp_format['play_path']
1824                     del rtsp_format['ext']
1825                     rtsp_format.update({
1826                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
1827                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
1828                         'protocol': 'rtsp',
1829                     })
1830                     formats.extend([rtmp_format, rtsp_format])
1831         else:
1832             for protocol in ('rtmp', 'rtsp'):
1833                 if protocol not in skip_protocols:
1834                     formats.append({
1835                         'url': protocol + url_base,
1836                         'format_id': protocol,
1837                         'protocol': protocol,
1838                     })
1839         return formats
1840
1841     def _live_title(self, name):
1842         """ Generate the title for a live video """
1843         now = datetime.datetime.now()
1844         now_str = now.strftime('%Y-%m-%d %H:%M')
1845         return name + ' ' + now_str
1846
1847     def _int(self, v, name, fatal=False, **kwargs):
1848         res = int_or_none(v, **kwargs)
1849         if 'get_attr' in kwargs:
1850             print(getattr(v, kwargs['get_attr']))
1851         if res is None:
1852             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1853             if fatal:
1854                 raise ExtractorError(msg)
1855             else:
1856                 self._downloader.report_warning(msg)
1857         return res
1858
1859     def _float(self, v, name, fatal=False, **kwargs):
1860         res = float_or_none(v, **kwargs)
1861         if res is None:
1862             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1863             if fatal:
1864                 raise ExtractorError(msg)
1865             else:
1866                 self._downloader.report_warning(msg)
1867         return res
1868
1869     def _set_cookie(self, domain, name, value, expire_time=None):
1870         cookie = compat_cookiejar.Cookie(
1871             0, name, value, None, None, domain, None,
1872             None, '/', True, False, expire_time, '', None, None, None)
1873         self._downloader.cookiejar.set_cookie(cookie)
1874
1875     def _get_cookies(self, url):
1876         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1877         req = sanitized_Request(url)
1878         self._downloader.cookiejar.add_cookie_header(req)
1879         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1880
1881     def get_testcases(self, include_onlymatching=False):
1882         t = getattr(self, '_TEST', None)
1883         if t:
1884             assert not hasattr(self, '_TESTS'), \
1885                 '%s has _TEST and _TESTS' % type(self).__name__
1886             tests = [t]
1887         else:
1888             tests = getattr(self, '_TESTS', [])
1889         for t in tests:
1890             if not include_onlymatching and t.get('only_matching', False):
1891                 continue
1892             t['name'] = type(self).__name__[:-len('IE')]
1893             yield t
1894
1895     def is_suitable(self, age_limit):
1896         """ Test whether the extractor is generally suitable for the given
1897         age limit (i.e. pornographic sites are not, all others usually are) """
1898
1899         any_restricted = False
1900         for tc in self.get_testcases(include_onlymatching=False):
1901             if tc.get('playlist', []):
1902                 tc = tc['playlist'][0]
1903             is_restricted = age_restricted(
1904                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1905             if not is_restricted:
1906                 return True
1907             any_restricted = any_restricted or is_restricted
1908         return not any_restricted
1909
1910     def extract_subtitles(self, *args, **kwargs):
1911         if (self._downloader.params.get('writesubtitles', False) or
1912                 self._downloader.params.get('listsubtitles')):
1913             return self._get_subtitles(*args, **kwargs)
1914         return {}
1915
1916     def _get_subtitles(self, *args, **kwargs):
1917         raise NotImplementedError('This method must be implemented by subclasses')
1918
1919     @staticmethod
1920     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1921         """ Merge subtitle items for one language. Items with duplicated URLs
1922         will be dropped. """
1923         list1_urls = set([item['url'] for item in subtitle_list1])
1924         ret = list(subtitle_list1)
1925         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1926         return ret
1927
1928     @classmethod
1929     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1930         """ Merge two subtitle dictionaries, language by language. """
1931         ret = dict(subtitle_dict1)
1932         for lang in subtitle_dict2:
1933             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1934         return ret
1935
1936     def extract_automatic_captions(self, *args, **kwargs):
1937         if (self._downloader.params.get('writeautomaticsub', False) or
1938                 self._downloader.params.get('listsubtitles')):
1939             return self._get_automatic_captions(*args, **kwargs)
1940         return {}
1941
1942     def _get_automatic_captions(self, *args, **kwargs):
1943         raise NotImplementedError('This method must be implemented by subclasses')
1944
1945     def mark_watched(self, *args, **kwargs):
1946         if (self._downloader.params.get('mark_watched', False) and
1947                 (self._get_login_info()[0] is not None or
1948                     self._downloader.params.get('cookiefile') is not None)):
1949             self._mark_watched(*args, **kwargs)
1950
1951     def _mark_watched(self, *args, **kwargs):
1952         raise NotImplementedError('This method must be implemented by subclasses')
1953
1954     def geo_verification_headers(self):
1955         headers = {}
1956         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
1957         if geo_verification_proxy:
1958             headers['Ytdl-request-proxy'] = geo_verification_proxy
1959         return headers
1960
1961
1962 class SearchInfoExtractor(InfoExtractor):
1963     """
1964     Base class for paged search queries extractors.
1965     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1966     Instances should define _SEARCH_KEY and _MAX_RESULTS.
1967     """
1968
1969     @classmethod
1970     def _make_valid_url(cls):
1971         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1972
1973     @classmethod
1974     def suitable(cls, url):
1975         return re.match(cls._make_valid_url(), url) is not None
1976
1977     def _real_extract(self, query):
1978         mobj = re.match(self._make_valid_url(), query)
1979         if mobj is None:
1980             raise ExtractorError('Invalid search query "%s"' % query)
1981
1982         prefix = mobj.group('prefix')
1983         query = mobj.group('query')
1984         if prefix == '':
1985             return self._get_n_results(query, 1)
1986         elif prefix == 'all':
1987             return self._get_n_results(query, self._MAX_RESULTS)
1988         else:
1989             n = int(prefix)
1990             if n <= 0:
1991                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1992             elif n > self._MAX_RESULTS:
1993                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1994                 n = self._MAX_RESULTS
1995             return self._get_n_results(query, n)
1996
1997     def _get_n_results(self, query, n):
1998         """Get a specified number of results for a query"""
1999         raise NotImplementedError('This method must be implemented by subclasses')
2000
2001     @property
2002     def SEARCH_KEY(self):
2003         return self._SEARCH_KEY