git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_urlencode,
  25     compat_urllib_request,
  26     compat_urlparse,
  27 )
  28 from ..downloader.f4m import remove_encrypted_media
  29 from ..utils import (
  30     NO_DEFAULT,
  31     age_restricted,
  32     bug_reports_message,
  33     clean_html,
  34     compiled_regex_type,
  35     determine_ext,
  36     error_to_compat_str,
  37     ExtractorError,
  38     fix_xml_ampersands,
  39     float_or_none,
  40     int_or_none,
  41     parse_iso8601,
  42     RegexNotFoundError,
  43     sanitize_filename,
  44     sanitized_Request,
  45     unescapeHTML,
  46     unified_strdate,
  47     unified_timestamp,
  48     url_basename,
  49     xpath_element,
  50     xpath_text,
  51     xpath_with_ns,
  52     determine_protocol,
  53     parse_duration,
  54     mimetype2ext,
  55     update_Request,
  56     update_url_query,
  57     parse_m3u8_attributes,
  58     extract_attributes,
  59     parse_codecs,
  60 )
  61
  62
  63 class InfoExtractor(object):
  64     """Information Extractor class.
  65
  66     Information extractors are the classes that, given a URL, extract
  67     information about the video (or videos) the URL refers to. This
  68     information includes the real video URL, the video title, author and
  69     others. The information is stored in a dictionary which is then
  70     passed to the YoutubeDL. The YoutubeDL processes this
  71     information possibly downloading the video to the file system, among
  72     other possible outcomes.
  73
  74     The type field determines the type of the result.
  75     By far the most common value (and the default if _type is missing) is
  76     "video", which indicates a single video.
  77
  78     For a video, the dictionaries must include the following fields:
  79
  80     id:             Video identifier.
  81     title:          Video title, unescaped.
  82
  83     Additionally, it must contain either a formats entry or a url one:
  84
  85     formats:        A list of dictionaries for each format available, ordered
  86                     from worst to best quality.
  87
  88                     Potential fields:
  89                     * url        Mandatory. The URL of the video file or URL of
  90                                  the manifest file in case of fragmented media
  91                                  (DASH, hls, hds).
  92                     * ext        Will be calculated from URL if missing
  93                     * format     A human-readable description of the format
  94                                  ("mp4 container with h264/opus").
   95                                  Calculated from the format_id, width, height,
  96                                  and format_note fields if missing.
  97                     * format_id  A short description of the format
  98                                  ("mp4_h264_opus" or "19").
  99                                 Technically optional, but strongly recommended.
 100                     * format_note Additional info about the format
 101                                  ("3D" or "DASH video")
 102                     * width      Width of the video, if known
 103                     * height     Height of the video, if known
 104                     * resolution Textual description of width and height
 105                     * tbr        Average bitrate of audio and video in KBit/s
 106                     * abr        Average audio bitrate in KBit/s
 107                     * acodec     Name of the audio codec in use
 108                     * asr        Audio sampling rate in Hertz
 109                     * vbr        Average video bitrate in KBit/s
 110                     * fps        Frame rate
 111                     * vcodec     Name of the video codec in use
 112                     * container  Name of the container format
 113                     * filesize   The number of bytes, if known in advance
 114                     * filesize_approx  An estimate for the number of bytes
 115                     * player_url SWF Player URL (used for rtmpdump).
 116                     * protocol   The protocol that will be used for the actual
 117                                  download, lower-case.
 118                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 119                                  "m3u8", "m3u8_native" or "http_dash_segments".
 120                     * fragments  A list of fragments of the fragmented media,
 121                                  with the following entries:
 122                                  * "url" (mandatory) - fragment's URL
 123                                  * "duration" (optional, int or float)
 124                                  * "filesize" (optional, int)
 125                     * preference Order number of this format. If this field is
 126                                  present and not None, the formats get sorted
 127                                  by this field, regardless of all other values.
 128                                  -1 for default (order by other properties),
 129                                  -2 or smaller for less than default.
 130                                  < -1000 to hide the format (if there is
 131                                     another one which is strictly better)
 132                     * language   Language code, e.g. "de" or "en-US".
 133                     * language_preference  Is this in the language mentioned in
 134                                  the URL?
 135                                  10 if it's what the URL is about,
 136                                  -1 for default (don't know),
 137                                  -10 otherwise, other values reserved for now.
 138                     * quality    Order number of the video quality of this
 139                                  format, irrespective of the file format.
 140                                  -1 for default (order by other properties),
 141                                  -2 or smaller for less than default.
 142                     * source_preference  Order number for this video source
 143                                   (quality takes higher priority)
 144                                  -1 for default (order by other properties),
 145                                  -2 or smaller for less than default.
 146                     * http_headers  A dictionary of additional HTTP headers
 147                                  to add to the request.
 148                     * stretched_ratio  If given and not 1, indicates that the
 149                                  video's pixels are not square.
 150                                  width : height ratio as float.
 151                     * no_resume  The server does not support resuming the
 152                                  (HTTP or RTMP) download. Boolean.
 153
 154     url:            Final video URL.
 155     ext:            Video filename extension.
 156     format:         The video format, defaults to ext (used for --get-format)
 157     player_url:     SWF Player URL (used for rtmpdump).
 158
 159     The following fields are optional:
 160
 161     alt_title:      A secondary title of the video.
 162     display_id      An alternative identifier for the video, not necessarily
 163                     unique, but available before title. Typically, id is
 164                     something like "4234987", title "Dancing naked mole rats",
 165                     and display_id "dancing-naked-mole-rats"
 166     thumbnails:     A list of dictionaries, with the following entries:
 167                         * "id" (optional, string) - Thumbnail format ID
 168                         * "url"
 169                         * "preference" (optional, int) - quality of the image
 170                         * "width" (optional, int)
 171                         * "height" (optional, int)
  172                         * "resolution" (optional, string "{width}x{height}",
 173                                         deprecated)
 174                         * "filesize" (optional, int)
 175     thumbnail:      Full URL to a video thumbnail image.
 176     description:    Full video description.
 177     uploader:       Full name of the video uploader.
 178     license:        License name the video is licensed under.
 179     creator:        The creator of the video.
 180     release_date:   The date (YYYYMMDD) when the video was released.
 181     timestamp:      UNIX timestamp of the moment the video became available.
 182     upload_date:    Video upload date (YYYYMMDD).
 183                     If not explicitly set, calculated from timestamp.
 184     uploader_id:    Nickname or id of the video uploader.
 185     uploader_url:   Full URL to a personal webpage of the video uploader.
 186     location:       Physical location where the video was filmed.
 187     subtitles:      The available subtitles as a dictionary in the format
 188                     {language: subformats}. "subformats" is a list sorted from
 189                     lower to higher preference, each element is a dictionary
 190                     with the "ext" entry and one of:
 191                         * "data": The subtitles file contents
 192                         * "url": A URL pointing to the subtitles file
 193                     "ext" will be calculated from URL if missing
 194     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 195                     automatically generated captions
 196     duration:       Length of the video in seconds, as an integer or float.
 197     view_count:     How many users have watched the video on the platform.
 198     like_count:     Number of positive ratings of the video
 199     dislike_count:  Number of negative ratings of the video
 200     repost_count:   Number of reposts of the video
  201     average_rating: Average rating given by users, the scale used depends on the webpage
 202     comment_count:  Number of comments on the video
 203     comments:       A list of comments, each with one or more of the following
 204                     properties (all but one of text or html optional):
 205                         * "author" - human-readable name of the comment author
 206                         * "author_id" - user ID of the comment author
 207                         * "id" - Comment ID
 208                         * "html" - Comment as HTML
 209                         * "text" - Plain text of the comment
 210                         * "timestamp" - UNIX timestamp of comment
 211                         * "parent" - ID of the comment this one is replying to.
 212                                      Set to "root" to indicate that this is a
 213                                      comment to the original video.
 214     age_limit:      Age restriction for the video, as an integer (years)
 215     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 216                     should allow to get the same result again. (It will be set
 217                     by YoutubeDL if it's missing)
 218     categories:     A list of categories that the video falls in, for example
 219                     ["Sports", "Berlin"]
 220     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 221     is_live:        True, False, or None (=unknown). Whether this video is a
 222                     live stream that goes on instead of a fixed-length video.
 223     start_time:     Time in seconds where the reproduction should start, as
 224                     specified in the URL.
 225     end_time:       Time in seconds where the reproduction should end, as
 226                     specified in the URL.
 227
 228     The following fields should only be used when the video belongs to some logical
 229     chapter or section:
 230
 231     chapter:        Name or title of the chapter the video belongs to.
 232     chapter_number: Number of the chapter the video belongs to, as an integer.
 233     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 234
 235     The following fields should only be used when the video is an episode of some
 236     series or programme:
 237
 238     series:         Title of the series or programme the video episode belongs to.
 239     season:         Title of the season the video episode belongs to.
 240     season_number:  Number of the season the video episode belongs to, as an integer.
 241     season_id:      Id of the season the video episode belongs to, as a unicode string.
 242     episode:        Title of the video episode. Unlike mandatory video title field,
 243                     this field should denote the exact title of the video episode
 244                     without any kind of decoration.
 245     episode_number: Number of the video episode within a season, as an integer.
 246     episode_id:     Id of the video episode, as a unicode string.
 247
 248     The following fields should only be used when the media is a track or a part of
 249     a music album:
 250
 251     track:          Title of the track.
 252     track_number:   Number of the track within an album or a disc, as an integer.
 253     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 254                     as a unicode string.
 255     artist:         Artist(s) of the track.
 256     genre:          Genre(s) of the track.
 257     album:          Title of the album the track belongs to.
 258     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 259     album_artist:   List of all artists appeared on the album (e.g.
 260                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 261                     and compilations).
 262     disc_number:    Number of the disc or other physical medium the track belongs to,
 263                     as an integer.
 264     release_year:   Year (YYYY) when the album was released.
 265
 266     Unless mentioned otherwise, the fields should be Unicode strings.
 267
 268     Unless mentioned otherwise, None is equivalent to absence of information.
 269
 270
 271     _type "playlist" indicates multiple videos.
 272     There must be a key "entries", which is a list, an iterable, or a PagedList
 273     object, each element of which is a valid dictionary by this specification.
 274
 275     Additionally, playlists can have "title", "description" and "id" attributes
 276     with the same semantics as videos (see above).
 277
 278
 279     _type "multi_video" indicates that there are multiple videos that
  280     form a single show, for example multiple acts of an opera or TV episode.
 281     It must have an entries key like a playlist and contain all the keys
 282     required for a video at the same time.
 283
 284
 285     _type "url" indicates that the video must be extracted from another
 286     location, possibly by a different extractor. Its only required key is:
 287     "url" - the next URL to extract.
 288     The key "ie_key" can be set to the class name (minus the trailing "IE",
 289     e.g. "Youtube") if the extractor class is known in advance.
 290     Additionally, the dictionary may have any properties of the resolved entity
 291     known in advance, for example "title" if the title of the referred video is
 292     known ahead of time.
 293
 294
 295     _type "url_transparent" entities have the same specification as "url", but
 296     indicate that the given additional information is more precise than the one
 297     associated with the resolved URL.
 298     This is useful when a site employs a video service that hosts the video and
 299     its technical metadata, but that video service does not embed a useful
 300     title, description etc.
 301
 302
 303     Subclasses of this one should re-define the _real_initialize() and
 304     _real_extract() methods and define a _VALID_URL regexp.
 305     Probably, they should also be added to the list of extractors.
 306
 307     Finally, the _WORKING attribute should be set to False for broken IEs
 308     in order to warn the users and skip the tests.
 309     """
 310
    # True once _real_initialize() has run for this instance (see
    # initialize()); reset to False for every new instance in __init__.
    _ready = False
    # Downloader object this extractor talks to; assigned via set_downloader().
    _downloader = None
    # Set to False in subclasses for broken extractors; exposed by working().
    _WORKING = True
 314
 315     def __init__(self, downloader=None):
 316         """Constructor. Receives an optional downloader."""
 317         self._ready = False
 318         self.set_downloader(downloader)
 319
 320     @classmethod
 321     def suitable(cls, url):
 322         """Receives a URL and returns True if suitable for this IE."""
 323
 324         # This does not use has/getattr intentionally - we want to know whether
 325         # we have cached the regexp for *this* class, whereas getattr would also
 326         # match the superclass
 327         if '_VALID_URL_RE' not in cls.__dict__:
 328             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 329         return cls._VALID_URL_RE.match(url) is not None
 330
 331     @classmethod
 332     def _match_id(cls, url):
 333         if '_VALID_URL_RE' not in cls.__dict__:
 334             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 335         m = cls._VALID_URL_RE.match(url)
 336         assert m
 337         return m.group('id')
 338
 339     @classmethod
 340     def working(cls):
 341         """Getter method for _WORKING."""
 342         return cls._WORKING
 343
 344     def initialize(self):
 345         """Initializes an instance (authentication, etc)."""
 346         if not self._ready:
 347             self._real_initialize()
 348             self._ready = True
 349
 350     def extract(self, url):
 351         """Extracts URL information and returns it in list of dicts."""
 352         try:
 353             self.initialize()
 354             return self._real_extract(url)
 355         except ExtractorError:
 356             raise
 357         except compat_http_client.IncompleteRead as e:
 358             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 359         except (KeyError, StopIteration) as e:
 360             raise ExtractorError('An extractor error has occurred.', cause=e)
 361
 362     def set_downloader(self, downloader):
 363         """Sets the downloader for this IE."""
 364         self._downloader = downloader
 365
 366     def _real_initialize(self):
 367         """Real initialization process. Redefine in subclasses."""
 368         pass
 369
 370     def _real_extract(self, url):
 371         """Real extraction process. Redefine in subclasses."""
 372         pass
 373
 374     @classmethod
 375     def ie_key(cls):
 376         """A string for getting the InfoExtractor with get_info_extractor"""
 377         return compat_str(cls.__name__[:-2])
 378
 379     @property
 380     def IE_NAME(self):
 381         return compat_str(type(self).__name__[:-2])
 382
 383     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 384         """ Returns the response handle """
 385         if note is None:
 386             self.report_download_webpage(video_id)
 387         elif note is not False:
 388             if video_id is None:
 389                 self.to_screen('%s' % (note,))
 390             else:
 391                 self.to_screen('%s: %s' % (video_id, note))
 392         if isinstance(url_or_request, compat_urllib_request.Request):
 393             url_or_request = update_Request(
 394                 url_or_request, data=data, headers=headers, query=query)
 395         else:
 396             if query:
 397                 url_or_request = update_url_query(url_or_request, query)
 398             if data is not None or headers:
 399                 url_or_request = sanitized_Request(url_or_request, data, headers)
 400         try:
 401             return self._downloader.urlopen(url_or_request)
 402         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 403             if errnote is False:
 404                 return False
 405             if errnote is None:
 406                 errnote = 'Unable to download webpage'
 407
 408             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 409             if fatal:
 410                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 411             else:
 412                 self._downloader.report_warning(errmsg)
 413                 return False
 414
 415     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 416         """ Returns a tuple (page content as string, URL handle) """
 417         # Strip hashes from the URL (#1038)
 418         if isinstance(url_or_request, (compat_str, str)):
 419             url_or_request = url_or_request.partition('#')[0]
 420
 421         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 422         if urlh is False:
 423             assert not fatal
 424             return False
 425         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 426         return (content, urlh)
 427
 428     @staticmethod
 429     def _guess_encoding_from_content(content_type, webpage_bytes):
 430         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 431         if m:
 432             encoding = m.group(1)
 433         else:
 434             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 435                           webpage_bytes[:1024])
 436             if m:
 437                 encoding = m.group(1).decode('ascii')
 438             elif webpage_bytes.startswith(b'\xff\xfe'):
 439                 encoding = 'utf-16'
 440             else:
 441                 encoding = 'utf-8'
 442
 443         return encoding
 444
 445     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 446         content_type = urlh.headers.get('Content-Type', '')
 447         webpage_bytes = urlh.read()
 448         if prefix is not None:
 449             webpage_bytes = prefix + webpage_bytes
 450         if not encoding:
 451             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 452         if self._downloader.params.get('dump_intermediate_pages', False):
 453             try:
 454                 url = url_or_request.get_full_url()
 455             except AttributeError:
 456                 url = url_or_request
 457             self.to_screen('Dumping request to ' + url)
 458             dump = base64.b64encode(webpage_bytes).decode('ascii')
 459             self._downloader.to_screen(dump)
 460         if self._downloader.params.get('write_pages', False):
 461             try:
 462                 url = url_or_request.get_full_url()
 463             except AttributeError:
 464                 url = url_or_request
 465             basen = '%s_%s' % (video_id, url)
 466             if len(basen) > 240:
 467                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 468                 basen = basen[:240 - len(h)] + h
 469             raw_filename = basen + '.dump'
 470             filename = sanitize_filename(raw_filename, restricted=True)
 471             self.to_screen('Saving request to ' + filename)
 472             # Working around MAX_PATH limitation on Windows (see
 473             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 474             if compat_os_name == 'nt':
 475                 absfilepath = os.path.abspath(filename)
 476                 if len(absfilepath) > 259:
 477                     filename = '\\\\?\\' + absfilepath
 478             with open(filename, 'wb') as outf:
 479                 outf.write(webpage_bytes)
 480
 481         try:
 482             content = webpage_bytes.decode(encoding, 'replace')
 483         except LookupError:
 484             content = webpage_bytes.decode('utf-8', 'replace')
 485
 486         if ('<title>Access to this site is blocked</title>' in content and
 487                 'Websense' in content[:512]):
 488             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 489             blocked_iframe = self._html_search_regex(
 490                 r'<iframe src="([^"]+)"', content,
 491                 'Websense information URL', default=None)
 492             if blocked_iframe:
 493                 msg += ' Visit %s for more details' % blocked_iframe
 494             raise ExtractorError(msg, expected=True)
 495         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 496             msg = (
 497                 'Access to this webpage has been blocked by Indian censorship. '
 498                 'Use a VPN or proxy server (with --proxy) to route around it.')
 499             block_msg = self._html_search_regex(
 500                 r'</h1><p>(.*?)</p>',
 501                 content, 'block message', default=None)
 502             if block_msg:
 503                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 504             raise ExtractorError(msg, expected=True)
 505
 506         return content
 507
 508     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 509         """ Returns the data of the page as a string """
 510         success = False
 511         try_count = 0
 512         while success is False:
 513             try:
 514                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 515                 success = True
 516             except compat_http_client.IncompleteRead as e:
 517                 try_count += 1
 518                 if try_count >= tries:
 519                     raise e
 520                 self._sleep(timeout, video_id)
 521         if res is False:
 522             return res
 523         else:
 524             content, _ = res
 525             return content
 526
 527     def _download_xml(self, url_or_request, video_id,
 528                       note='Downloading XML', errnote='Unable to download XML',
 529                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 530         """Return the xml as an xml.etree.ElementTree.Element"""
 531         xml_string = self._download_webpage(
 532             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 533         if xml_string is False:
 534             return xml_string
 535         if transform_source:
 536             xml_string = transform_source(xml_string)
 537         return compat_etree_fromstring(xml_string.encode('utf-8'))
 538
 539     def _download_json(self, url_or_request, video_id,
 540                        note='Downloading JSON metadata',
 541                        errnote='Unable to download JSON metadata',
 542                        transform_source=None,
 543                        fatal=True, encoding=None, data=None, headers={}, query={}):
 544         json_string = self._download_webpage(
 545             url_or_request, video_id, note, errnote, fatal=fatal,
 546             encoding=encoding, data=data, headers=headers, query=query)
 547         if (not fatal) and json_string is False:
 548             return None
 549         return self._parse_json(
 550             json_string, video_id, transform_source=transform_source, fatal=fatal)
 551
 552     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 553         if transform_source:
 554             json_string = transform_source(json_string)
 555         try:
 556             return json.loads(json_string)
 557         except ValueError as ve:
 558             errmsg = '%s: Failed to parse JSON ' % video_id
 559             if fatal:
 560                 raise ExtractorError(errmsg, cause=ve)
 561             else:
 562                 self.report_warning(errmsg + str(ve))
 563
 564     def report_warning(self, msg, video_id=None):
 565         idstr = '' if video_id is None else '%s: ' % video_id
 566         self._downloader.report_warning(
 567             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 568
 569     def to_screen(self, msg):
 570         """Print msg to screen, prefixing it with '[ie_name]'"""
 571         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 572
 573     def report_extraction(self, id_or_name):
 574         """Report information extraction."""
 575         self.to_screen('%s: Extracting information' % id_or_name)
 576
 577     def report_download_webpage(self, video_id):
 578         """Report webpage download."""
 579         self.to_screen('%s: Downloading webpage' % video_id)
 580
 581     def report_age_confirmation(self):
 582         """Report attempt to confirm age."""
 583         self.to_screen('Confirming age')
 584
 585     def report_login(self):
 586         """Report attempt to log in."""
 587         self.to_screen('Logging in')
 588
 589     @staticmethod
 590     def raise_login_required(msg='This video is only available for registered users'):
 591         raise ExtractorError(
 592             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 593             expected=True)
 594
 595     @staticmethod
 596     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 597         raise ExtractorError(
 598             '%s. You might want to use --proxy to workaround.' % msg,
 599             expected=True)
 600
 601     # Methods for following #608
 602     @staticmethod
 603     def url_result(url, ie=None, video_id=None, video_title=None):
 604         """Returns a URL that points to a page that should be processed"""
 605         # TODO: ie should be the class used for getting the info
 606         video_info = {'_type': 'url',
 607                       'url': url,
 608                       'ie_key': ie}
 609         if video_id is not None:
 610             video_info['id'] = video_id
 611         if video_title is not None:
 612             video_info['title'] = video_title
 613         return video_info
 614
 615     @staticmethod
 616     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 617         """Returns a playlist"""
 618         video_info = {'_type': 'playlist',
 619                       'entries': entries}
 620         if playlist_id:
 621             video_info['id'] = playlist_id
 622         if playlist_title:
 623             video_info['title'] = playlist_title
 624         if playlist_description:
 625             video_info['description'] = playlist_description
 626         return video_info
 627
 628     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 629         """
 630         Perform a regex search on the given string, using a single or a list of
 631         patterns returning the first matching group.
 632         In case of failure return a default value or raise a WARNING or a
 633         RegexNotFoundError, depending on fatal, specifying the field name.
 634         """
 635         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 636             mobj = re.search(pattern, string, flags)
 637         else:
 638             for p in pattern:
 639                 mobj = re.search(p, string, flags)
 640                 if mobj:
 641                     break
 642
 643         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 644             _name = '\033[0;34m%s\033[0m' % name
 645         else:
 646             _name = name
 647
 648         if mobj:
 649             if group is None:
 650                 # return the first matching group
 651                 return next(g for g in mobj.groups() if g is not None)
 652             else:
 653                 return mobj.group(group)
 654         elif default is not NO_DEFAULT:
 655             return default
 656         elif fatal:
 657             raise RegexNotFoundError('Unable to extract %s' % _name)
 658         else:
 659             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 660             return None
 661
 662     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 663         """
 664         Like _search_regex, but strips HTML tags and unescapes entities.
 665         """
 666         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 667         if res:
 668             return clean_html(res).strip()
 669         else:
 670             return res
 671
 672     def _get_netrc_login_info(self, netrc_machine=None):
 673         username = None
 674         password = None
 675         netrc_machine = netrc_machine or self._NETRC_MACHINE
 676
 677         if self._downloader.params.get('usenetrc', False):
 678             try:
 679                 info = netrc.netrc().authenticators(netrc_machine)
 680                 if info is not None:
 681                     username = info[0]
 682                     password = info[2]
 683                 else:
 684                     raise netrc.NetrcParseError(
 685                         'No authenticators for %s' % netrc_machine)
 686             except (IOError, netrc.NetrcParseError) as err:
 687                 self._downloader.report_warning(
 688                     'parsing .netrc: %s' % error_to_compat_str(err))
 689
 690         return username, password
 691
 692     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 693         """
 694         Get the login info as (username, password)
 695         First look for the manually specified credentials using username_option
 696         and password_option as keys in params dictionary. If no such credentials
 697         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 698         value.
 699         If there's no info available, return (None, None)
 700         """
 701         if self._downloader is None:
 702             return (None, None)
 703
 704         downloader_params = self._downloader.params
 705
 706         # Attempt to use provided username and password or .netrc data
 707         if downloader_params.get(username_option) is not None:
 708             username = downloader_params[username_option]
 709             password = downloader_params[password_option]
 710         else:
 711             username, password = self._get_netrc_login_info(netrc_machine)
 712
 713         return username, password
 714
 715     def _get_tfa_info(self, note='two-factor verification code'):
 716         """
 717         Get the two-factor authentication info
 718         TODO - asking the user will be required for sms/phone verify
 719         currently just uses the command line option
 720         If there's no info available, return None
 721         """
 722         if self._downloader is None:
 723             return None
 724         downloader_params = self._downloader.params
 725
 726         if downloader_params.get('twofactor') is not None:
 727             return downloader_params['twofactor']
 728
 729         return compat_getpass('Type %s and press [Return]: ' % note)
 730
 731     # Helper functions for extracting OpenGraph info
 732     @staticmethod
 733     def _og_regexes(prop):
 734         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 735         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 736                        % {'prop': re.escape(prop)})
 737         template = r'<meta[^>]+?%s[^>]+?%s'
 738         return [
 739             template % (property_re, content_re),
 740             template % (content_re, property_re),
 741         ]
 742
 743     @staticmethod
 744     def _meta_regex(prop):
 745         return r'''(?isx)<meta
 746                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 747                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 748
 749     def _og_search_property(self, prop, html, name=None, **kargs):
 750         if not isinstance(prop, (list, tuple)):
 751             prop = [prop]
 752         if name is None:
 753             name = 'OpenGraph %s' % prop[0]
 754         og_regexes = []
 755         for p in prop:
 756             og_regexes.extend(self._og_regexes(p))
 757         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 758         if escaped is None:
 759             return None
 760         return unescapeHTML(escaped)
 761
 762     def _og_search_thumbnail(self, html, **kargs):
 763         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 764
 765     def _og_search_description(self, html, **kargs):
 766         return self._og_search_property('description', html, fatal=False, **kargs)
 767
 768     def _og_search_title(self, html, **kargs):
 769         return self._og_search_property('title', html, **kargs)
 770
 771     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 772         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 773         if secure:
 774             regexes = self._og_regexes('video:secure_url') + regexes
 775         return self._html_search_regex(regexes, html, name, **kargs)
 776
 777     def _og_search_url(self, html, **kargs):
 778         return self._og_search_property('url', html, **kargs)
 779
 780     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 781         if not isinstance(name, (list, tuple)):
 782             name = [name]
 783         if display_name is None:
 784             display_name = name[0]
 785         return self._html_search_regex(
 786             [self._meta_regex(n) for n in name],
 787             html, display_name, fatal=fatal, group='content', **kwargs)
 788
 789     def _dc_search_uploader(self, html):
 790         return self._html_search_meta('dc.creator', html, 'uploader')
 791
 792     def _rta_search(self, html):
 793         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 794         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 795                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 796                      html):
 797             return 18
 798         return 0
 799
 800     def _media_rating_search(self, html):
 801         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 802         rating = self._html_search_meta('rating', html)
 803
 804         if not rating:
 805             return None
 806
 807         RATING_TABLE = {
 808             'safe for kids': 0,
 809             'general': 8,
 810             '14 years': 14,
 811             'mature': 17,
 812             'restricted': 19,
 813         }
 814         return RATING_TABLE.get(rating.lower())
 815
 816     def _family_friendly_search(self, html):
 817         # See http://schema.org/VideoObject
 818         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 819
 820         if not family_friendly:
 821             return None
 822
 823         RATING_TABLE = {
 824             '1': 0,
 825             'true': 0,
 826             '0': 18,
 827             'false': 18,
 828         }
 829         return RATING_TABLE.get(family_friendly.lower())
 830
 831     def _twitter_search_player(self, html):
 832         return self._html_search_meta('twitter:player', html,
 833                                       'twitter card player')
 834
 835     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 836         json_ld = self._search_regex(
 837             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 838             html, 'JSON-LD', group='json_ld', **kwargs)
 839         default = kwargs.get('default', NO_DEFAULT)
 840         if not json_ld:
 841             return default if default is not NO_DEFAULT else {}
 842         # JSON-LD may be malformed and thus `fatal` should be respected.
 843         # At the same time `default` may be passed that assumes `fatal=False`
 844         # for _search_regex. Let's simulate the same behavior here as well.
 845         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 846         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 847
 848     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 849         if isinstance(json_ld, compat_str):
 850             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 851         if not json_ld:
 852             return {}
 853         info = {}
 854         if not isinstance(json_ld, (list, tuple, dict)):
 855             return info
 856         if isinstance(json_ld, dict):
 857             json_ld = [json_ld]
 858         for e in json_ld:
 859             if e.get('@context') == 'http://schema.org':
 860                 item_type = e.get('@type')
 861                 if expected_type is not None and expected_type != item_type:
 862                     return info
 863                 if item_type == 'TVEpisode':
 864                     info.update({
 865                         'episode': unescapeHTML(e.get('name')),
 866                         'episode_number': int_or_none(e.get('episodeNumber')),
 867                         'description': unescapeHTML(e.get('description')),
 868                     })
 869                     part_of_season = e.get('partOfSeason')
 870                     if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 871                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 872                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
 873                     if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 874                         info['series'] = unescapeHTML(part_of_series.get('name'))
 875                 elif item_type == 'Article':
 876                     info.update({
 877                         'timestamp': parse_iso8601(e.get('datePublished')),
 878                         'title': unescapeHTML(e.get('headline')),
 879                         'description': unescapeHTML(e.get('articleBody')),
 880                     })
 881                 elif item_type == 'VideoObject':
 882                     info.update({
 883                         'url': e.get('contentUrl'),
 884                         'title': unescapeHTML(e.get('name')),
 885                         'description': unescapeHTML(e.get('description')),
 886                         'thumbnail': e.get('thumbnailUrl'),
 887                         'duration': parse_duration(e.get('duration')),
 888                         'timestamp': unified_timestamp(e.get('uploadDate')),
 889                         'filesize': float_or_none(e.get('contentSize')),
 890                         'tbr': int_or_none(e.get('bitrate')),
 891                         'width': int_or_none(e.get('width')),
 892                         'height': int_or_none(e.get('height')),
 893                     })
 894                 break
 895         return dict((k, v) for k, v in info.items() if v is not None)
 896
 897     @staticmethod
 898     def _hidden_inputs(html):
 899         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 900         hidden_inputs = {}
 901         for input in re.findall(r'(?i)(<input[^>]+>)', html):
 902             attrs = extract_attributes(input)
 903             if not input:
 904                 continue
 905             if attrs.get('type') not in ('hidden', 'submit'):
 906                 continue
 907             name = attrs.get('name') or attrs.get('id')
 908             value = attrs.get('value')
 909             if name and value is not None:
 910                 hidden_inputs[name] = value
 911         return hidden_inputs
 912
 913     def _form_hidden_inputs(self, form_id, html):
 914         form = self._search_regex(
 915             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 916             html, '%s form' % form_id, group='form')
 917         return self._hidden_inputs(form)
 918
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort the list of format dicts in place, in ascending order of the
        sort key built below.

        formats is the list of format dicts; some of them may be amended
        ('tbr' derived from abr + vbr, 'ext' derived from the URL).
        field_preference is an optional list/tuple of format field names;
        when given, the sort key consists of just these fields, in order.

        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller supplied ordering: missing fields sort as -1
            # ('' for format_id, which is compared as a string)
            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Plain HTTP(S) sorts above other protocols, rtsp lowest
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            # The position of the extension in ORDER is its rank; extensions
            # that are not listed rank -1
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuple elements are compared left to right; missing numeric
            # fields count as -1
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 994
 995     def _check_formats(self, formats, video_id):
 996         if formats:
 997             formats[:] = filter(
 998                 lambda f: self._is_valid_url(
 999                     f['url'], video_id,
1000                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1001                 formats)
1002
1003     @staticmethod
1004     def _remove_duplicate_formats(formats):
1005         format_urls = set()
1006         unique_formats = []
1007         for f in formats:
1008             if f['url'] not in format_urls:
1009                 format_urls.add(f['url'])
1010                 unique_formats.append(f)
1011         formats[:] = unique_formats
1012
1013     def _is_valid_url(self, url, video_id, item='video'):
1014         url = self._proto_relative_url(url, scheme='http:')
1015         # For now assume non HTTP(S) URLs always valid
1016         if not (url.startswith('http://') or url.startswith('https://')):
1017             return True
1018         try:
1019             self._request_webpage(url, video_id, 'Checking %s URL' % item)
1020             return True
1021         except ExtractorError as e:
1022             if isinstance(e.cause, compat_urllib_error.URLError):
1023                 self.to_screen(
1024                     '%s: %s URL is invalid, skipping' % (video_id, item))
1025                 return False
1026             raise
1027
1028     def http_scheme(self):
1029         """ Either "http:" or "https:", depending on the user's preferences """
1030         return (
1031             'http:'
1032             if self._downloader.params.get('prefer_insecure', False)
1033             else 'https:')
1034
1035     def _proto_relative_url(self, url, scheme=None):
1036         if url is None:
1037             return url
1038         if url.startswith('//'):
1039             if scheme is None:
1040                 scheme = self.http_scheme()
1041             return scheme + url
1042         else:
1043             return url
1044
1045     def _sleep(self, timeout, video_id, msg_template=None):
1046         if msg_template is None:
1047             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1048         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1049         self.to_screen(msg)
1050         time.sleep(timeout)
1051
1052     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1053                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1054                              fatal=True, m3u8_id=None):
1055         manifest = self._download_xml(
1056             manifest_url, video_id, 'Downloading f4m manifest',
1057             'Unable to download f4m manifest',
1058             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1059             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1060             transform_source=transform_source,
1061             fatal=fatal)
1062
1063         if manifest is False:
1064             return []
1065
1066         return self._parse_f4m_formats(
1067             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1068             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1069
1070     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1071                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1072                            fatal=True, m3u8_id=None):
1073         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1074         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1075         if akamai_pv is not None and ';' in akamai_pv.text:
1076             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1077             if playerVerificationChallenge.strip() != '':
1078                 return []
1079
1080         formats = []
1081         manifest_version = '1.0'
1082         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1083         if not media_nodes:
1084             manifest_version = '2.0'
1085             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1086         # Remove unsupported DRM protected media from final formats
1087         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1088         media_nodes = remove_encrypted_media(media_nodes)
1089         if not media_nodes:
1090             return formats
1091         base_url = xpath_text(
1092             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1093             'base URL', default=None)
1094         if base_url:
1095             base_url = base_url.strip()
1096
1097         bootstrap_info = xpath_element(
1098             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1099             'bootstrap info', default=None)
1100
1101         for i, media_el in enumerate(media_nodes):
1102             tbr = int_or_none(media_el.attrib.get('bitrate'))
1103             width = int_or_none(media_el.attrib.get('width'))
1104             height = int_or_none(media_el.attrib.get('height'))
1105             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1106             # If <bootstrapInfo> is present, the specified f4m is a
1107             # stream-level manifest, and only set-level manifests may refer to
1108             # external resources.  See section 11.4 and section 4 of F4M spec
1109             if bootstrap_info is None:
1110                 media_url = None
1111                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1112                 if manifest_version == '2.0':
1113                     media_url = media_el.attrib.get('href')
1114                 if media_url is None:
1115                     media_url = media_el.attrib.get('url')
1116                 if not media_url:
1117                     continue
1118                 manifest_url = (
1119                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1120                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1121                 # If media_url is itself a f4m manifest do the recursive extraction
1122                 # since bitrates in parent manifest (this one) and media_url manifest
1123                 # may differ leading to inability to resolve the format by requested
1124                 # bitrate in f4m downloader
1125                 ext = determine_ext(manifest_url)
1126                 if ext == 'f4m':
1127                     f4m_formats = self._extract_f4m_formats(
1128                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1129                         transform_source=transform_source, fatal=fatal)
1130                     # Sometimes stream-level manifest contains single media entry that
1131                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1132                     # At the same time parent's media entry in set-level manifest may
1133                     # contain it. We will copy it from parent in such cases.
1134                     if len(f4m_formats) == 1:
1135                         f = f4m_formats[0]
1136                         f.update({
1137                             'tbr': f.get('tbr') or tbr,
1138                             'width': f.get('width') or width,
1139                             'height': f.get('height') or height,
1140                             'format_id': f.get('format_id') if not tbr else format_id,
1141                         })
1142                     formats.extend(f4m_formats)
1143                     continue
1144                 elif ext == 'm3u8':
1145                     formats.extend(self._extract_m3u8_formats(
1146                         manifest_url, video_id, 'mp4', preference=preference,
1147                         m3u8_id=m3u8_id, fatal=fatal))
1148                     continue
1149             formats.append({
1150                 'format_id': format_id,
1151                 'url': manifest_url,
1152                 'ext': 'flv' if bootstrap_info is not None else None,
1153                 'tbr': tbr,
1154                 'width': width,
1155                 'height': height,
1156                 'preference': preference,
1157             })
1158         return formats
1159
1160     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1161         return {
1162             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1163             'url': m3u8_url,
1164             'ext': ext,
1165             'protocol': 'm3u8',
1166             'preference': preference - 100 if preference else -100,
1167             'resolution': 'multiple',
1168             'format_note': 'Quality selection URL',
1169         }
1170
1171     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1172                               entry_protocol='m3u8', preference=None,
1173                               m3u8_id=None, note=None, errnote=None,
1174                               fatal=True, live=False):
1175
1176         res = self._download_webpage_handle(
1177             m3u8_url, video_id,
1178             note=note or 'Downloading m3u8 information',
1179             errnote=errnote or 'Failed to download m3u8 information',
1180             fatal=fatal)
1181         if res is False:
1182             return []
1183         m3u8_doc, urlh = res
1184         m3u8_url = urlh.geturl()
1185
1186         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1187
1188         format_url = lambda u: (
1189             u
1190             if re.match(r'^https?://', u)
1191             else compat_urlparse.urljoin(m3u8_url, u))
1192
1193         # We should try extracting formats only from master playlists [1], i.e.
1194         # playlists that describe available qualities. On the other hand media
1195         # playlists [2] should be returned as is since they contain just the media
1196         # without qualities renditions.
1197         # Fortunately, master playlist can be easily distinguished from media
1198         # playlist based on particular tags availability. As of [1, 2] master
1199         # playlist tags MUST NOT appear in a media playist and vice versa.
1200         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1201         # and MUST NOT appear in master playlist thus we can clearly detect media
1202         # playlist with this criterion.
1203         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1204         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1205         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1206         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1207             return [{
1208                 'url': m3u8_url,
1209                 'format_id': m3u8_id,
1210                 'ext': ext,
1211                 'protocol': entry_protocol,
1212                 'preference': preference,
1213             }]
1214         last_info = {}
1215         last_media = {}
1216         for line in m3u8_doc.splitlines():
1217             if line.startswith('#EXT-X-STREAM-INF:'):
1218                 last_info = parse_m3u8_attributes(line)
1219             elif line.startswith('#EXT-X-MEDIA:'):
1220                 media = parse_m3u8_attributes(line)
1221                 media_type = media.get('TYPE')
1222                 if media_type in ('VIDEO', 'AUDIO'):
1223                     media_url = media.get('URI')
1224                     if media_url:
1225                         format_id = []
1226                         for v in (media.get('GROUP-ID'), media.get('NAME')):
1227                             if v:
1228                                 format_id.append(v)
1229                         formats.append({
1230                             'format_id': '-'.join(format_id),
1231                             'url': format_url(media_url),
1232                             'language': media.get('LANGUAGE'),
1233                             'vcodec': 'none' if media_type == 'AUDIO' else None,
1234                             'ext': ext,
1235                             'protocol': entry_protocol,
1236                             'preference': preference,
1237                         })
1238                     else:
1239                         # When there is no URI in EXT-X-MEDIA let this tag's
1240                         # data be used by regular URI lines below
1241                         last_media = media
1242             elif line.startswith('#') or not line.strip():
1243                 continue
1244             else:
1245                 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1246                 format_id = []
1247                 if m3u8_id:
1248                     format_id.append(m3u8_id)
1249                 # Despite specification does not mention NAME attribute for
1250                 # EXT-X-STREAM-INF it still sometimes may be present
1251                 stream_name = last_info.get('NAME') or last_media.get('NAME')
1252                 # Bandwidth of live streams may differ over time thus making
1253                 # format_id unpredictable. So it's better to keep provided
1254                 # format_id intact.
1255                 if not live:
1256                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1257                 f = {
1258                     'format_id': '-'.join(format_id),
1259                     'url': format_url(line.strip()),
1260                     'tbr': tbr,
1261                     'ext': ext,
1262                     'fps': float_or_none(last_info.get('FRAME-RATE')),
1263                     'protocol': entry_protocol,
1264                     'preference': preference,
1265                 }
1266                 resolution = last_info.get('RESOLUTION')
1267                 if resolution:
1268                     width_str, height_str = resolution.split('x')
1269                     f['width'] = int(width_str)
1270                     f['height'] = int(height_str)
1271                 # Unified Streaming Platform
1272                 mobj = re.search(
1273                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1274                 if mobj:
1275                     abr, vbr = mobj.groups()
1276                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1277                     f.update({
1278                         'vbr': vbr,
1279                         'abr': abr,
1280                     })
1281                 f.update(parse_codecs(last_info.get('CODECS')))
1282                 formats.append(f)
1283                 last_info = {}
1284                 last_media = {}
1285         return formats
1286
1287     @staticmethod
1288     def _xpath_ns(path, namespace=None):
1289         if not namespace:
1290             return path
1291         out = []
1292         for c in path.split('/'):
1293             if not c or c == '.':
1294                 out.append(c)
1295             else:
1296                 out.append('{%s}%s' % (namespace, c))
1297         return '/'.join(out)
1298
1299     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1300         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1301
1302         if smil is False:
1303             assert not fatal
1304             return []
1305
1306         namespace = self._parse_smil_namespace(smil)
1307
1308         return self._parse_smil_formats(
1309             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1310
1311     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1312         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1313         if smil is False:
1314             return {}
1315         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1316
1317     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1318         return self._download_xml(
1319             smil_url, video_id, 'Downloading SMIL file',
1320             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1321
1322     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1323         namespace = self._parse_smil_namespace(smil)
1324
1325         formats = self._parse_smil_formats(
1326             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1327         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1328
1329         video_id = os.path.splitext(url_basename(smil_url))[0]
1330         title = None
1331         description = None
1332         upload_date = None
1333         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1334             name = meta.attrib.get('name')
1335             content = meta.attrib.get('content')
1336             if not name or not content:
1337                 continue
1338             if not title and name == 'title':
1339                 title = content
1340             elif not description and name in ('description', 'abstract'):
1341                 description = content
1342             elif not upload_date and name == 'date':
1343                 upload_date = unified_strdate(content)
1344
1345         thumbnails = [{
1346             'id': image.get('type'),
1347             'url': image.get('src'),
1348             'width': int_or_none(image.get('width')),
1349             'height': int_or_none(image.get('height')),
1350         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1351
1352         return {
1353             'id': video_id,
1354             'title': title or video_id,
1355             'description': description,
1356             'upload_date': upload_date,
1357             'thumbnails': thumbnails,
1358             'formats': formats,
1359             'subtitles': subtitles,
1360         }
1361
1362     def _parse_smil_namespace(self, smil):
1363         return self._search_regex(
1364             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1365
1366     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1367         base = smil_url
1368         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1369             b = meta.get('base') or meta.get('httpBase')
1370             if b:
1371                 base = b
1372                 break
1373
1374         formats = []
1375         rtmp_count = 0
1376         http_count = 0
1377         m3u8_count = 0
1378
1379         srcs = []
1380         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1381         for medium in media:
1382             src = medium.get('src')
1383             if not src or src in srcs:
1384                 continue
1385             srcs.append(src)
1386
1387             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1388             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1389             width = int_or_none(medium.get('width'))
1390             height = int_or_none(medium.get('height'))
1391             proto = medium.get('proto')
1392             ext = medium.get('ext')
1393             src_ext = determine_ext(src)
1394             streamer = medium.get('streamer') or base
1395
1396             if proto == 'rtmp' or streamer.startswith('rtmp'):
1397                 rtmp_count += 1
1398                 formats.append({
1399                     'url': streamer,
1400                     'play_path': src,
1401                     'ext': 'flv',
1402                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1403                     'tbr': bitrate,
1404                     'filesize': filesize,
1405                     'width': width,
1406                     'height': height,
1407                 })
1408                 if transform_rtmp_url:
1409                     streamer, src = transform_rtmp_url(streamer, src)
1410                     formats[-1].update({
1411                         'url': streamer,
1412                         'play_path': src,
1413                     })
1414                 continue
1415
1416             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1417             src_url = src_url.strip()
1418
1419             if proto == 'm3u8' or src_ext == 'm3u8':
1420                 m3u8_formats = self._extract_m3u8_formats(
1421                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1422                 if len(m3u8_formats) == 1:
1423                     m3u8_count += 1
1424                     m3u8_formats[0].update({
1425                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1426                         'tbr': bitrate,
1427                         'width': width,
1428                         'height': height,
1429                     })
1430                 formats.extend(m3u8_formats)
1431                 continue
1432
1433             if src_ext == 'f4m':
1434                 f4m_url = src_url
1435                 if not f4m_params:
1436                     f4m_params = {
1437                         'hdcore': '3.2.0',
1438                         'plugin': 'flowplayer-3.2.0.1',
1439                     }
1440                 f4m_url += '&' if '?' in f4m_url else '?'
1441                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1442                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1443                 continue
1444
1445             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1446                 http_count += 1
1447                 formats.append({
1448                     'url': src_url,
1449                     'ext': ext or src_ext or 'flv',
1450                     'format_id': 'http-%d' % (bitrate or http_count),
1451                     'tbr': bitrate,
1452                     'filesize': filesize,
1453                     'width': width,
1454                     'height': height,
1455                 })
1456                 continue
1457
1458         return formats
1459
1460     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1461         urls = []
1462         subtitles = {}
1463         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1464             src = textstream.get('src')
1465             if not src or src in urls:
1466                 continue
1467             urls.append(src)
1468             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1469             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1470             subtitles.setdefault(lang, []).append({
1471                 'url': src,
1472                 'ext': ext,
1473             })
1474         return subtitles
1475
1476     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1477         xspf = self._download_xml(
1478             playlist_url, playlist_id, 'Downloading xpsf playlist',
1479             'Unable to download xspf manifest', fatal=fatal)
1480         if xspf is False:
1481             return []
1482         return self._parse_xspf(xspf, playlist_id)
1483
1484     def _parse_xspf(self, playlist, playlist_id):
1485         NS_MAP = {
1486             'xspf': 'http://xspf.org/ns/0/',
1487             's1': 'http://static.streamone.nl/player/ns/0',
1488         }
1489
1490         entries = []
1491         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1492             title = xpath_text(
1493                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1494             description = xpath_text(
1495                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1496             thumbnail = xpath_text(
1497                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1498             duration = float_or_none(
1499                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1500
1501             formats = [{
1502                 'url': location.text,
1503                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1504                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1505                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1506             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1507             self._sort_formats(formats)
1508
1509             entries.append({
1510                 'id': playlist_id,
1511                 'title': title,
1512                 'description': description,
1513                 'thumbnail': thumbnail,
1514                 'duration': duration,
1515                 'formats': formats,
1516             })
1517         return entries
1518
1519     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1520         res = self._download_webpage_handle(
1521             mpd_url, video_id,
1522             note=note or 'Downloading MPD manifest',
1523             errnote=errnote or 'Failed to download MPD manifest',
1524             fatal=fatal)
1525         if res is False:
1526             return []
1527         mpd, urlh = res
1528         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1529
1530         return self._parse_mpd_formats(
1531             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1532
1533     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1534         """
1535         Parse formats from MPD manifest.
1536         References:
1537          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1538             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1539          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1540         """
1541         if mpd_doc.get('type') == 'dynamic':
1542             return []
1543
1544         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1545
1546         def _add_ns(path):
1547             return self._xpath_ns(path, namespace)
1548
1549         def is_drm_protected(element):
1550             return element.find(_add_ns('ContentProtection')) is not None
1551
1552         def extract_multisegment_info(element, ms_parent_info):
1553             ms_info = ms_parent_info.copy()
1554             segment_list = element.find(_add_ns('SegmentList'))
1555             if segment_list is not None:
1556                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1557                 if segment_urls_e:
1558                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1559                 initialization = segment_list.find(_add_ns('Initialization'))
1560                 if initialization is not None:
1561                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1562             else:
1563                 segment_template = element.find(_add_ns('SegmentTemplate'))
1564                 if segment_template is not None:
1565                     start_number = segment_template.get('startNumber')
1566                     if start_number:
1567                         ms_info['start_number'] = int(start_number)
1568                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1569                     if segment_timeline is not None:
1570                         s_e = segment_timeline.findall(_add_ns('S'))
1571                         if s_e:
1572                             ms_info['total_number'] = 0
1573                             ms_info['s'] = []
1574                             for s in s_e:
1575                                 r = int(s.get('r', 0))
1576                                 ms_info['total_number'] += 1 + r
1577                                 ms_info['s'].append({
1578                                     't': int(s.get('t', 0)),
1579                                     # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1580                                     'd': int(s.attrib['d']),
1581                                     'r': r,
1582                                 })
1583                     else:
1584                         timescale = segment_template.get('timescale')
1585                         if timescale:
1586                             ms_info['timescale'] = int(timescale)
1587                         segment_duration = segment_template.get('duration')
1588                         if segment_duration:
1589                             ms_info['segment_duration'] = int(segment_duration)
1590                     media_template = segment_template.get('media')
1591                     if media_template:
1592                         ms_info['media_template'] = media_template
1593                     initialization = segment_template.get('initialization')
1594                     if initialization:
1595                         ms_info['initialization_url'] = initialization
1596                     else:
1597                         initialization = segment_template.find(_add_ns('Initialization'))
1598                         if initialization is not None:
1599                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1600             return ms_info
1601
1602         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1603         formats = []
1604         for period in mpd_doc.findall(_add_ns('Period')):
1605             period_duration = parse_duration(period.get('duration')) or mpd_duration
1606             period_ms_info = extract_multisegment_info(period, {
1607                 'start_number': 1,
1608                 'timescale': 1,
1609             })
1610             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1611                 if is_drm_protected(adaptation_set):
1612                     continue
1613                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1614                 for representation in adaptation_set.findall(_add_ns('Representation')):
1615                     if is_drm_protected(representation):
1616                         continue
1617                     representation_attrib = adaptation_set.attrib.copy()
1618                     representation_attrib.update(representation.attrib)
1619                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1620                     mime_type = representation_attrib['mimeType']
1621                     content_type = mime_type.split('/')[0]
1622                     if content_type == 'text':
1623                         # TODO implement WebVTT downloading
1624                         pass
1625                     elif content_type == 'video' or content_type == 'audio':
1626                         base_url = ''
1627                         for element in (representation, adaptation_set, period, mpd_doc):
1628                             base_url_e = element.find(_add_ns('BaseURL'))
1629                             if base_url_e is not None:
1630                                 base_url = base_url_e.text + base_url
1631                                 if re.match(r'^https?://', base_url):
1632                                     break
1633                         if mpd_base_url and not re.match(r'^https?://', base_url):
1634                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1635                                 mpd_base_url += '/'
1636                             base_url = mpd_base_url + base_url
1637                         representation_id = representation_attrib.get('id')
1638                         lang = representation_attrib.get('lang')
1639                         url_el = representation.find(_add_ns('BaseURL'))
1640                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1641                         f = {
1642                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1643                             'url': base_url,
1644                             'ext': mimetype2ext(mime_type),
1645                             'width': int_or_none(representation_attrib.get('width')),
1646                             'height': int_or_none(representation_attrib.get('height')),
1647                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1648                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1649                             'fps': int_or_none(representation_attrib.get('frameRate')),
1650                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1651                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1652                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1653                             'format_note': 'DASH %s' % content_type,
1654                             'filesize': filesize,
1655                         }
1656                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1657                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1658                             if 'total_number' not in representation_ms_info and 'segment_duration':
1659                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1660                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1661                             media_template = representation_ms_info['media_template']
1662                             media_template = media_template.replace('$RepresentationID$', representation_id)
1663                             media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
1664                             media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
1665                             media_template.replace('$$', '$')
1666
1667                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1668                             # can't be used at the same time
1669                             if '%(Number' in media_template:
1670                                 representation_ms_info['segment_urls'] = [
1671                                     media_template % {
1672                                         'Number': segment_number,
1673                                         'Bandwidth': representation_attrib.get('bandwidth'),
1674                                     }
1675                                     for segment_number in range(
1676                                         representation_ms_info['start_number'],
1677                                         representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1678                             else:
1679                                 representation_ms_info['segment_urls'] = []
1680                                 segment_time = 0
1681
1682                                 def add_segment_url():
1683                                     representation_ms_info['segment_urls'].append(
1684                                         media_template % {
1685                                             'Time': segment_time,
1686                                             'Bandwidth': representation_attrib.get('bandwidth'),
1687                                         }
1688                                     )
1689
1690                                 for num, s in enumerate(representation_ms_info['s']):
1691                                     segment_time = s.get('t') or segment_time
1692                                     add_segment_url()
1693                                     for r in range(s.get('r', 0)):
1694                                         segment_time += s['d']
1695                                         add_segment_url()
1696                                     segment_time += s['d']
1697                         if 'segment_urls' in representation_ms_info:
1698                             f.update({
1699                                 'segment_urls': representation_ms_info['segment_urls'],
1700                                 'protocol': 'http_dash_segments',
1701                             })
1702                             if 'initialization_url' in representation_ms_info:
1703                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1704                                 f.update({
1705                                     'initialization_url': initialization_url,
1706                                 })
1707                                 if not f.get('url'):
1708                                     f['url'] = initialization_url
1709                         try:
1710                             existing_format = next(
1711                                 fo for fo in formats
1712                                 if fo['format_id'] == representation_id)
1713                         except StopIteration:
1714                             full_info = formats_dict.get(representation_id, {}).copy()
1715                             full_info.update(f)
1716                             formats.append(full_info)
1717                         else:
1718                             existing_format.update(f)
1719                     else:
1720                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1721         return formats
1722
1723     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'):
1724         def absolute_url(video_url):
1725             return compat_urlparse.urljoin(base_url, video_url)
1726
1727         def parse_content_type(content_type):
1728             if not content_type:
1729                 return {}
1730             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
1731             if ctr:
1732                 mimetype, codecs = ctr.groups()
1733                 f = parse_codecs(codecs)
1734                 f['ext'] = mimetype2ext(mimetype)
1735                 return f
1736             return {}
1737
1738         def _media_formats(src, cur_media_type):
1739             full_url = absolute_url(src)
1740             if determine_ext(full_url) == 'm3u8':
1741                 is_plain_url = False
1742                 formats = self._extract_m3u8_formats(
1743                     full_url, video_id, ext='mp4',
1744                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
1745             else:
1746                 is_plain_url = True
1747                 formats = [{
1748                     'url': full_url,
1749                     'vcodec': 'none' if cur_media_type == 'audio' else None,
1750                 }]
1751             return is_plain_url, formats
1752
1753         entries = []
1754         for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
1755             media_info = {
1756                 'formats': [],
1757                 'subtitles': {},
1758             }
1759             media_attributes = extract_attributes(media_tag)
1760             src = media_attributes.get('src')
1761             if src:
1762                 _, formats = _media_formats(src, media_type)
1763                 media_info['formats'].extend(formats)
1764             media_info['thumbnail'] = media_attributes.get('poster')
1765             if media_content:
1766                 for source_tag in re.findall(r'<source[^>]+>', media_content):
1767                     source_attributes = extract_attributes(source_tag)
1768                     src = source_attributes.get('src')
1769                     if not src:
1770                         continue
1771                     is_plain_url, formats = _media_formats(src, media_type)
1772                     if is_plain_url:
1773                         f = parse_content_type(source_attributes.get('type'))
1774                         f.update(formats[0])
1775                         media_info['formats'].append(f)
1776                     else:
1777                         media_info['formats'].extend(formats)
1778                 for track_tag in re.findall(r'<track[^>]+>', media_content):
1779                     track_attributes = extract_attributes(track_tag)
1780                     kind = track_attributes.get('kind')
1781                     if not kind or kind == 'subtitles':
1782                         src = track_attributes.get('src')
1783                         if not src:
1784                             continue
1785                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
1786                         media_info['subtitles'].setdefault(lang, []).append({
1787                             'url': absolute_url(src),
1788                         })
1789             if media_info['formats']:
1790                 entries.append(media_info)
1791         return entries
1792
1793     def _extract_akamai_formats(self, manifest_url, video_id):
1794         formats = []
1795         f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
1796         formats.extend(self._extract_f4m_formats(
1797             update_url_query(f4m_url, {'hdcore': '3.7.0'}),
1798             video_id, f4m_id='hds', fatal=False))
1799         m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
1800         formats.extend(self._extract_m3u8_formats(
1801             m3u8_url, video_id, 'mp4', 'm3u8_native',
1802             m3u8_id='hls', fatal=False))
1803         return formats
1804
1805     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
1806         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
1807         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
1808         http_base_url = 'http' + url_base
1809         formats = []
1810         if 'm3u8' not in skip_protocols:
1811             formats.extend(self._extract_m3u8_formats(
1812                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
1813                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
1814         if 'f4m' not in skip_protocols:
1815             formats.extend(self._extract_f4m_formats(
1816                 http_base_url + '/manifest.f4m',
1817                 video_id, f4m_id='hds', fatal=False))
1818         if re.search(r'(?:/smil:|\.smil)', url_base):
1819             if 'dash' not in skip_protocols:
1820                 formats.extend(self._extract_mpd_formats(
1821                     http_base_url + '/manifest.mpd',
1822                     video_id, mpd_id='dash', fatal=False))
1823             if 'smil' not in skip_protocols:
1824                 rtmp_formats = self._extract_smil_formats(
1825                     http_base_url + '/jwplayer.smil',
1826                     video_id, fatal=False)
1827                 for rtmp_format in rtmp_formats:
1828                     rtsp_format = rtmp_format.copy()
1829                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
1830                     del rtsp_format['play_path']
1831                     del rtsp_format['ext']
1832                     rtsp_format.update({
1833                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
1834                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
1835                         'protocol': 'rtsp',
1836                     })
1837                     formats.extend([rtmp_format, rtsp_format])
1838         else:
1839             for protocol in ('rtmp', 'rtsp'):
1840                 if protocol not in skip_protocols:
1841                     formats.append({
1842                         'url': protocol + url_base,
1843                         'format_id': protocol,
1844                         'protocol': protocol,
1845                     })
1846         return formats
1847
1848     def _live_title(self, name):
1849         """ Generate the title for a live video """
1850         now = datetime.datetime.now()
1851         now_str = now.strftime('%Y-%m-%d %H:%M')
1852         return name + ' ' + now_str
1853
1854     def _int(self, v, name, fatal=False, **kwargs):
1855         res = int_or_none(v, **kwargs)
1856         if 'get_attr' in kwargs:
1857             print(getattr(v, kwargs['get_attr']))
1858         if res is None:
1859             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1860             if fatal:
1861                 raise ExtractorError(msg)
1862             else:
1863                 self._downloader.report_warning(msg)
1864         return res
1865
1866     def _float(self, v, name, fatal=False, **kwargs):
1867         res = float_or_none(v, **kwargs)
1868         if res is None:
1869             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1870             if fatal:
1871                 raise ExtractorError(msg)
1872             else:
1873                 self._downloader.report_warning(msg)
1874         return res
1875
1876     def _set_cookie(self, domain, name, value, expire_time=None):
1877         cookie = compat_cookiejar.Cookie(
1878             0, name, value, None, None, domain, None,
1879             None, '/', True, False, expire_time, '', None, None, None)
1880         self._downloader.cookiejar.set_cookie(cookie)
1881
1882     def _get_cookies(self, url):
1883         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1884         req = sanitized_Request(url)
1885         self._downloader.cookiejar.add_cookie_header(req)
1886         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1887
1888     def get_testcases(self, include_onlymatching=False):
1889         t = getattr(self, '_TEST', None)
1890         if t:
1891             assert not hasattr(self, '_TESTS'), \
1892                 '%s has _TEST and _TESTS' % type(self).__name__
1893             tests = [t]
1894         else:
1895             tests = getattr(self, '_TESTS', [])
1896         for t in tests:
1897             if not include_onlymatching and t.get('only_matching', False):
1898                 continue
1899             t['name'] = type(self).__name__[:-len('IE')]
1900             yield t
1901
1902     def is_suitable(self, age_limit):
1903         """ Test whether the extractor is generally suitable for the given
1904         age limit (i.e. pornographic sites are not, all others usually are) """
1905
1906         any_restricted = False
1907         for tc in self.get_testcases(include_onlymatching=False):
1908             if tc.get('playlist', []):
1909                 tc = tc['playlist'][0]
1910             is_restricted = age_restricted(
1911                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1912             if not is_restricted:
1913                 return True
1914             any_restricted = any_restricted or is_restricted
1915         return not any_restricted
1916
1917     def extract_subtitles(self, *args, **kwargs):
1918         if (self._downloader.params.get('writesubtitles', False) or
1919                 self._downloader.params.get('listsubtitles')):
1920             return self._get_subtitles(*args, **kwargs)
1921         return {}
1922
1923     def _get_subtitles(self, *args, **kwargs):
1924         raise NotImplementedError('This method must be implemented by subclasses')
1925
1926     @staticmethod
1927     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1928         """ Merge subtitle items for one language. Items with duplicated URLs
1929         will be dropped. """
1930         list1_urls = set([item['url'] for item in subtitle_list1])
1931         ret = list(subtitle_list1)
1932         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1933         return ret
1934
1935     @classmethod
1936     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1937         """ Merge two subtitle dictionaries, language by language. """
1938         ret = dict(subtitle_dict1)
1939         for lang in subtitle_dict2:
1940             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1941         return ret
1942
1943     def extract_automatic_captions(self, *args, **kwargs):
1944         if (self._downloader.params.get('writeautomaticsub', False) or
1945                 self._downloader.params.get('listsubtitles')):
1946             return self._get_automatic_captions(*args, **kwargs)
1947         return {}
1948
1949     def _get_automatic_captions(self, *args, **kwargs):
1950         raise NotImplementedError('This method must be implemented by subclasses')
1951
1952     def mark_watched(self, *args, **kwargs):
1953         if (self._downloader.params.get('mark_watched', False) and
1954                 (self._get_login_info()[0] is not None or
1955                     self._downloader.params.get('cookiefile') is not None)):
1956             self._mark_watched(*args, **kwargs)
1957
1958     def _mark_watched(self, *args, **kwargs):
1959         raise NotImplementedError('This method must be implemented by subclasses')
1960
1961     def geo_verification_headers(self):
1962         headers = {}
1963         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
1964         if geo_verification_proxy:
1965             headers['Ytdl-request-proxy'] = geo_verification_proxy
1966         return headers
1967
1968
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that run paged search queries.
    Accepted "URLs" look like _SEARCH_KEY(|all|N):{query}, where N is a
    positive integer written without leading zeros.
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source that matches this extractor's search queries"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query addressed to this extractor"""
        return bool(re.match(cls._make_valid_url(), url))

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS, and a
        number for that many results (capped at _MAX_RESULTS with a warning).
        Raises ExtractorError when the query does not match or the number is
        not positive.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        count = int(prefix)
        if count <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (count, query))
        if count > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, count))
            count = self._MAX_RESULTS
        return self._get_n_results(query, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        reason = 'This method must be implemented by subclasses'
        raise NotImplementedError(reason)

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY"""
        return self._SEARCH_KEY