_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse_urlencode,
  25     compat_urllib_request,
  26     compat_urlparse,
  27 )
  28 from ..downloader.f4m import remove_encrypted_media
  29 from ..utils import (
  30     NO_DEFAULT,
  31     age_restricted,
  32     bug_reports_message,
  33     clean_html,
  34     compiled_regex_type,
  35     determine_ext,
  36     error_to_compat_str,
  37     ExtractorError,
  38     fix_xml_ampersands,
  39     float_or_none,
  40     int_or_none,
  41     parse_iso8601,
  42     RegexNotFoundError,
  43     sanitize_filename,
  44     sanitized_Request,
  45     unescapeHTML,
  46     unified_strdate,
  47     unified_timestamp,
  48     url_basename,
  49     xpath_element,
  50     xpath_text,
  51     xpath_with_ns,
  52     determine_protocol,
  53     parse_duration,
  54     mimetype2ext,
  55     update_Request,
  56     update_url_query,
  57     parse_m3u8_attributes,
  58     extract_attributes,
  59     parse_codecs,
  60 )
  61
  62
  63 class InfoExtractor(object):
  64     """Information Extractor class.
  65
  66     Information extractors are the classes that, given a URL, extract
  67     information about the video (or videos) the URL refers to. This
  68     information includes the real video URL, the video title, author and
  69     others. The information is stored in a dictionary which is then
  70     passed to the YoutubeDL. The YoutubeDL processes this
  71     information possibly downloading the video to the file system, among
  72     other possible outcomes.
  73
  74     The type field determines the type of the result.
  75     By far the most common value (and the default if _type is missing) is
  76     "video", which indicates a single video.
  77
  78     For a video, the dictionaries must include the following fields:
  79
  80     id:             Video identifier.
  81     title:          Video title, unescaped.
  82
  83     Additionally, it must contain either a formats entry or a url one:
  84
  85     formats:        A list of dictionaries for each format available, ordered
  86                     from worst to best quality.
  87
  88                     Potential fields:
  89                     * url        Mandatory. The URL of the video file
  90                     * ext        Will be calculated from URL if missing
  91                     * format     A human-readable description of the format
  92                                  ("mp4 container with h264/opus").
  93                                  Calculated from the format_id, width, height.
  94                                  and format_note fields if missing.
  95                     * format_id  A short description of the format
  96                                  ("mp4_h264_opus" or "19").
  97                                 Technically optional, but strongly recommended.
  98                     * format_note Additional info about the format
  99                                  ("3D" or "DASH video")
 100                     * width      Width of the video, if known
 101                     * height     Height of the video, if known
 102                     * resolution Textual description of width and height
 103                     * tbr        Average bitrate of audio and video in KBit/s
 104                     * abr        Average audio bitrate in KBit/s
 105                     * acodec     Name of the audio codec in use
 106                     * asr        Audio sampling rate in Hertz
 107                     * vbr        Average video bitrate in KBit/s
 108                     * fps        Frame rate
 109                     * vcodec     Name of the video codec in use
 110                     * container  Name of the container format
 111                     * filesize   The number of bytes, if known in advance
 112                     * filesize_approx  An estimate for the number of bytes
 113                     * player_url SWF Player URL (used for rtmpdump).
 114                     * protocol   The protocol that will be used for the actual
 115                                  download, lower-case.
 116                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 117                                  "m3u8", "m3u8_native" or "http_dash_segments".
 118                     * preference Order number of this format. If this field is
 119                                  present and not None, the formats get sorted
 120                                  by this field, regardless of all other values.
 121                                  -1 for default (order by other properties),
 122                                  -2 or smaller for less than default.
 123                                  < -1000 to hide the format (if there is
 124                                     another one which is strictly better)
 125                     * language   Language code, e.g. "de" or "en-US".
 126                     * language_preference  Is this in the language mentioned in
 127                                  the URL?
 128                                  10 if it's what the URL is about,
 129                                  -1 for default (don't know),
 130                                  -10 otherwise, other values reserved for now.
 131                     * quality    Order number of the video quality of this
 132                                  format, irrespective of the file format.
 133                                  -1 for default (order by other properties),
 134                                  -2 or smaller for less than default.
 135                     * source_preference  Order number for this video source
 136                                   (quality takes higher priority)
 137                                  -1 for default (order by other properties),
 138                                  -2 or smaller for less than default.
 139                     * http_headers  A dictionary of additional HTTP headers
 140                                  to add to the request.
 141                     * stretched_ratio  If given and not 1, indicates that the
 142                                  video's pixels are not square.
 143                                  width : height ratio as float.
 144                     * no_resume  The server does not support resuming the
 145                                  (HTTP or RTMP) download. Boolean.
 146
 147     url:            Final video URL.
 148     ext:            Video filename extension.
 149     format:         The video format, defaults to ext (used for --get-format)
 150     player_url:     SWF Player URL (used for rtmpdump).
 151
 152     The following fields are optional:
 153
 154     alt_title:      A secondary title of the video.
 155     display_id      An alternative identifier for the video, not necessarily
 156                     unique, but available before title. Typically, id is
 157                     something like "4234987", title "Dancing naked mole rats",
 158                     and display_id "dancing-naked-mole-rats"
 159     thumbnails:     A list of dictionaries, with the following entries:
 160                         * "id" (optional, string) - Thumbnail format ID
 161                         * "url"
 162                         * "preference" (optional, int) - quality of the image
 163                         * "width" (optional, int)
 164                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
 166                                         deprecated)
 167                         * "filesize" (optional, int)
 168     thumbnail:      Full URL to a video thumbnail image.
 169     description:    Full video description.
 170     uploader:       Full name of the video uploader.
 171     license:        License name the video is licensed under.
 172     creator:        The creator of the video.
 173     release_date:   The date (YYYYMMDD) when the video was released.
 174     timestamp:      UNIX timestamp of the moment the video became available.
 175     upload_date:    Video upload date (YYYYMMDD).
 176                     If not explicitly set, calculated from timestamp.
 177     uploader_id:    Nickname or id of the video uploader.
 178     uploader_url:   Full URL to a personal webpage of the video uploader.
 179     location:       Physical location where the video was filmed.
 180     subtitles:      The available subtitles as a dictionary in the format
 181                     {language: subformats}. "subformats" is a list sorted from
 182                     lower to higher preference, each element is a dictionary
 183                     with the "ext" entry and one of:
 184                         * "data": The subtitles file contents
 185                         * "url": A URL pointing to the subtitles file
 186                     "ext" will be calculated from URL if missing
 187     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 188                     automatically generated captions
 189     duration:       Length of the video in seconds, as an integer or float.
 190     view_count:     How many users have watched the video on the platform.
 191     like_count:     Number of positive ratings of the video
 192     dislike_count:  Number of negative ratings of the video
 193     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
 195     comment_count:  Number of comments on the video
 196     comments:       A list of comments, each with one or more of the following
 197                     properties (all but one of text or html optional):
 198                         * "author" - human-readable name of the comment author
 199                         * "author_id" - user ID of the comment author
 200                         * "id" - Comment ID
 201                         * "html" - Comment as HTML
 202                         * "text" - Plain text of the comment
 203                         * "timestamp" - UNIX timestamp of comment
 204                         * "parent" - ID of the comment this one is replying to.
 205                                      Set to "root" to indicate that this is a
 206                                      comment to the original video.
 207     age_limit:      Age restriction for the video, as an integer (years)
 208     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 209                     should allow to get the same result again. (It will be set
 210                     by YoutubeDL if it's missing)
 211     categories:     A list of categories that the video falls in, for example
 212                     ["Sports", "Berlin"]
 213     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 214     is_live:        True, False, or None (=unknown). Whether this video is a
 215                     live stream that goes on instead of a fixed-length video.
 216     start_time:     Time in seconds where the reproduction should start, as
 217                     specified in the URL.
 218     end_time:       Time in seconds where the reproduction should end, as
 219                     specified in the URL.
 220
 221     The following fields should only be used when the video belongs to some logical
 222     chapter or section:
 223
 224     chapter:        Name or title of the chapter the video belongs to.
 225     chapter_number: Number of the chapter the video belongs to, as an integer.
 226     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 227
 228     The following fields should only be used when the video is an episode of some
 229     series or programme:
 230
 231     series:         Title of the series or programme the video episode belongs to.
 232     season:         Title of the season the video episode belongs to.
 233     season_number:  Number of the season the video episode belongs to, as an integer.
 234     season_id:      Id of the season the video episode belongs to, as a unicode string.
 235     episode:        Title of the video episode. Unlike mandatory video title field,
 236                     this field should denote the exact title of the video episode
 237                     without any kind of decoration.
 238     episode_number: Number of the video episode within a season, as an integer.
 239     episode_id:     Id of the video episode, as a unicode string.
 240
 241     The following fields should only be used when the media is a track or a part of
 242     a music album:
 243
 244     track:          Title of the track.
 245     track_number:   Number of the track within an album or a disc, as an integer.
 246     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 247                     as a unicode string.
 248     artist:         Artist(s) of the track.
 249     genre:          Genre(s) of the track.
 250     album:          Title of the album the track belongs to.
 251     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 252     album_artist:   List of all artists appeared on the album (e.g.
 253                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 254                     and compilations).
 255     disc_number:    Number of the disc or other physical medium the track belongs to,
 256                     as an integer.
 257     release_year:   Year (YYYY) when the album was released.
 258
 259     Unless mentioned otherwise, the fields should be Unicode strings.
 260
 261     Unless mentioned otherwise, None is equivalent to absence of information.
 262
 263
 264     _type "playlist" indicates multiple videos.
 265     There must be a key "entries", which is a list, an iterable, or a PagedList
 266     object, each element of which is a valid dictionary by this specification.
 267
 268     Additionally, playlists can have "title", "description" and "id" attributes
 269     with the same semantics as videos (see above).
 270
 271
 272     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
 274     It must have an entries key like a playlist and contain all the keys
 275     required for a video at the same time.
 276
 277
 278     _type "url" indicates that the video must be extracted from another
 279     location, possibly by a different extractor. Its only required key is:
 280     "url" - the next URL to extract.
 281     The key "ie_key" can be set to the class name (minus the trailing "IE",
 282     e.g. "Youtube") if the extractor class is known in advance.
 283     Additionally, the dictionary may have any properties of the resolved entity
 284     known in advance, for example "title" if the title of the referred video is
 285     known ahead of time.
 286
 287
 288     _type "url_transparent" entities have the same specification as "url", but
 289     indicate that the given additional information is more precise than the one
 290     associated with the resolved URL.
 291     This is useful when a site employs a video service that hosts the video and
 292     its technical metadata, but that video service does not embed a useful
 293     title, description etc.
 294
 295
 296     Subclasses of this one should re-define the _real_initialize() and
 297     _real_extract() methods and define a _VALID_URL regexp.
 298     Probably, they should also be added to the list of extractors.
 299
 300     Finally, the _WORKING attribute should be set to False for broken IEs
 301     in order to warn the users and skip the tests.
 302     """
 303
 304     _ready = False
 305     _downloader = None
 306     _WORKING = True
 307
 308     def __init__(self, downloader=None):
 309         """Constructor. Receives an optional downloader."""
 310         self._ready = False
 311         self.set_downloader(downloader)
 312
 313     @classmethod
 314     def suitable(cls, url):
 315         """Receives a URL and returns True if suitable for this IE."""
 316
 317         # This does not use has/getattr intentionally - we want to know whether
 318         # we have cached the regexp for *this* class, whereas getattr would also
 319         # match the superclass
 320         if '_VALID_URL_RE' not in cls.__dict__:
 321             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 322         return cls._VALID_URL_RE.match(url) is not None
 323
 324     @classmethod
 325     def _match_id(cls, url):
 326         if '_VALID_URL_RE' not in cls.__dict__:
 327             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 328         m = cls._VALID_URL_RE.match(url)
 329         assert m
 330         return m.group('id')
 331
 332     @classmethod
 333     def working(cls):
 334         """Getter method for _WORKING."""
 335         return cls._WORKING
 336
 337     def initialize(self):
 338         """Initializes an instance (authentication, etc)."""
 339         if not self._ready:
 340             self._real_initialize()
 341             self._ready = True
 342
 343     def extract(self, url):
 344         """Extracts URL information and returns it in list of dicts."""
 345         try:
 346             self.initialize()
 347             return self._real_extract(url)
 348         except ExtractorError:
 349             raise
 350         except compat_http_client.IncompleteRead as e:
 351             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 352         except (KeyError, StopIteration) as e:
 353             raise ExtractorError('An extractor error has occurred.', cause=e)
 354
 355     def set_downloader(self, downloader):
 356         """Sets the downloader for this IE."""
 357         self._downloader = downloader
 358
 359     def _real_initialize(self):
 360         """Real initialization process. Redefine in subclasses."""
 361         pass
 362
 363     def _real_extract(self, url):
 364         """Real extraction process. Redefine in subclasses."""
 365         pass
 366
 367     @classmethod
 368     def ie_key(cls):
 369         """A string for getting the InfoExtractor with get_info_extractor"""
 370         return compat_str(cls.__name__[:-2])
 371
 372     @property
 373     def IE_NAME(self):
 374         return compat_str(type(self).__name__[:-2])
 375
 376     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 377         """ Returns the response handle """
 378         if note is None:
 379             self.report_download_webpage(video_id)
 380         elif note is not False:
 381             if video_id is None:
 382                 self.to_screen('%s' % (note,))
 383             else:
 384                 self.to_screen('%s: %s' % (video_id, note))
 385         if isinstance(url_or_request, compat_urllib_request.Request):
 386             url_or_request = update_Request(
 387                 url_or_request, data=data, headers=headers, query=query)
 388         else:
 389             if query:
 390                 url_or_request = update_url_query(url_or_request, query)
 391             if data is not None or headers:
 392                 url_or_request = sanitized_Request(url_or_request, data, headers)
 393         try:
 394             return self._downloader.urlopen(url_or_request)
 395         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 396             if errnote is False:
 397                 return False
 398             if errnote is None:
 399                 errnote = 'Unable to download webpage'
 400
 401             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 402             if fatal:
 403                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 404             else:
 405                 self._downloader.report_warning(errmsg)
 406                 return False
 407
 408     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 409         """ Returns a tuple (page content as string, URL handle) """
 410         # Strip hashes from the URL (#1038)
 411         if isinstance(url_or_request, (compat_str, str)):
 412             url_or_request = url_or_request.partition('#')[0]
 413
 414         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 415         if urlh is False:
 416             assert not fatal
 417             return False
 418         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 419         return (content, urlh)
 420
 421     @staticmethod
 422     def _guess_encoding_from_content(content_type, webpage_bytes):
 423         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 424         if m:
 425             encoding = m.group(1)
 426         else:
 427             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 428                           webpage_bytes[:1024])
 429             if m:
 430                 encoding = m.group(1).decode('ascii')
 431             elif webpage_bytes.startswith(b'\xff\xfe'):
 432                 encoding = 'utf-16'
 433             else:
 434                 encoding = 'utf-8'
 435
 436         return encoding
 437
 438     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 439         content_type = urlh.headers.get('Content-Type', '')
 440         webpage_bytes = urlh.read()
 441         if prefix is not None:
 442             webpage_bytes = prefix + webpage_bytes
 443         if not encoding:
 444             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 445         if self._downloader.params.get('dump_intermediate_pages', False):
 446             try:
 447                 url = url_or_request.get_full_url()
 448             except AttributeError:
 449                 url = url_or_request
 450             self.to_screen('Dumping request to ' + url)
 451             dump = base64.b64encode(webpage_bytes).decode('ascii')
 452             self._downloader.to_screen(dump)
 453         if self._downloader.params.get('write_pages', False):
 454             try:
 455                 url = url_or_request.get_full_url()
 456             except AttributeError:
 457                 url = url_or_request
 458             basen = '%s_%s' % (video_id, url)
 459             if len(basen) > 240:
 460                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 461                 basen = basen[:240 - len(h)] + h
 462             raw_filename = basen + '.dump'
 463             filename = sanitize_filename(raw_filename, restricted=True)
 464             self.to_screen('Saving request to ' + filename)
 465             # Working around MAX_PATH limitation on Windows (see
 466             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 467             if compat_os_name == 'nt':
 468                 absfilepath = os.path.abspath(filename)
 469                 if len(absfilepath) > 259:
 470                     filename = '\\\\?\\' + absfilepath
 471             with open(filename, 'wb') as outf:
 472                 outf.write(webpage_bytes)
 473
 474         try:
 475             content = webpage_bytes.decode(encoding, 'replace')
 476         except LookupError:
 477             content = webpage_bytes.decode('utf-8', 'replace')
 478
 479         if ('<title>Access to this site is blocked</title>' in content and
 480                 'Websense' in content[:512]):
 481             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 482             blocked_iframe = self._html_search_regex(
 483                 r'<iframe src="([^"]+)"', content,
 484                 'Websense information URL', default=None)
 485             if blocked_iframe:
 486                 msg += ' Visit %s for more details' % blocked_iframe
 487             raise ExtractorError(msg, expected=True)
 488         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 489             msg = (
 490                 'Access to this webpage has been blocked by Indian censorship. '
 491                 'Use a VPN or proxy server (with --proxy) to route around it.')
 492             block_msg = self._html_search_regex(
 493                 r'</h1><p>(.*?)</p>',
 494                 content, 'block message', default=None)
 495             if block_msg:
 496                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 497             raise ExtractorError(msg, expected=True)
 498
 499         return content
 500
 501     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 502         """ Returns the data of the page as a string """
 503         success = False
 504         try_count = 0
 505         while success is False:
 506             try:
 507                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 508                 success = True
 509             except compat_http_client.IncompleteRead as e:
 510                 try_count += 1
 511                 if try_count >= tries:
 512                     raise e
 513                 self._sleep(timeout, video_id)
 514         if res is False:
 515             return res
 516         else:
 517             content, _ = res
 518             return content
 519
 520     def _download_xml(self, url_or_request, video_id,
 521                       note='Downloading XML', errnote='Unable to download XML',
 522                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 523         """Return the xml as an xml.etree.ElementTree.Element"""
 524         xml_string = self._download_webpage(
 525             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 526         if xml_string is False:
 527             return xml_string
 528         if transform_source:
 529             xml_string = transform_source(xml_string)
 530         return compat_etree_fromstring(xml_string.encode('utf-8'))
 531
 532     def _download_json(self, url_or_request, video_id,
 533                        note='Downloading JSON metadata',
 534                        errnote='Unable to download JSON metadata',
 535                        transform_source=None,
 536                        fatal=True, encoding=None, data=None, headers={}, query={}):
 537         json_string = self._download_webpage(
 538             url_or_request, video_id, note, errnote, fatal=fatal,
 539             encoding=encoding, data=data, headers=headers, query=query)
 540         if (not fatal) and json_string is False:
 541             return None
 542         return self._parse_json(
 543             json_string, video_id, transform_source=transform_source, fatal=fatal)
 544
 545     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 546         if transform_source:
 547             json_string = transform_source(json_string)
 548         try:
 549             return json.loads(json_string)
 550         except ValueError as ve:
 551             errmsg = '%s: Failed to parse JSON ' % video_id
 552             if fatal:
 553                 raise ExtractorError(errmsg, cause=ve)
 554             else:
 555                 self.report_warning(errmsg + str(ve))
 556
 557     def report_warning(self, msg, video_id=None):
 558         idstr = '' if video_id is None else '%s: ' % video_id
 559         self._downloader.report_warning(
 560             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 561
 562     def to_screen(self, msg):
 563         """Print msg to screen, prefixing it with '[ie_name]'"""
 564         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 565
 566     def report_extraction(self, id_or_name):
 567         """Report information extraction."""
 568         self.to_screen('%s: Extracting information' % id_or_name)
 569
 570     def report_download_webpage(self, video_id):
 571         """Report webpage download."""
 572         self.to_screen('%s: Downloading webpage' % video_id)
 573
 574     def report_age_confirmation(self):
 575         """Report attempt to confirm age."""
 576         self.to_screen('Confirming age')
 577
 578     def report_login(self):
 579         """Report attempt to log in."""
 580         self.to_screen('Logging in')
 581
 582     @staticmethod
 583     def raise_login_required(msg='This video is only available for registered users'):
 584         raise ExtractorError(
 585             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 586             expected=True)
 587
 588     @staticmethod
 589     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 590         raise ExtractorError(
 591             '%s. You might want to use --proxy to workaround.' % msg,
 592             expected=True)
 593
 594     # Methods for following #608
 595     @staticmethod
 596     def url_result(url, ie=None, video_id=None, video_title=None):
 597         """Returns a URL that points to a page that should be processed"""
 598         # TODO: ie should be the class used for getting the info
 599         video_info = {'_type': 'url',
 600                       'url': url,
 601                       'ie_key': ie}
 602         if video_id is not None:
 603             video_info['id'] = video_id
 604         if video_title is not None:
 605             video_info['title'] = video_title
 606         return video_info
 607
 608     @staticmethod
 609     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 610         """Returns a playlist"""
 611         video_info = {'_type': 'playlist',
 612                       'entries': entries}
 613         if playlist_id:
 614             video_info['id'] = playlist_id
 615         if playlist_title:
 616             video_info['title'] = playlist_title
 617         if playlist_description:
 618             video_info['description'] = playlist_description
 619         return video_info
 620
 621     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 622         """
 623         Perform a regex search on the given string, using a single or a list of
 624         patterns returning the first matching group.
 625         In case of failure return a default value or raise a WARNING or a
 626         RegexNotFoundError, depending on fatal, specifying the field name.
 627         """
 628         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 629             mobj = re.search(pattern, string, flags)
 630         else:
 631             for p in pattern:
 632                 mobj = re.search(p, string, flags)
 633                 if mobj:
 634                     break
 635
 636         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 637             _name = '\033[0;34m%s\033[0m' % name
 638         else:
 639             _name = name
 640
 641         if mobj:
 642             if group is None:
 643                 # return the first matching group
 644                 return next(g for g in mobj.groups() if g is not None)
 645             else:
 646                 return mobj.group(group)
 647         elif default is not NO_DEFAULT:
 648             return default
 649         elif fatal:
 650             raise RegexNotFoundError('Unable to extract %s' % _name)
 651         else:
 652             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 653             return None
 654
 655     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 656         """
 657         Like _search_regex, but strips HTML tags and unescapes entities.
 658         """
 659         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 660         if res:
 661             return clean_html(res).strip()
 662         else:
 663             return res
 664
 665     def _get_login_info(self):
 666         """
 667         Get the login info as (username, password)
 668         It will look in the netrc file using the _NETRC_MACHINE value
 669         If there's no info available, return (None, None)
 670         """
 671         if self._downloader is None:
 672             return (None, None)
 673
 674         username = None
 675         password = None
 676         downloader_params = self._downloader.params
 677
 678         # Attempt to use provided username and password or .netrc data
 679         if downloader_params.get('username') is not None:
 680             username = downloader_params['username']
 681             password = downloader_params['password']
 682         elif downloader_params.get('usenetrc', False):
 683             try:
 684                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 685                 if info is not None:
 686                     username = info[0]
 687                     password = info[2]
 688                 else:
 689                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 690             except (IOError, netrc.NetrcParseError) as err:
 691                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 692
 693         return (username, password)
 694
 695     def _get_tfa_info(self, note='two-factor verification code'):
 696         """
 697         Get the two-factor authentication info
 698         TODO - asking the user will be required for sms/phone verify
 699         currently just uses the command line option
 700         If there's no info available, return None
 701         """
 702         if self._downloader is None:
 703             return None
 704         downloader_params = self._downloader.params
 705
 706         if downloader_params.get('twofactor') is not None:
 707             return downloader_params['twofactor']
 708
 709         return compat_getpass('Type %s and press [Return]: ' % note)
 710
 711     # Helper functions for extracting OpenGraph info
 712     @staticmethod
 713     def _og_regexes(prop):
 714         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 715         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 716                        % {'prop': re.escape(prop)})
 717         template = r'<meta[^>]+?%s[^>]+?%s'
 718         return [
 719             template % (property_re, content_re),
 720             template % (content_re, property_re),
 721         ]
 722
 723     @staticmethod
 724     def _meta_regex(prop):
 725         return r'''(?isx)<meta
 726                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 727                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 728
 729     def _og_search_property(self, prop, html, name=None, **kargs):
 730         if name is None:
 731             name = 'OpenGraph %s' % prop
 732         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 733         if escaped is None:
 734             return None
 735         return unescapeHTML(escaped)
 736
 737     def _og_search_thumbnail(self, html, **kargs):
 738         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 739
 740     def _og_search_description(self, html, **kargs):
 741         return self._og_search_property('description', html, fatal=False, **kargs)
 742
 743     def _og_search_title(self, html, **kargs):
 744         return self._og_search_property('title', html, **kargs)
 745
 746     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 747         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 748         if secure:
 749             regexes = self._og_regexes('video:secure_url') + regexes
 750         return self._html_search_regex(regexes, html, name, **kargs)
 751
 752     def _og_search_url(self, html, **kargs):
 753         return self._og_search_property('url', html, **kargs)
 754
 755     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 756         if not isinstance(name, (list, tuple)):
 757             name = [name]
 758         if display_name is None:
 759             display_name = name[0]
 760         return self._html_search_regex(
 761             [self._meta_regex(n) for n in name],
 762             html, display_name, fatal=fatal, group='content', **kwargs)
 763
 764     def _dc_search_uploader(self, html):
 765         return self._html_search_meta('dc.creator', html, 'uploader')
 766
 767     def _rta_search(self, html):
 768         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 769         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 770                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 771                      html):
 772             return 18
 773         return 0
 774
 775     def _media_rating_search(self, html):
 776         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 777         rating = self._html_search_meta('rating', html)
 778
 779         if not rating:
 780             return None
 781
 782         RATING_TABLE = {
 783             'safe for kids': 0,
 784             'general': 8,
 785             '14 years': 14,
 786             'mature': 17,
 787             'restricted': 19,
 788         }
 789         return RATING_TABLE.get(rating.lower())
 790
 791     def _family_friendly_search(self, html):
 792         # See http://schema.org/VideoObject
 793         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 794
 795         if not family_friendly:
 796             return None
 797
 798         RATING_TABLE = {
 799             '1': 0,
 800             'true': 0,
 801             '0': 18,
 802             'false': 18,
 803         }
 804         return RATING_TABLE.get(family_friendly.lower())
 805
 806     def _twitter_search_player(self, html):
 807         return self._html_search_meta('twitter:player', html,
 808                                       'twitter card player')
 809
 810     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 811         json_ld = self._search_regex(
 812             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 813             html, 'JSON-LD', group='json_ld', **kwargs)
 814         if not json_ld:
 815             return {}
 816         return self._json_ld(
 817             json_ld, video_id, fatal=kwargs.get('fatal', True),
 818             expected_type=expected_type)
 819
 820     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 821         if isinstance(json_ld, compat_str):
 822             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 823         if not json_ld:
 824             return {}
 825         info = {}
 826         if json_ld.get('@context') == 'http://schema.org':
 827             item_type = json_ld.get('@type')
 828             if expected_type is not None and expected_type != item_type:
 829                 return info
 830             if item_type == 'TVEpisode':
 831                 info.update({
 832                     'episode': unescapeHTML(json_ld.get('name')),
 833                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 834                     'description': unescapeHTML(json_ld.get('description')),
 835                 })
 836                 part_of_season = json_ld.get('partOfSeason')
 837                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 838                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 839                 part_of_series = json_ld.get('partOfSeries')
 840                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 841                     info['series'] = unescapeHTML(part_of_series.get('name'))
 842             elif item_type == 'Article':
 843                 info.update({
 844                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 845                     'title': unescapeHTML(json_ld.get('headline')),
 846                     'description': unescapeHTML(json_ld.get('articleBody')),
 847                 })
 848             elif item_type == 'VideoObject':
 849                 info.update({
 850                     'url': json_ld.get('contentUrl'),
 851                     'title': unescapeHTML(json_ld.get('name')),
 852                     'description': unescapeHTML(json_ld.get('description')),
 853                     'thumbnail': json_ld.get('thumbnailUrl'),
 854                     'duration': parse_duration(json_ld.get('duration')),
 855                     'timestamp': unified_timestamp(json_ld.get('uploadDate')),
 856                     'filesize': float_or_none(json_ld.get('contentSize')),
 857                     'tbr': int_or_none(json_ld.get('bitrate')),
 858                     'width': int_or_none(json_ld.get('width')),
 859                     'height': int_or_none(json_ld.get('height')),
 860                 })
 861         return dict((k, v) for k, v in info.items() if v is not None)
 862
 863     @staticmethod
 864     def _hidden_inputs(html):
 865         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 866         hidden_inputs = {}
 867         for input in re.findall(r'(?i)<input([^>]+)>', html):
 868             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 869                 continue
 870             name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input)
 871             if not name:
 872                 continue
 873             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 874             if not value:
 875                 continue
 876             hidden_inputs[name.group('value')] = value.group('value')
 877         return hidden_inputs
 878
 879     def _form_hidden_inputs(self, form_id, html):
 880         form = self._search_regex(
 881             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 882             html, '%s form' % form_id, group='form')
 883         return self._hidden_inputs(form)
 884
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort the list of format dicts in place, in ascending order of the
        key computed by _formats_key (entries with higher key values end up
        last).

        If field_preference is a list or tuple of field names, the key is
        just the tuple of these fields' values (missing values are replaced
        by -1, or by '' for format_id). Otherwise a fixed tuple of criteria
        is used, see the end of _formats_key.

        Raises ExtractorError when formats is empty. As a side effect a
        missing 'tbr' may be filled in from abr + vbr and a missing 'ext'
        from the URL.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied ordering: compare only the requested fields
            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            # An explicit 'preference' is taken as is; without one, start
            # from 0 and slightly penalize f4f/f4m
            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Anything but plain http(s) gets a small penalty
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            # The position of the extension in ORDER is used as its rank
            # (unknown extensions get -1); which ORDER list applies depends
            # on the 'prefer_free_formats' param
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare element by element, so earlier entries dominate
            # later ones; missing numeric values count as -1
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 959
 960     def _check_formats(self, formats, video_id):
 961         if formats:
 962             formats[:] = filter(
 963                 lambda f: self._is_valid_url(
 964                     f['url'], video_id,
 965                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 966                 formats)
 967
 968     @staticmethod
 969     def _remove_duplicate_formats(formats):
 970         format_urls = set()
 971         unique_formats = []
 972         for f in formats:
 973             if f['url'] not in format_urls:
 974                 format_urls.add(f['url'])
 975                 unique_formats.append(f)
 976         formats[:] = unique_formats
 977
 978     def _is_valid_url(self, url, video_id, item='video'):
 979         url = self._proto_relative_url(url, scheme='http:')
 980         # For now assume non HTTP(S) URLs always valid
 981         if not (url.startswith('http://') or url.startswith('https://')):
 982             return True
 983         try:
 984             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 985             return True
 986         except ExtractorError as e:
 987             if isinstance(e.cause, compat_urllib_error.URLError):
 988                 self.to_screen(
 989                     '%s: %s URL is invalid, skipping' % (video_id, item))
 990                 return False
 991             raise
 992
 993     def http_scheme(self):
 994         """ Either "http:" or "https:", depending on the user's preferences """
 995         return (
 996             'http:'
 997             if self._downloader.params.get('prefer_insecure', False)
 998             else 'https:')
 999
1000     def _proto_relative_url(self, url, scheme=None):
1001         if url is None:
1002             return url
1003         if url.startswith('//'):
1004             if scheme is None:
1005                 scheme = self.http_scheme()
1006             return scheme + url
1007         else:
1008             return url
1009
1010     def _sleep(self, timeout, video_id, msg_template=None):
1011         if msg_template is None:
1012             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1013         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1014         self.to_screen(msg)
1015         time.sleep(timeout)
1016
1017     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1018                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1019                              fatal=True, m3u8_id=None):
1020         manifest = self._download_xml(
1021             manifest_url, video_id, 'Downloading f4m manifest',
1022             'Unable to download f4m manifest',
1023             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1024             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1025             transform_source=transform_source,
1026             fatal=fatal)
1027
1028         if manifest is False:
1029             return []
1030
1031         return self._parse_f4m_formats(
1032             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1033             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1034
1035     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1036                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1037                            fatal=True, m3u8_id=None):
1038         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1039         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1040         if akamai_pv is not None and ';' in akamai_pv.text:
1041             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1042             if playerVerificationChallenge.strip() != '':
1043                 return []
1044
1045         formats = []
1046         manifest_version = '1.0'
1047         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1048         if not media_nodes:
1049             manifest_version = '2.0'
1050             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1051         # Remove unsupported DRM protected media from final formats
1052         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1053         media_nodes = remove_encrypted_media(media_nodes)
1054         if not media_nodes:
1055             return formats
1056         base_url = xpath_text(
1057             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1058             'base URL', default=None)
1059         if base_url:
1060             base_url = base_url.strip()
1061
1062         bootstrap_info = xpath_element(
1063             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1064             'bootstrap info', default=None)
1065
1066         for i, media_el in enumerate(media_nodes):
1067             tbr = int_or_none(media_el.attrib.get('bitrate'))
1068             width = int_or_none(media_el.attrib.get('width'))
1069             height = int_or_none(media_el.attrib.get('height'))
1070             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1071             # If <bootstrapInfo> is present, the specified f4m is a
1072             # stream-level manifest, and only set-level manifests may refer to
1073             # external resources.  See section 11.4 and section 4 of F4M spec
1074             if bootstrap_info is None:
1075                 media_url = None
1076                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1077                 if manifest_version == '2.0':
1078                     media_url = media_el.attrib.get('href')
1079                 if media_url is None:
1080                     media_url = media_el.attrib.get('url')
1081                 if not media_url:
1082                     continue
1083                 manifest_url = (
1084                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1085                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1086                 # If media_url is itself a f4m manifest do the recursive extraction
1087                 # since bitrates in parent manifest (this one) and media_url manifest
1088                 # may differ leading to inability to resolve the format by requested
1089                 # bitrate in f4m downloader
1090                 ext = determine_ext(manifest_url)
1091                 if ext == 'f4m':
1092                     f4m_formats = self._extract_f4m_formats(
1093                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1094                         transform_source=transform_source, fatal=fatal)
1095                     # Sometimes stream-level manifest contains single media entry that
1096                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1097                     # At the same time parent's media entry in set-level manifest may
1098                     # contain it. We will copy it from parent in such cases.
1099                     if len(f4m_formats) == 1:
1100                         f = f4m_formats[0]
1101                         f.update({
1102                             'tbr': f.get('tbr') or tbr,
1103                             'width': f.get('width') or width,
1104                             'height': f.get('height') or height,
1105                             'format_id': f.get('format_id') if not tbr else format_id,
1106                         })
1107                     formats.extend(f4m_formats)
1108                     continue
1109                 elif ext == 'm3u8':
1110                     formats.extend(self._extract_m3u8_formats(
1111                         manifest_url, video_id, 'mp4', preference=preference,
1112                         m3u8_id=m3u8_id, fatal=fatal))
1113                     continue
1114             formats.append({
1115                 'format_id': format_id,
1116                 'url': manifest_url,
1117                 'ext': 'flv' if bootstrap_info is not None else None,
1118                 'tbr': tbr,
1119                 'width': width,
1120                 'height': height,
1121                 'preference': preference,
1122             })
1123         return formats
1124
1125     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1126         return {
1127             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1128             'url': m3u8_url,
1129             'ext': ext,
1130             'protocol': 'm3u8',
1131             'preference': preference - 1 if preference else -1,
1132             'resolution': 'multiple',
1133             'format_note': 'Quality selection URL',
1134         }
1135
1136     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1137                               entry_protocol='m3u8', preference=None,
1138                               m3u8_id=None, note=None, errnote=None,
1139                               fatal=True, live=False):
1140
1141         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1142
1143         format_url = lambda u: (
1144             u
1145             if re.match(r'^https?://', u)
1146             else compat_urlparse.urljoin(m3u8_url, u))
1147
1148         res = self._download_webpage_handle(
1149             m3u8_url, video_id,
1150             note=note or 'Downloading m3u8 information',
1151             errnote=errnote or 'Failed to download m3u8 information',
1152             fatal=fatal)
1153         if res is False:
1154             return []
1155         m3u8_doc, urlh = res
1156         m3u8_url = urlh.geturl()
1157
1158         # We should try extracting formats only from master playlists [1], i.e.
1159         # playlists that describe available qualities. On the other hand media
1160         # playlists [2] should be returned as is since they contain just the media
1161         # without qualities renditions.
1162         # Fortunately, master playlist can be easily distinguished from media
1163         # playlist based on particular tags availability. As of [1, 2] master
1164         # playlist tags MUST NOT appear in a media playist and vice versa.
1165         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1166         # and MUST NOT appear in master playlist thus we can clearly detect media
1167         # playlist with this criterion.
1168         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1169         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1170         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1171         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1172             return [{
1173                 'url': m3u8_url,
1174                 'format_id': m3u8_id,
1175                 'ext': ext,
1176                 'protocol': entry_protocol,
1177                 'preference': preference,
1178             }]
1179         last_info = None
1180         last_media = None
1181         for line in m3u8_doc.splitlines():
1182             if line.startswith('#EXT-X-STREAM-INF:'):
1183                 last_info = parse_m3u8_attributes(line)
1184             elif line.startswith('#EXT-X-MEDIA:'):
1185                 last_media = parse_m3u8_attributes(line)
1186             elif line.startswith('#') or not line.strip():
1187                 continue
1188             else:
1189                 if last_info is None:
1190                     formats.append({'url': format_url(line)})
1191                     continue
1192                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1193                 format_id = []
1194                 if m3u8_id:
1195                     format_id.append(m3u8_id)
1196                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None
1197                 # Despite specification does not mention NAME attribute for
1198                 # EXT-X-STREAM-INF it still sometimes may be present
1199                 stream_name = last_info.get('NAME') or last_media_name
1200                 # Bandwidth of live streams may differ over time thus making
1201                 # format_id unpredictable. So it's better to keep provided
1202                 # format_id intact.
1203                 if not live:
1204                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1205                 f = {
1206                     'format_id': '-'.join(format_id),
1207                     'url': format_url(line.strip()),
1208                     'tbr': tbr,
1209                     'ext': ext,
1210                     'protocol': entry_protocol,
1211                     'preference': preference,
1212                 }
1213                 resolution = last_info.get('RESOLUTION')
1214                 if resolution:
1215                     width_str, height_str = resolution.split('x')
1216                     f['width'] = int(width_str)
1217                     f['height'] = int(height_str)
1218                 codecs = last_info.get('CODECS')
1219                 if codecs:
1220                     vcodec, acodec = [None] * 2
1221                     va_codecs = codecs.split(',')
1222                     if len(va_codecs) == 1:
1223                         # Audio only entries usually come with single codec and
1224                         # no resolution. For more robustness we also check it to
1225                         # be mp4 audio.
1226                         if not resolution and va_codecs[0].startswith('mp4a'):
1227                             vcodec, acodec = 'none', va_codecs[0]
1228                         else:
1229                             vcodec = va_codecs[0]
1230                     else:
1231                         vcodec, acodec = va_codecs[:2]
1232                     f.update({
1233                         'acodec': acodec,
1234                         'vcodec': vcodec,
1235                     })
1236                 if last_media is not None:
1237                     f['m3u8_media'] = last_media
1238                     last_media = None
1239                 formats.append(f)
1240                 last_info = {}
1241         return formats
1242
1243     @staticmethod
1244     def _xpath_ns(path, namespace=None):
1245         if not namespace:
1246             return path
1247         out = []
1248         for c in path.split('/'):
1249             if not c or c == '.':
1250                 out.append(c)
1251             else:
1252                 out.append('{%s}%s' % (namespace, c))
1253         return '/'.join(out)
1254
1255     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1256         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1257
1258         if smil is False:
1259             assert not fatal
1260             return []
1261
1262         namespace = self._parse_smil_namespace(smil)
1263
1264         return self._parse_smil_formats(
1265             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1266
1267     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1268         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1269         if smil is False:
1270             return {}
1271         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1272
1273     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1274         return self._download_xml(
1275             smil_url, video_id, 'Downloading SMIL file',
1276             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1277
1278     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1279         namespace = self._parse_smil_namespace(smil)
1280
1281         formats = self._parse_smil_formats(
1282             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1283         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1284
1285         video_id = os.path.splitext(url_basename(smil_url))[0]
1286         title = None
1287         description = None
1288         upload_date = None
1289         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1290             name = meta.attrib.get('name')
1291             content = meta.attrib.get('content')
1292             if not name or not content:
1293                 continue
1294             if not title and name == 'title':
1295                 title = content
1296             elif not description and name in ('description', 'abstract'):
1297                 description = content
1298             elif not upload_date and name == 'date':
1299                 upload_date = unified_strdate(content)
1300
1301         thumbnails = [{
1302             'id': image.get('type'),
1303             'url': image.get('src'),
1304             'width': int_or_none(image.get('width')),
1305             'height': int_or_none(image.get('height')),
1306         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1307
1308         return {
1309             'id': video_id,
1310             'title': title or video_id,
1311             'description': description,
1312             'upload_date': upload_date,
1313             'thumbnails': thumbnails,
1314             'formats': formats,
1315             'subtitles': subtitles,
1316         }
1317
1318     def _parse_smil_namespace(self, smil):
1319         return self._search_regex(
1320             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1321
1322     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1323         base = smil_url
1324         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1325             b = meta.get('base') or meta.get('httpBase')
1326             if b:
1327                 base = b
1328                 break
1329
1330         formats = []
1331         rtmp_count = 0
1332         http_count = 0
1333         m3u8_count = 0
1334
1335         srcs = []
1336         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1337         for medium in media:
1338             src = medium.get('src')
1339             if not src or src in srcs:
1340                 continue
1341             srcs.append(src)
1342
1343             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1344             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1345             width = int_or_none(medium.get('width'))
1346             height = int_or_none(medium.get('height'))
1347             proto = medium.get('proto')
1348             ext = medium.get('ext')
1349             src_ext = determine_ext(src)
1350             streamer = medium.get('streamer') or base
1351
1352             if proto == 'rtmp' or streamer.startswith('rtmp'):
1353                 rtmp_count += 1
1354                 formats.append({
1355                     'url': streamer,
1356                     'play_path': src,
1357                     'ext': 'flv',
1358                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1359                     'tbr': bitrate,
1360                     'filesize': filesize,
1361                     'width': width,
1362                     'height': height,
1363                 })
1364                 if transform_rtmp_url:
1365                     streamer, src = transform_rtmp_url(streamer, src)
1366                     formats[-1].update({
1367                         'url': streamer,
1368                         'play_path': src,
1369                     })
1370                 continue
1371
1372             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1373             src_url = src_url.strip()
1374
1375             if proto == 'm3u8' or src_ext == 'm3u8':
1376                 m3u8_formats = self._extract_m3u8_formats(
1377                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1378                 if len(m3u8_formats) == 1:
1379                     m3u8_count += 1
1380                     m3u8_formats[0].update({
1381                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1382                         'tbr': bitrate,
1383                         'width': width,
1384                         'height': height,
1385                     })
1386                 formats.extend(m3u8_formats)
1387                 continue
1388
1389             if src_ext == 'f4m':
1390                 f4m_url = src_url
1391                 if not f4m_params:
1392                     f4m_params = {
1393                         'hdcore': '3.2.0',
1394                         'plugin': 'flowplayer-3.2.0.1',
1395                     }
1396                 f4m_url += '&' if '?' in f4m_url else '?'
1397                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1398                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1399                 continue
1400
1401             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1402                 http_count += 1
1403                 formats.append({
1404                     'url': src_url,
1405                     'ext': ext or src_ext or 'flv',
1406                     'format_id': 'http-%d' % (bitrate or http_count),
1407                     'tbr': bitrate,
1408                     'filesize': filesize,
1409                     'width': width,
1410                     'height': height,
1411                 })
1412                 continue
1413
1414         return formats
1415
1416     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1417         urls = []
1418         subtitles = {}
1419         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1420             src = textstream.get('src')
1421             if not src or src in urls:
1422                 continue
1423             urls.append(src)
1424             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1425             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1426             subtitles.setdefault(lang, []).append({
1427                 'url': src,
1428                 'ext': ext,
1429             })
1430         return subtitles
1431
1432     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1433         xspf = self._download_xml(
1434             playlist_url, playlist_id, 'Downloading xpsf playlist',
1435             'Unable to download xspf manifest', fatal=fatal)
1436         if xspf is False:
1437             return []
1438         return self._parse_xspf(xspf, playlist_id)
1439
1440     def _parse_xspf(self, playlist, playlist_id):
1441         NS_MAP = {
1442             'xspf': 'http://xspf.org/ns/0/',
1443             's1': 'http://static.streamone.nl/player/ns/0',
1444         }
1445
1446         entries = []
1447         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1448             title = xpath_text(
1449                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1450             description = xpath_text(
1451                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1452             thumbnail = xpath_text(
1453                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1454             duration = float_or_none(
1455                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1456
1457             formats = [{
1458                 'url': location.text,
1459                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1460                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1461                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1462             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1463             self._sort_formats(formats)
1464
1465             entries.append({
1466                 'id': playlist_id,
1467                 'title': title,
1468                 'description': description,
1469                 'thumbnail': thumbnail,
1470                 'duration': duration,
1471                 'formats': formats,
1472             })
1473         return entries
1474
1475     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1476         res = self._download_webpage_handle(
1477             mpd_url, video_id,
1478             note=note or 'Downloading MPD manifest',
1479             errnote=errnote or 'Failed to download MPD manifest',
1480             fatal=fatal)
1481         if res is False:
1482             return []
1483         mpd, urlh = res
1484         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1485
1486         return self._parse_mpd_formats(
1487             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1488
1489     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1490         if mpd_doc.get('type') == 'dynamic':
1491             return []
1492
1493         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1494
1495         def _add_ns(path):
1496             return self._xpath_ns(path, namespace)
1497
1498         def is_drm_protected(element):
1499             return element.find(_add_ns('ContentProtection')) is not None
1500
1501         def extract_multisegment_info(element, ms_parent_info):
1502             ms_info = ms_parent_info.copy()
1503             segment_list = element.find(_add_ns('SegmentList'))
1504             if segment_list is not None:
1505                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1506                 if segment_urls_e:
1507                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1508                 initialization = segment_list.find(_add_ns('Initialization'))
1509                 if initialization is not None:
1510                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1511             else:
1512                 segment_template = element.find(_add_ns('SegmentTemplate'))
1513                 if segment_template is not None:
1514                     start_number = segment_template.get('startNumber')
1515                     if start_number:
1516                         ms_info['start_number'] = int(start_number)
1517                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1518                     if segment_timeline is not None:
1519                         s_e = segment_timeline.findall(_add_ns('S'))
1520                         if s_e:
1521                             ms_info['total_number'] = 0
1522                             for s in s_e:
1523                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1524                     else:
1525                         timescale = segment_template.get('timescale')
1526                         if timescale:
1527                             ms_info['timescale'] = int(timescale)
1528                         segment_duration = segment_template.get('duration')
1529                         if segment_duration:
1530                             ms_info['segment_duration'] = int(segment_duration)
1531                     media_template = segment_template.get('media')
1532                     if media_template:
1533                         ms_info['media_template'] = media_template
1534                     initialization = segment_template.get('initialization')
1535                     if initialization:
1536                         ms_info['initialization_url'] = initialization
1537                     else:
1538                         initialization = segment_template.find(_add_ns('Initialization'))
1539                         if initialization is not None:
1540                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1541             return ms_info
1542
1543         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1544         formats = []
1545         for period in mpd_doc.findall(_add_ns('Period')):
1546             period_duration = parse_duration(period.get('duration')) or mpd_duration
1547             period_ms_info = extract_multisegment_info(period, {
1548                 'start_number': 1,
1549                 'timescale': 1,
1550             })
1551             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1552                 if is_drm_protected(adaptation_set):
1553                     continue
1554                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1555                 for representation in adaptation_set.findall(_add_ns('Representation')):
1556                     if is_drm_protected(representation):
1557                         continue
1558                     representation_attrib = adaptation_set.attrib.copy()
1559                     representation_attrib.update(representation.attrib)
1560                     # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1561                     mime_type = representation_attrib['mimeType']
1562                     content_type = mime_type.split('/')[0]
1563                     if content_type == 'text':
1564                         # TODO implement WebVTT downloading
1565                         pass
1566                     elif content_type == 'video' or content_type == 'audio':
1567                         base_url = ''
1568                         for element in (representation, adaptation_set, period, mpd_doc):
1569                             base_url_e = element.find(_add_ns('BaseURL'))
1570                             if base_url_e is not None:
1571                                 base_url = base_url_e.text + base_url
1572                                 if re.match(r'^https?://', base_url):
1573                                     break
1574                         if mpd_base_url and not re.match(r'^https?://', base_url):
1575                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1576                                 mpd_base_url += '/'
1577                             base_url = mpd_base_url + base_url
1578                         representation_id = representation_attrib.get('id')
1579                         lang = representation_attrib.get('lang')
1580                         url_el = representation.find(_add_ns('BaseURL'))
1581                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1582                         f = {
1583                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1584                             'url': base_url,
1585                             'ext': mimetype2ext(mime_type),
1586                             'width': int_or_none(representation_attrib.get('width')),
1587                             'height': int_or_none(representation_attrib.get('height')),
1588                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1589                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1590                             'fps': int_or_none(representation_attrib.get('frameRate')),
1591                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1592                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1593                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1594                             'format_note': 'DASH %s' % content_type,
1595                             'filesize': filesize,
1596                         }
1597                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1598                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1599                             if 'total_number' not in representation_ms_info and 'segment_duration':
1600                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1601                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1602                             media_template = representation_ms_info['media_template']
1603                             media_template = media_template.replace('$RepresentationID$', representation_id)
1604                             media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
1605                             media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
1606                             media_template.replace('$$', '$')
1607                             representation_ms_info['segment_urls'] = [
1608                                 media_template % {
1609                                     'Number': segment_number,
1610                                     'Bandwidth': representation_attrib.get('bandwidth')}
1611                                 for segment_number in range(
1612                                     representation_ms_info['start_number'],
1613                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1614                         if 'segment_urls' in representation_ms_info:
1615                             f.update({
1616                                 'segment_urls': representation_ms_info['segment_urls'],
1617                                 'protocol': 'http_dash_segments',
1618                             })
1619                             if 'initialization_url' in representation_ms_info:
1620                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1621                                 f.update({
1622                                     'initialization_url': initialization_url,
1623                                 })
1624                                 if not f.get('url'):
1625                                     f['url'] = initialization_url
1626                         try:
1627                             existing_format = next(
1628                                 fo for fo in formats
1629                                 if fo['format_id'] == representation_id)
1630                         except StopIteration:
1631                             full_info = formats_dict.get(representation_id, {}).copy()
1632                             full_info.update(f)
1633                             formats.append(full_info)
1634                         else:
1635                             existing_format.update(f)
1636                     else:
1637                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1638         return formats
1639
1640     def _parse_html5_media_entries(self, base_url, webpage):
1641         def absolute_url(video_url):
1642             return compat_urlparse.urljoin(base_url, video_url)
1643
1644         def parse_content_type(content_type):
1645             if not content_type:
1646                 return {}
1647             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
1648             if ctr:
1649                 mimetype, codecs = ctr.groups()
1650                 f = parse_codecs(codecs)
1651                 f['ext'] = mimetype2ext(mimetype)
1652                 return f
1653             return {}
1654
1655         entries = []
1656         for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
1657             media_info = {
1658                 'formats': [],
1659                 'subtitles': {},
1660             }
1661             media_attributes = extract_attributes(media_tag)
1662             src = media_attributes.get('src')
1663             if src:
1664                 media_info['formats'].append({
1665                     'url': absolute_url(src),
1666                     'vcodec': 'none' if media_type == 'audio' else None,
1667                 })
1668             media_info['thumbnail'] = media_attributes.get('poster')
1669             if media_content:
1670                 for source_tag in re.findall(r'<source[^>]+>', media_content):
1671                     source_attributes = extract_attributes(source_tag)
1672                     src = source_attributes.get('src')
1673                     if not src:
1674                         continue
1675                     f = parse_content_type(source_attributes.get('type'))
1676                     f.update({
1677                         'url': absolute_url(src),
1678                         'vcodec': 'none' if media_type == 'audio' else None,
1679                     })
1680                     media_info['formats'].append(f)
1681                 for track_tag in re.findall(r'<track[^>]+>', media_content):
1682                     track_attributes = extract_attributes(track_tag)
1683                     kind = track_attributes.get('kind')
1684                     if not kind or kind == 'subtitles':
1685                         src = track_attributes.get('src')
1686                         if not src:
1687                             continue
1688                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
1689                         media_info['subtitles'].setdefault(lang, []).append({
1690                             'url': absolute_url(src),
1691                         })
1692             if media_info['formats']:
1693                 entries.append(media_info)
1694         return entries
1695
1696     def _live_title(self, name):
1697         """ Generate the title for a live video """
1698         now = datetime.datetime.now()
1699         now_str = now.strftime('%Y-%m-%d %H:%M')
1700         return name + ' ' + now_str
1701
1702     def _int(self, v, name, fatal=False, **kwargs):
1703         res = int_or_none(v, **kwargs)
1704         if 'get_attr' in kwargs:
1705             print(getattr(v, kwargs['get_attr']))
1706         if res is None:
1707             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1708             if fatal:
1709                 raise ExtractorError(msg)
1710             else:
1711                 self._downloader.report_warning(msg)
1712         return res
1713
1714     def _float(self, v, name, fatal=False, **kwargs):
1715         res = float_or_none(v, **kwargs)
1716         if res is None:
1717             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1718             if fatal:
1719                 raise ExtractorError(msg)
1720             else:
1721                 self._downloader.report_warning(msg)
1722         return res
1723
1724     def _set_cookie(self, domain, name, value, expire_time=None):
1725         cookie = compat_cookiejar.Cookie(
1726             0, name, value, None, None, domain, None,
1727             None, '/', True, False, expire_time, '', None, None, None)
1728         self._downloader.cookiejar.set_cookie(cookie)
1729
1730     def _get_cookies(self, url):
1731         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1732         req = sanitized_Request(url)
1733         self._downloader.cookiejar.add_cookie_header(req)
1734         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1735
1736     def get_testcases(self, include_onlymatching=False):
1737         t = getattr(self, '_TEST', None)
1738         if t:
1739             assert not hasattr(self, '_TESTS'), \
1740                 '%s has _TEST and _TESTS' % type(self).__name__
1741             tests = [t]
1742         else:
1743             tests = getattr(self, '_TESTS', [])
1744         for t in tests:
1745             if not include_onlymatching and t.get('only_matching', False):
1746                 continue
1747             t['name'] = type(self).__name__[:-len('IE')]
1748             yield t
1749
1750     def is_suitable(self, age_limit):
1751         """ Test whether the extractor is generally suitable for the given
1752         age limit (i.e. pornographic sites are not, all others usually are) """
1753
1754         any_restricted = False
1755         for tc in self.get_testcases(include_onlymatching=False):
1756             if 'playlist' in tc:
1757                 tc = tc['playlist'][0]
1758             is_restricted = age_restricted(
1759                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1760             if not is_restricted:
1761                 return True
1762             any_restricted = any_restricted or is_restricted
1763         return not any_restricted
1764
1765     def extract_subtitles(self, *args, **kwargs):
1766         if (self._downloader.params.get('writesubtitles', False) or
1767                 self._downloader.params.get('listsubtitles')):
1768             return self._get_subtitles(*args, **kwargs)
1769         return {}
1770
1771     def _get_subtitles(self, *args, **kwargs):
1772         raise NotImplementedError('This method must be implemented by subclasses')
1773
1774     @staticmethod
1775     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1776         """ Merge subtitle items for one language. Items with duplicated URLs
1777         will be dropped. """
1778         list1_urls = set([item['url'] for item in subtitle_list1])
1779         ret = list(subtitle_list1)
1780         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1781         return ret
1782
1783     @classmethod
1784     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1785         """ Merge two subtitle dictionaries, language by language. """
1786         ret = dict(subtitle_dict1)
1787         for lang in subtitle_dict2:
1788             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1789         return ret
1790
1791     def extract_automatic_captions(self, *args, **kwargs):
1792         if (self._downloader.params.get('writeautomaticsub', False) or
1793                 self._downloader.params.get('listsubtitles')):
1794             return self._get_automatic_captions(*args, **kwargs)
1795         return {}
1796
1797     def _get_automatic_captions(self, *args, **kwargs):
1798         raise NotImplementedError('This method must be implemented by subclasses')
1799
1800     def mark_watched(self, *args, **kwargs):
1801         if (self._downloader.params.get('mark_watched', False) and
1802                 (self._get_login_info()[0] is not None or
1803                     self._downloader.params.get('cookiefile') is not None)):
1804             self._mark_watched(*args, **kwargs)
1805
1806     def _mark_watched(self, *args, **kwargs):
1807         raise NotImplementedError('This method must be implemented by subclasses')
1808
1809     def geo_verification_headers(self):
1810         headers = {}
1811         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
1812         if geo_verification_proxy:
1813             headers['Ytdl-request-proxy'] = geo_verification_proxy
1814         return headers
1815
1816
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching queries for this search key"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        match = re.match(cls._make_valid_url(), url)
        return match is not None

    def _real_extract(self, query):
        """Parse the query and fetch the requested number of results

        An empty prefix asks for one result, 'all' for _MAX_RESULTS and a
        number for that many results, capped at _MAX_RESULTS with a warning.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY of this extractor"""
        return self._SEARCH_KEY