_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_getpass,
  19     compat_HTTPError,
  20     compat_http_client,
  21     compat_urllib_error,
  22     compat_urllib_parse,
  23     compat_urllib_parse_urlparse,
  24     compat_urllib_request,
  25     compat_urlparse,
  26     compat_str,
  27 )
  28 from ..utils import (
  29     NO_DEFAULT,
  30     age_restricted,
  31     bug_reports_message,
  32     clean_html,
  33     compiled_regex_type,
  34     determine_ext,
  35     ExtractorError,
  36     fix_xml_ampersands,
  37     float_or_none,
  38     int_or_none,
  39     RegexNotFoundError,
  40     sanitize_filename,
  41     unescapeHTML,
  42     url_basename,
  43     xpath_text,
  44     xpath_with_ns,
  45 )
  46
  47
  48 class InfoExtractor(object):
  49     """Information Extractor class.
  50
  51     Information extractors are the classes that, given a URL, extract
  52     information about the video (or videos) the URL refers to. This
  53     information includes the real video URL, the video title, author and
  54     others. The information is stored in a dictionary which is then
  55     passed to the YoutubeDL. The YoutubeDL processes this
  56     information possibly downloading the video to the file system, among
  57     other possible outcomes.
  58
  59     The type field determines the type of the result.
  60     By far the most common value (and the default if _type is missing) is
  61     "video", which indicates a single video.
  62
  63     For a video, the dictionaries must include the following fields:
  64
  65     id:             Video identifier.
  66     title:          Video title, unescaped.
  67
  68     Additionally, it must contain either a formats entry or a url one:
  69
  70     formats:        A list of dictionaries for each format available, ordered
  71                     from worst to best quality.
  72
  73                     Potential fields:
  74                     * url        Mandatory. The URL of the video file
  75                     * ext        Will be calculated from URL if missing
  76                     * format     A human-readable description of the format
  77                                  ("mp4 container with h264/opus").
   78                                  Calculated from the format_id, width, height,
  79                                  and format_note fields if missing.
  80                     * format_id  A short description of the format
  81                                  ("mp4_h264_opus" or "19").
  82                                 Technically optional, but strongly recommended.
  83                     * format_note Additional info about the format
  84                                  ("3D" or "DASH video")
  85                     * width      Width of the video, if known
  86                     * height     Height of the video, if known
  87                     * resolution Textual description of width and height
  88                     * tbr        Average bitrate of audio and video in KBit/s
  89                     * abr        Average audio bitrate in KBit/s
  90                     * acodec     Name of the audio codec in use
  91                     * asr        Audio sampling rate in Hertz
  92                     * vbr        Average video bitrate in KBit/s
  93                     * fps        Frame rate
  94                     * vcodec     Name of the video codec in use
  95                     * container  Name of the container format
  96                     * filesize   The number of bytes, if known in advance
  97                     * filesize_approx  An estimate for the number of bytes
  98                     * player_url SWF Player URL (used for rtmpdump).
  99                     * protocol   The protocol that will be used for the actual
 100                                  download, lower-case.
 101                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 102                                  "m3u8", or "m3u8_native".
 103                     * preference Order number of this format. If this field is
 104                                  present and not None, the formats get sorted
 105                                  by this field, regardless of all other values.
 106                                  -1 for default (order by other properties),
 107                                  -2 or smaller for less than default.
 108                                  < -1000 to hide the format (if there is
 109                                     another one which is strictly better)
 110                     * language_preference  Is this in the correct requested
 111                                  language?
 112                                  10 if it's what the URL is about,
 113                                  -1 for default (don't know),
 114                                  -10 otherwise, other values reserved for now.
 115                     * quality    Order number of the video quality of this
 116                                  format, irrespective of the file format.
 117                                  -1 for default (order by other properties),
 118                                  -2 or smaller for less than default.
 119                     * source_preference  Order number for this video source
 120                                   (quality takes higher priority)
 121                                  -1 for default (order by other properties),
 122                                  -2 or smaller for less than default.
 123                     * http_headers  A dictionary of additional HTTP headers
 124                                  to add to the request.
 125                     * stretched_ratio  If given and not 1, indicates that the
 126                                  video's pixels are not square.
 127                                  width : height ratio as float.
 128                     * no_resume  The server does not support resuming the
 129                                  (HTTP or RTMP) download. Boolean.
 130
 131     url:            Final video URL.
 132     ext:            Video filename extension.
 133     format:         The video format, defaults to ext (used for --get-format)
 134     player_url:     SWF Player URL (used for rtmpdump).
 135
 136     The following fields are optional:
 137
 138     alt_title:      A secondary title of the video.
 139     display_id      An alternative identifier for the video, not necessarily
 140                     unique, but available before title. Typically, id is
 141                     something like "4234987", title "Dancing naked mole rats",
 142                     and display_id "dancing-naked-mole-rats"
 143     thumbnails:     A list of dictionaries, with the following entries:
 144                         * "id" (optional, string) - Thumbnail format ID
 145                         * "url"
 146                         * "preference" (optional, int) - quality of the image
 147                         * "width" (optional, int)
 148                         * "height" (optional, int)
  149                         * "resolution" (optional, string "{width}x{height}",
 150                                         deprecated)
 151     thumbnail:      Full URL to a video thumbnail image.
 152     description:    Full video description.
 153     uploader:       Full name of the video uploader.
 154     creator:        The main artist who created the video.
 155     timestamp:      UNIX timestamp of the moment the video became available.
 156     upload_date:    Video upload date (YYYYMMDD).
 157                     If not explicitly set, calculated from timestamp.
 158     uploader_id:    Nickname or id of the video uploader.
 159     location:       Physical location where the video was filmed.
 160     subtitles:      The available subtitles as a dictionary in the format
 161                     {language: subformats}. "subformats" is a list sorted from
 162                     lower to higher preference, each element is a dictionary
 163                     with the "ext" entry and one of:
 164                         * "data": The subtitles file contents
 165                         * "url": A URL pointing to the subtitles file
 166     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 167                     automatically generated captions
 168     duration:       Length of the video in seconds, as an integer.
 169     view_count:     How many users have watched the video on the platform.
 170     like_count:     Number of positive ratings of the video
 171     dislike_count:  Number of negative ratings of the video
  172     average_rating: Average rating given by users, the scale used depends on the webpage
 173     comment_count:  Number of comments on the video
 174     comments:       A list of comments, each with one or more of the following
 175                     properties (all but one of text or html optional):
 176                         * "author" - human-readable name of the comment author
 177                         * "author_id" - user ID of the comment author
 178                         * "id" - Comment ID
 179                         * "html" - Comment as HTML
 180                         * "text" - Plain text of the comment
 181                         * "timestamp" - UNIX timestamp of comment
 182                         * "parent" - ID of the comment this one is replying to.
 183                                      Set to "root" to indicate that this is a
 184                                      comment to the original video.
 185     age_limit:      Age restriction for the video, as an integer (years)
 186     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 187                     should allow to get the same result again. (It will be set
 188                     by YoutubeDL if it's missing)
 189     categories:     A list of categories that the video falls in, for example
 190                     ["Sports", "Berlin"]
 191     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 192     is_live:        True, False, or None (=unknown). Whether this video is a
 193                     live stream that goes on instead of a fixed-length video.
 194     start_time:     Time in seconds where the reproduction should start, as
 195                     specified in the URL.
 196     end_time:       Time in seconds where the reproduction should end, as
 197                     specified in the URL.
 198
 199     Unless mentioned otherwise, the fields should be Unicode strings.
 200
 201     Unless mentioned otherwise, None is equivalent to absence of information.
 202
 203
 204     _type "playlist" indicates multiple videos.
 205     There must be a key "entries", which is a list, an iterable, or a PagedList
 206     object, each element of which is a valid dictionary by this specification.
 207
 208     Additionally, playlists can have "title", "description" and "id" attributes
 209     with the same semantics as videos (see above).
 210
 211
 212     _type "multi_video" indicates that there are multiple videos that
  213     form a single show, for example multiple acts of an opera or TV episode.
 214     It must have an entries key like a playlist and contain all the keys
 215     required for a video at the same time.
 216
 217
 218     _type "url" indicates that the video must be extracted from another
 219     location, possibly by a different extractor. Its only required key is:
 220     "url" - the next URL to extract.
 221     The key "ie_key" can be set to the class name (minus the trailing "IE",
 222     e.g. "Youtube") if the extractor class is known in advance.
 223     Additionally, the dictionary may have any properties of the resolved entity
 224     known in advance, for example "title" if the title of the referred video is
 225     known ahead of time.
 226
 227
 228     _type "url_transparent" entities have the same specification as "url", but
 229     indicate that the given additional information is more precise than the one
 230     associated with the resolved URL.
 231     This is useful when a site employs a video service that hosts the video and
 232     its technical metadata, but that video service does not embed a useful
 233     title, description etc.
 234
 235
 236     Subclasses of this one should re-define the _real_initialize() and
 237     _real_extract() methods and define a _VALID_URL regexp.
 238     Probably, they should also be added to the list of extractors.
 239
 240     Finally, the _WORKING attribute should be set to False for broken IEs
 241     in order to warn the users and skip the tests.
 242     """
 243
    # Becomes True on an instance once initialize() has run _real_initialize().
    _ready = False
    # Downloader object this extractor talks to; assigned by set_downloader().
    _downloader = None
    # Set to False in subclasses for broken IEs (warns users, skips the tests).
    _WORKING = True
 247
 248     def __init__(self, downloader=None):
 249         """Constructor. Receives an optional downloader."""
 250         self._ready = False
 251         self.set_downloader(downloader)
 252
 253     @classmethod
 254     def suitable(cls, url):
 255         """Receives a URL and returns True if suitable for this IE."""
 256
 257         # This does not use has/getattr intentionally - we want to know whether
 258         # we have cached the regexp for *this* class, whereas getattr would also
 259         # match the superclass
 260         if '_VALID_URL_RE' not in cls.__dict__:
 261             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 262         return cls._VALID_URL_RE.match(url) is not None
 263
 264     @classmethod
 265     def _match_id(cls, url):
 266         if '_VALID_URL_RE' not in cls.__dict__:
 267             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 268         m = cls._VALID_URL_RE.match(url)
 269         assert m
 270         return m.group('id')
 271
 272     @classmethod
 273     def working(cls):
 274         """Getter method for _WORKING."""
 275         return cls._WORKING
 276
 277     def initialize(self):
 278         """Initializes an instance (authentication, etc)."""
 279         if not self._ready:
 280             self._real_initialize()
 281             self._ready = True
 282
 283     def extract(self, url):
 284         """Extracts URL information and returns it in list of dicts."""
 285         try:
 286             self.initialize()
 287             return self._real_extract(url)
 288         except ExtractorError:
 289             raise
 290         except compat_http_client.IncompleteRead as e:
 291             raise ExtractorError('A network error has occured.', cause=e, expected=True)
 292         except (KeyError, StopIteration) as e:
 293             raise ExtractorError('An extractor error has occured.', cause=e)
 294
    def set_downloader(self, downloader):
        """Sets the downloader for this IE (stored in self._downloader)."""
        self._downloader = downloader
 298
 299     def _real_initialize(self):
 300         """Real initialization process. Redefine in subclasses."""
 301         pass
 302
 303     def _real_extract(self, url):
 304         """Real extraction process. Redefine in subclasses."""
 305         pass
 306
 307     @classmethod
 308     def ie_key(cls):
 309         """A string for getting the InfoExtractor with get_info_extractor"""
 310         return cls.__name__[:-2]
 311
 312     @property
 313     def IE_NAME(self):
 314         return type(self).__name__[:-2]
 315
 316     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 317         """ Returns the response handle """
 318         if note is None:
 319             self.report_download_webpage(video_id)
 320         elif note is not False:
 321             if video_id is None:
 322                 self.to_screen('%s' % (note,))
 323             else:
 324                 self.to_screen('%s: %s' % (video_id, note))
 325         try:
 326             return self._downloader.urlopen(url_or_request)
 327         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 328             if errnote is False:
 329                 return False
 330             if errnote is None:
 331                 errnote = 'Unable to download webpage'
 332             errmsg = '%s: %s' % (errnote, compat_str(err))
 333             if fatal:
 334                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 335             else:
 336                 self._downloader.report_warning(errmsg)
 337                 return False
 338
 339     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 340         """ Returns a tuple (page content as string, URL handle) """
 341         # Strip hashes from the URL (#1038)
 342         if isinstance(url_or_request, (compat_str, str)):
 343             url_or_request = url_or_request.partition('#')[0]
 344
 345         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 346         if urlh is False:
 347             assert not fatal
 348             return False
 349         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 350         return (content, urlh)
 351
 352     @staticmethod
 353     def _guess_encoding_from_content(content_type, webpage_bytes):
 354         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 355         if m:
 356             encoding = m.group(1)
 357         else:
 358             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 359                           webpage_bytes[:1024])
 360             if m:
 361                 encoding = m.group(1).decode('ascii')
 362             elif webpage_bytes.startswith(b'\xff\xfe'):
 363                 encoding = 'utf-16'
 364             else:
 365                 encoding = 'utf-8'
 366
 367         return encoding
 368
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the response body from urlh and return it decoded as a string.

        prefix:   bytes put in front of the data read from urlh, if not None.
        encoding: encoding used for decoding; when not given it is guessed
                  with _guess_encoding_from_content().

        Depending on the downloader params, the raw bytes are also dumped to
        the screen base64-encoded ('dump_intermediate_pages') and/or written
        to a '.dump' file ('write_pages').

        Raises ExtractorError (expected) if the page is recognised as a
        Websense or Indian-censorship block page.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request is either a request object or a plain URL string
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            # Cap the base name at 240 characters; the cut-off tail is
            # replaced by an MD5 digest of the full name.
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # An encoding name Python does not know raises LookupError; fall back
        # to UTF-8 in that case.  Undecodable bytes are replaced either way.
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect block pages served in place of the requested content
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
 431
 432     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 433         """ Returns the data of the page as a string """
 434         success = False
 435         try_count = 0
 436         while success is False:
 437             try:
 438                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 439                 success = True
 440             except compat_http_client.IncompleteRead as e:
 441                 try_count += 1
 442                 if try_count >= tries:
 443                     raise e
 444                 self._sleep(timeout, video_id)
 445         if res is False:
 446             return res
 447         else:
 448             content, _ = res
 449             return content
 450
 451     def _download_xml(self, url_or_request, video_id,
 452                       note='Downloading XML', errnote='Unable to download XML',
 453                       transform_source=None, fatal=True, encoding=None):
 454         """Return the xml as an xml.etree.ElementTree.Element"""
 455         xml_string = self._download_webpage(
 456             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 457         if xml_string is False:
 458             return xml_string
 459         if transform_source:
 460             xml_string = transform_source(xml_string)
 461         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 462
 463     def _download_json(self, url_or_request, video_id,
 464                        note='Downloading JSON metadata',
 465                        errnote='Unable to download JSON metadata',
 466                        transform_source=None,
 467                        fatal=True, encoding=None):
 468         json_string = self._download_webpage(
 469             url_or_request, video_id, note, errnote, fatal=fatal,
 470             encoding=encoding)
 471         if (not fatal) and json_string is False:
 472             return None
 473         return self._parse_json(
 474             json_string, video_id, transform_source=transform_source, fatal=fatal)
 475
 476     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 477         if transform_source:
 478             json_string = transform_source(json_string)
 479         try:
 480             return json.loads(json_string)
 481         except ValueError as ve:
 482             errmsg = '%s: Failed to parse JSON ' % video_id
 483             if fatal:
 484                 raise ExtractorError(errmsg, cause=ve)
 485             else:
 486                 self.report_warning(errmsg + str(ve))
 487
 488     def report_warning(self, msg, video_id=None):
 489         idstr = '' if video_id is None else '%s: ' % video_id
 490         self._downloader.report_warning(
 491             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 492
 493     def to_screen(self, msg):
 494         """Print msg to screen, prefixing it with '[ie_name]'"""
 495         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 496
 497     def report_extraction(self, id_or_name):
 498         """Report information extraction."""
 499         self.to_screen('%s: Extracting information' % id_or_name)
 500
 501     def report_download_webpage(self, video_id):
 502         """Report webpage download."""
 503         self.to_screen('%s: Downloading webpage' % video_id)
 504
 505     def report_age_confirmation(self):
 506         """Report attempt to confirm age."""
 507         self.to_screen('Confirming age')
 508
 509     def report_login(self):
 510         """Report attempt to log in."""
 511         self.to_screen('Logging in')
 512
 513     @staticmethod
 514     def raise_login_required(msg='This video is only available for registered users'):
 515         raise ExtractorError(
 516             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 517             expected=True)
 518
 519     # Methods for following #608
 520     @staticmethod
 521     def url_result(url, ie=None, video_id=None, video_title=None):
 522         """Returns a URL that points to a page that should be processed"""
 523         # TODO: ie should be the class used for getting the info
 524         video_info = {'_type': 'url',
 525                       'url': url,
 526                       'ie_key': ie}
 527         if video_id is not None:
 528             video_info['id'] = video_id
 529         if video_title is not None:
 530             video_info['title'] = video_title
 531         return video_info
 532
 533     @staticmethod
 534     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 535         """Returns a playlist"""
 536         video_info = {'_type': 'playlist',
 537                       'entries': entries}
 538         if playlist_id:
 539             video_info['id'] = playlist_id
 540         if playlist_title:
 541             video_info['title'] = playlist_title
 542         if playlist_description:
 543             video_info['description'] = playlist_description
 544         return video_info
 545
 546     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 547         """
 548         Perform a regex search on the given string, using a single or a list of
 549         patterns returning the first matching group.
 550         In case of failure return a default value or raise a WARNING or a
 551         RegexNotFoundError, depending on fatal, specifying the field name.
 552         """
 553         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 554             mobj = re.search(pattern, string, flags)
 555         else:
 556             for p in pattern:
 557                 mobj = re.search(p, string, flags)
 558                 if mobj:
 559                     break
 560
 561         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 562             _name = '\033[0;34m%s\033[0m' % name
 563         else:
 564             _name = name
 565
 566         if mobj:
 567             if group is None:
 568                 # return the first matching group
 569                 return next(g for g in mobj.groups() if g is not None)
 570             else:
 571                 return mobj.group(group)
 572         elif default is not NO_DEFAULT:
 573             return default
 574         elif fatal:
 575             raise RegexNotFoundError('Unable to extract %s' % _name)
 576         else:
 577             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 578             return None
 579
 580     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 581         """
 582         Like _search_regex, but strips HTML tags and unescapes entities.
 583         """
 584         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 585         if res:
 586             return clean_html(res).strip()
 587         else:
 588             return res
 589
 590     def _get_login_info(self):
 591         """
 592         Get the login info as (username, password)
 593         It will look in the netrc file using the _NETRC_MACHINE value
 594         If there's no info available, return (None, None)
 595         """
 596         if self._downloader is None:
 597             return (None, None)
 598
 599         username = None
 600         password = None
 601         downloader_params = self._downloader.params
 602
 603         # Attempt to use provided username and password or .netrc data
 604         if downloader_params.get('username', None) is not None:
 605             username = downloader_params['username']
 606             password = downloader_params['password']
 607         elif downloader_params.get('usenetrc', False):
 608             try:
 609                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 610                 if info is not None:
 611                     username = info[0]
 612                     password = info[2]
 613                 else:
 614                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 615             except (IOError, netrc.NetrcParseError) as err:
 616                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 617
 618         return (username, password)
 619
 620     def _get_tfa_info(self, note='two-factor verification code'):
 621         """
 622         Get the two-factor authentication info
 623         TODO - asking the user will be required for sms/phone verify
 624         currently just uses the command line option
 625         If there's no info available, return None
 626         """
 627         if self._downloader is None:
 628             return None
 629         downloader_params = self._downloader.params
 630
 631         if downloader_params.get('twofactor', None) is not None:
 632             return downloader_params['twofactor']
 633
 634         return compat_getpass('Type %s and press [Return]: ' % note)
 635
 636     # Helper functions for extracting OpenGraph info
 637     @staticmethod
 638     def _og_regexes(prop):
 639         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 640         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 641         template = r'<meta[^>]+?%s[^>]+?%s'
 642         return [
 643             template % (property_re, content_re),
 644             template % (content_re, property_re),
 645         ]
 646
 647     @staticmethod
 648     def _meta_regex(prop):
 649         return r'''(?isx)<meta
 650                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 651                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 652
 653     def _og_search_property(self, prop, html, name=None, **kargs):
 654         if name is None:
 655             name = 'OpenGraph %s' % prop
 656         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 657         if escaped is None:
 658             return None
 659         return unescapeHTML(escaped)
 660
 661     def _og_search_thumbnail(self, html, **kargs):
 662         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 663
 664     def _og_search_description(self, html, **kargs):
 665         return self._og_search_property('description', html, fatal=False, **kargs)
 666
 667     def _og_search_title(self, html, **kargs):
 668         return self._og_search_property('title', html, **kargs)
 669
 670     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 671         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 672         if secure:
 673             regexes = self._og_regexes('video:secure_url') + regexes
 674         return self._html_search_regex(regexes, html, name, **kargs)
 675
 676     def _og_search_url(self, html, **kargs):
 677         return self._og_search_property('url', html, **kargs)
 678
 679     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 680         if display_name is None:
 681             display_name = name
 682         return self._html_search_regex(
 683             self._meta_regex(name),
 684             html, display_name, fatal=fatal, group='content', **kwargs)
 685
 686     def _dc_search_uploader(self, html):
 687         return self._html_search_meta('dc.creator', html, 'uploader')
 688
 689     def _rta_search(self, html):
 690         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 691         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 692                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 693                      html):
 694             return 18
 695         return 0
 696
 697     def _media_rating_search(self, html):
 698         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 699         rating = self._html_search_meta('rating', html)
 700
 701         if not rating:
 702             return None
 703
 704         RATING_TABLE = {
 705             'safe for kids': 0,
 706             'general': 8,
 707             '14 years': 14,
 708             'mature': 17,
 709             'restricted': 19,
 710         }
 711         return RATING_TABLE.get(rating.lower(), None)
 712
 713     def _family_friendly_search(self, html):
 714         # See http://schema.org/VideoObject
 715         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 716
 717         if not family_friendly:
 718             return None
 719
 720         RATING_TABLE = {
 721             '1': 0,
 722             'true': 0,
 723             '0': 18,
 724             'false': 18,
 725         }
 726         return RATING_TABLE.get(family_friendly.lower(), None)
 727
 728     def _twitter_search_player(self, html):
 729         return self._html_search_meta('twitter:player', html,
 730                                       'twitter card player')
 731
 732     @staticmethod
 733     def _hidden_inputs(html):
 734         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 735         hidden_inputs = {}
 736         for input in re.findall(r'(?i)<input([^>]+)>', html):
 737             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 738                 continue
 739             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 740             if not name:
 741                 continue
 742             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 743             if not value:
 744                 continue
 745             hidden_inputs[name.group('value')] = value.group('value')
 746         return hidden_inputs
 747
 748     def _form_hidden_inputs(self, form_id, html):
 749         form = self._search_regex(
 750             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 751             html, '%s form' % form_id, group='form')
 752         return self._hidden_inputs(form)
 753
    def _sort_formats(self, formats, field_preference=None):
        """ Sort the formats list in place, in ascending order of the sort key.

        formats is a list of format dicts; an ExtractorError is raised when it
        is empty. When field_preference is a list or tuple of field names,
        the key is simply those fields in order (missing/None counts as -1).
        Otherwise the key is the tuple built at the end of _formats_key.
        As a side effect, a format with a 'url' but no 'ext' gets its 'ext'
        filled in from the URL.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied ordering overrides all the heuristics below
            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                # No explicit preference: derive one from the protocol
                # (falling back to the URL scheme) and the extension
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Position in ORDER is the extension's rank; extensions that are
            # not listed rank -1
            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Earlier elements dominate later ones; missing numeric fields
            # count as -1 and a missing format_id as ''
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 816
 817     def _check_formats(self, formats, video_id):
 818         if formats:
 819             formats[:] = filter(
 820                 lambda f: self._is_valid_url(
 821                     f['url'], video_id,
 822                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 823                 formats)
 824
 825     def _is_valid_url(self, url, video_id, item='video'):
 826         url = self._proto_relative_url(url, scheme='http:')
 827         # For now assume non HTTP(S) URLs always valid
 828         if not (url.startswith('http://') or url.startswith('https://')):
 829             return True
 830         try:
 831             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 832             return True
 833         except ExtractorError as e:
 834             if isinstance(e.cause, compat_HTTPError):
 835                 self.to_screen(
 836                     '%s: %s URL is invalid, skipping' % (video_id, item))
 837                 return False
 838             raise
 839
 840     def http_scheme(self):
 841         """ Either "http:" or "https:", depending on the user's preferences """
 842         return (
 843             'http:'
 844             if self._downloader.params.get('prefer_insecure', False)
 845             else 'https:')
 846
 847     def _proto_relative_url(self, url, scheme=None):
 848         if url is None:
 849             return url
 850         if url.startswith('//'):
 851             if scheme is None:
 852                 scheme = self.http_scheme()
 853             return scheme + url
 854         else:
 855             return url
 856
 857     def _sleep(self, timeout, video_id, msg_template=None):
 858         if msg_template is None:
 859             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 860         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 861         self.to_screen(msg)
 862         time.sleep(timeout)
 863
 864     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 865                              transform_source=lambda s: fix_xml_ampersands(s).strip()):
 866         manifest = self._download_xml(
 867             manifest_url, video_id, 'Downloading f4m manifest',
 868             'Unable to download f4m manifest',
 869             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 870             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 871             transform_source=transform_source)
 872
 873         formats = []
 874         manifest_version = '1.0'
 875         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 876         if not media_nodes:
 877             manifest_version = '2.0'
 878             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 879         for i, media_el in enumerate(media_nodes):
 880             if manifest_version == '2.0':
 881                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 882                 if not media_url:
 883                     continue
 884                 manifest_url = (
 885                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 886                     else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
 887                 # If media_url is itself a f4m manifest do the recursive extraction
 888                 # since bitrates in parent manifest (this one) and media_url manifest
 889                 # may differ leading to inability to resolve the format by requested
 890                 # bitrate in f4m downloader
 891                 if determine_ext(manifest_url) == 'f4m':
 892                     formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
 893                     continue
 894             tbr = int_or_none(media_el.attrib.get('bitrate'))
 895             formats.append({
 896                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 897                 'url': manifest_url,
 898                 'ext': 'flv',
 899                 'tbr': tbr,
 900                 'width': int_or_none(media_el.attrib.get('width')),
 901                 'height': int_or_none(media_el.attrib.get('height')),
 902                 'preference': preference,
 903             })
 904         self._sort_formats(formats)
 905
 906         return formats
 907
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        """ Download an m3u8 playlist and return the formats listed in it, sorted.

        m3u8_url:       URL of the playlist; relative entries are resolved
                        against it.
        video_id:       id used for progress and error reporting.
        ext:            value stored as 'ext' in the generated formats.
        entry_protocol: value stored as 'protocol' in the per-entry formats.
        preference:     value stored as 'preference' in the per-entry formats.
        m3u8_id:        optional prefix for the generated format ids.
        note, errnote:  override the download progress / failure messages.
        fatal:          forwarded to _download_webpage.

        The result always starts out with a 'meta' format pointing at the
        playlist itself. When the download yields False, False is returned
        instead of a list.
        """

        # Format for the playlist itself; its preference is one below the
        # requested preference (or -1 when preference is None or 0)
        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # Absolute http(s) URLs are kept; anything else is joined to m3u8_url
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if m3u8_doc is False:
            return m3u8_doc
        # Attributes of the most recent #EXT-X-STREAM-INF / #EXT-X-MEDIA tags,
        # applied to the next URI line
        last_info = None
        last_media = None
        # Matches KEY=value or KEY="quoted value" attribute pairs
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags, comments and blank lines carry no format data
                continue
            else:
                # URI line
                if last_info is None:
                    # No #EXT-X-STREAM-INF seen so far: bare URL format
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # Name the format after the media NAME (subtitle media
                # excluded), else the bitrate, else the current format count
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    # Expects exactly WIDTHxHEIGHT with a lowercase 'x'
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                if last_media is not None:
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                # Reset to an empty dict (not None), so later URI lines without
                # their own #EXT-X-STREAM-INF still take this branch
                last_info = {}
        self._sort_formats(formats)
        return formats
 994
 995     @staticmethod
 996     def _xpath_ns(path, namespace=None):
 997         if not namespace:
 998             return path
 999         out = []
1000         for c in path.split('/'):
1001             if not c or c == '.':
1002                 out.append(c)
1003             else:
1004                 out.append('{%s}%s' % (namespace, c))
1005         return '/'.join(out)
1006
1007     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1008         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1009
1010         if smil is False:
1011             assert not fatal
1012             return []
1013
1014         namespace = self._parse_smil_namespace(smil)
1015
1016         return self._parse_smil_formats(
1017             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1018
1019     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1020         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1021         if smil is False:
1022             return {}
1023         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1024
1025     def _download_smil(self, smil_url, video_id, fatal=True):
1026         return self._download_xml(
1027             smil_url, video_id, 'Downloading SMIL file',
1028             'Unable to download SMIL file', fatal=fatal)
1029
1030     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1031         namespace = self._parse_smil_namespace(smil)
1032
1033         formats = self._parse_smil_formats(
1034             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1035         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1036
1037         video_id = os.path.splitext(url_basename(smil_url))[0]
1038         title = None
1039         description = None
1040         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1041             name = meta.attrib.get('name')
1042             content = meta.attrib.get('content')
1043             if not name or not content:
1044                 continue
1045             if not title and name == 'title':
1046                 title = content
1047             elif not description and name in ('description', 'abstract'):
1048                 description = content
1049
1050         return {
1051             'id': video_id,
1052             'title': title or video_id,
1053             'description': description,
1054             'formats': formats,
1055             'subtitles': subtitles,
1056         }
1057
1058     def _parse_smil_namespace(self, smil):
1059         return self._search_regex(
1060             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1061
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """ Build the sorted list of formats from the <video> elements of a SMIL document.

        smil:               parsed SMIL document (XML element).
        smil_url:           URL of the document; base for relative sources
                            unless a head meta element overrides it.
        video_id:           id passed on to the m3u8/f4m extraction helpers.
        namespace:          XML namespace of the document, if any.
        f4m_params:         query parameters appended to f4m manifest URLs;
                            a built-in default is used when empty.
        transform_rtmp_url: optional callable taking (streamer, src) and
                            returning the pair to use for RTMP formats.
        """
        # Base URL: the first 'base'/'httpBase' meta attribute wins over smil_url
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        # Counters used to number formats that declare no bitrate
        rtmp_count = 0
        http_count = 0

        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            if not src:
                continue

            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base

            # RTMP: the streamer is the URL and src the play path
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)

            # HLS playlist: delegate to the m3u8 extractor
            if proto == 'm3u8' or src_ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
                continue

            # HDS manifest: delegate to the f4m extractor
            if src_ext == 'f4m':
                f4m_url = src_url
                # The default assigned here stays in place for the remaining
                # <video> elements of this loop
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
                continue

            # Plain HTTP(S) download
            if src_url.startswith('http'):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        self._sort_formats(formats)

        return formats
1144
1145     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1146         subtitles = {}
1147         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1148             src = textstream.get('src')
1149             if not src:
1150                 continue
1151             ext = textstream.get('ext') or determine_ext(src)
1152             if not ext:
1153                 type_ = textstream.get('type')
1154                 SUBTITLES_TYPES = {
1155                     'text/vtt': 'vtt',
1156                     'text/srt': 'srt',
1157                     'application/smptett+xml': 'tt',
1158                 }
1159                 if type_ in SUBTITLES_TYPES:
1160                     ext = SUBTITLES_TYPES[type_]
1161             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1162             subtitles.setdefault(lang, []).append({
1163                 'url': src,
1164                 'ext': ext,
1165             })
1166         return subtitles
1167
1168     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1169         xspf = self._download_xml(
1170             playlist_url, playlist_id, 'Downloading xpsf playlist',
1171             'Unable to download xspf manifest', fatal=fatal)
1172         if xspf is False:
1173             return []
1174         return self._parse_xspf(xspf, playlist_id)
1175
1176     def _parse_xspf(self, playlist, playlist_id):
1177         NS_MAP = {
1178             'xspf': 'http://xspf.org/ns/0/',
1179             's1': 'http://static.streamone.nl/player/ns/0',
1180         }
1181
1182         entries = []
1183         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1184             title = xpath_text(
1185                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1186             description = xpath_text(
1187                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1188             thumbnail = xpath_text(
1189                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1190             duration = float_or_none(
1191                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1192
1193             formats = [{
1194                 'url': location.text,
1195                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1196                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1197                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1198             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1199             self._sort_formats(formats)
1200
1201             entries.append({
1202                 'id': playlist_id,
1203                 'title': title,
1204                 'description': description,
1205                 'thumbnail': thumbnail,
1206                 'duration': duration,
1207                 'formats': formats,
1208             })
1209         return entries
1210
1211     def _live_title(self, name):
1212         """ Generate the title for a live video """
1213         now = datetime.datetime.now()
1214         now_str = now.strftime("%Y-%m-%d %H:%M")
1215         return name + ' ' + now_str
1216
1217     def _int(self, v, name, fatal=False, **kwargs):
1218         res = int_or_none(v, **kwargs)
1219         if 'get_attr' in kwargs:
1220             print(getattr(v, kwargs['get_attr']))
1221         if res is None:
1222             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1223             if fatal:
1224                 raise ExtractorError(msg)
1225             else:
1226                 self._downloader.report_warning(msg)
1227         return res
1228
1229     def _float(self, v, name, fatal=False, **kwargs):
1230         res = float_or_none(v, **kwargs)
1231         if res is None:
1232             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1233             if fatal:
1234                 raise ExtractorError(msg)
1235             else:
1236                 self._downloader.report_warning(msg)
1237         return res
1238
1239     def _set_cookie(self, domain, name, value, expire_time=None):
1240         cookie = compat_cookiejar.Cookie(
1241             0, name, value, None, None, domain, None,
1242             None, '/', True, False, expire_time, '', None, None, None)
1243         self._downloader.cookiejar.set_cookie(cookie)
1244
1245     def _get_cookies(self, url):
1246         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1247         req = compat_urllib_request.Request(url)
1248         self._downloader.cookiejar.add_cookie_header(req)
1249         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1250
1251     def get_testcases(self, include_onlymatching=False):
1252         t = getattr(self, '_TEST', None)
1253         if t:
1254             assert not hasattr(self, '_TESTS'), \
1255                 '%s has _TEST and _TESTS' % type(self).__name__
1256             tests = [t]
1257         else:
1258             tests = getattr(self, '_TESTS', [])
1259         for t in tests:
1260             if not include_onlymatching and t.get('only_matching', False):
1261                 continue
1262             t['name'] = type(self).__name__[:-len('IE')]
1263             yield t
1264
1265     def is_suitable(self, age_limit):
1266         """ Test whether the extractor is generally suitable for the given
1267         age limit (i.e. pornographic sites are not, all others usually are) """
1268
1269         any_restricted = False
1270         for tc in self.get_testcases(include_onlymatching=False):
1271             if 'playlist' in tc:
1272                 tc = tc['playlist'][0]
1273             is_restricted = age_restricted(
1274                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1275             if not is_restricted:
1276                 return True
1277             any_restricted = any_restricted or is_restricted
1278         return not any_restricted
1279
1280     def extract_subtitles(self, *args, **kwargs):
1281         if (self._downloader.params.get('writesubtitles', False) or
1282                 self._downloader.params.get('listsubtitles')):
1283             return self._get_subtitles(*args, **kwargs)
1284         return {}
1285
1286     def _get_subtitles(self, *args, **kwargs):
1287         raise NotImplementedError("This method must be implemented by subclasses")
1288
1289     @staticmethod
1290     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1291         """ Merge subtitle items for one language. Items with duplicated URLs
1292         will be dropped. """
1293         list1_urls = set([item['url'] for item in subtitle_list1])
1294         ret = list(subtitle_list1)
1295         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1296         return ret
1297
1298     @classmethod
1299     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1300         """ Merge two subtitle dictionaries, language by language. """
1301         ret = dict(subtitle_dict1)
1302         for lang in subtitle_dict2:
1303             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1304         return ret
1305
1306     def extract_automatic_captions(self, *args, **kwargs):
1307         if (self._downloader.params.get('writeautomaticsub', False) or
1308                 self._downloader.params.get('listsubtitles')):
1309             return self._get_automatic_captions(*args, **kwargs)
1310         return {}
1311
1312     def _get_automatic_captions(self, *args, **kwargs):
1313         raise NotImplementedError("This method must be implemented by subclasses")
1314
1315
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """ Regex for search pseudo-URLs, with 'prefix' and 'query' groups. """
        url_template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return url_template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """ Tell whether url is a search pseudo-URL of this extractor. """
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """ Parse the search pseudo-URL and fetch the requested number of results.

        An empty prefix asks for 1 result, 'all' for _MAX_RESULTS and a
        number for that many (capped at _MAX_RESULTS with a warning).
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = "This method must be implemented by subclasses"
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """ Read-only view of _SEARCH_KEY. """
        return self._SEARCH_KEY