git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_getpass,
  19     compat_HTTPError,
  20     compat_http_client,
  21     compat_urllib_error,
  22     compat_urllib_parse,
  23     compat_urllib_parse_urlparse,
  24     compat_urllib_request,
  25     compat_urlparse,
  26     compat_str,
  27 )
  28 from ..utils import (
  29     NO_DEFAULT,
  30     age_restricted,
  31     bug_reports_message,
  32     clean_html,
  33     compiled_regex_type,
  34     determine_ext,
  35     ExtractorError,
  36     fix_xml_ampersands,
  37     float_or_none,
  38     int_or_none,
  39     RegexNotFoundError,
  40     sanitize_filename,
  41     unescapeHTML,
  42     url_basename,
  43     xpath_text,
  44     xpath_with_ns,
  45 )
  46
  47
  48 class InfoExtractor(object):
  49     """Information Extractor class.
  50
  51     Information extractors are the classes that, given a URL, extract
  52     information about the video (or videos) the URL refers to. This
  53     information includes the real video URL, the video title, author and
  54     others. The information is stored in a dictionary which is then
  55     passed to the YoutubeDL. The YoutubeDL processes this
  56     information possibly downloading the video to the file system, among
  57     other possible outcomes.
  58
  59     The type field determines the type of the result.
  60     By far the most common value (and the default if _type is missing) is
  61     "video", which indicates a single video.
  62
  63     For a video, the dictionaries must include the following fields:
  64
  65     id:             Video identifier.
  66     title:          Video title, unescaped.
  67
  68     Additionally, it must contain either a formats entry or a url one:
  69
  70     formats:        A list of dictionaries for each format available, ordered
  71                     from worst to best quality.
  72
  73                     Potential fields:
  74                     * url        Mandatory. The URL of the video file
  75                     * ext        Will be calculated from URL if missing
  76                     * format     A human-readable description of the format
  77                                  ("mp4 container with h264/opus").
  78                                  Calculated from the format_id, width, height.
  79                                  and format_note fields if missing.
  80                     * format_id  A short description of the format
  81                                  ("mp4_h264_opus" or "19").
  82                                 Technically optional, but strongly recommended.
  83                     * format_note Additional info about the format
  84                                  ("3D" or "DASH video")
  85                     * width      Width of the video, if known
  86                     * height     Height of the video, if known
  87                     * resolution Textual description of width and height
  88                     * tbr        Average bitrate of audio and video in KBit/s
  89                     * abr        Average audio bitrate in KBit/s
  90                     * acodec     Name of the audio codec in use
  91                     * asr        Audio sampling rate in Hertz
  92                     * vbr        Average video bitrate in KBit/s
  93                     * fps        Frame rate
  94                     * vcodec     Name of the video codec in use
  95                     * container  Name of the container format
  96                     * filesize   The number of bytes, if known in advance
  97                     * filesize_approx  An estimate for the number of bytes
  98                     * player_url SWF Player URL (used for rtmpdump).
  99                     * protocol   The protocol that will be used for the actual
 100                                  download, lower-case.
 101                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 102                                  "m3u8", or "m3u8_native".
 103                     * preference Order number of this format. If this field is
 104                                  present and not None, the formats get sorted
 105                                  by this field, regardless of all other values.
 106                                  -1 for default (order by other properties),
 107                                  -2 or smaller for less than default.
 108                                  < -1000 to hide the format (if there is
 109                                     another one which is strictly better)
 110                     * language_preference  Is this in the correct requested
 111                                  language?
 112                                  10 if it's what the URL is about,
 113                                  -1 for default (don't know),
 114                                  -10 otherwise, other values reserved for now.
 115                     * quality    Order number of the video quality of this
 116                                  format, irrespective of the file format.
 117                                  -1 for default (order by other properties),
 118                                  -2 or smaller for less than default.
 119                     * source_preference  Order number for this video source
 120                                   (quality takes higher priority)
 121                                  -1 for default (order by other properties),
 122                                  -2 or smaller for less than default.
 123                     * http_headers  A dictionary of additional HTTP headers
 124                                  to add to the request.
 125                     * stretched_ratio  If given and not 1, indicates that the
 126                                  video's pixels are not square.
 127                                  width : height ratio as float.
 128                     * no_resume  The server does not support resuming the
 129                                  (HTTP or RTMP) download. Boolean.
 130
 131     url:            Final video URL.
 132     ext:            Video filename extension.
 133     format:         The video format, defaults to ext (used for --get-format)
 134     player_url:     SWF Player URL (used for rtmpdump).
 135
 136     The following fields are optional:
 137
 138     alt_title:      A secondary title of the video.
 139     display_id      An alternative identifier for the video, not necessarily
 140                     unique, but available before title. Typically, id is
 141                     something like "4234987", title "Dancing naked mole rats",
 142                     and display_id "dancing-naked-mole-rats"
 143     thumbnails:     A list of dictionaries, with the following entries:
 144                         * "id" (optional, string) - Thumbnail format ID
 145                         * "url"
 146                         * "preference" (optional, int) - quality of the image
 147                         * "width" (optional, int)
 148                         * "height" (optional, int)
 149                         * "resolution" (optional, string "{width}x{height"},
 150                                         deprecated)
 151     thumbnail:      Full URL to a video thumbnail image.
 152     description:    Full video description.
 153     uploader:       Full name of the video uploader.
 154     creator:        The main artist who created the video.
 155     timestamp:      UNIX timestamp of the moment the video became available.
 156     upload_date:    Video upload date (YYYYMMDD).
 157                     If not explicitly set, calculated from timestamp.
 158     uploader_id:    Nickname or id of the video uploader.
 159     location:       Physical location where the video was filmed.
 160     subtitles:      The available subtitles as a dictionary in the format
 161                     {language: subformats}. "subformats" is a list sorted from
 162                     lower to higher preference, each element is a dictionary
 163                     with the "ext" entry and one of:
 164                         * "data": The subtitles file contents
 165                         * "url": A URL pointing to the subtitles file
 166     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 167                     automatically generated captions
 168     duration:       Length of the video in seconds, as an integer.
 169     view_count:     How many users have watched the video on the platform.
 170     like_count:     Number of positive ratings of the video
 171     dislike_count:  Number of negative ratings of the video
 172     average_rating: Average rating give by users, the scale used depends on the webpage
 173     comment_count:  Number of comments on the video
 174     comments:       A list of comments, each with one or more of the following
 175                     properties (all but one of text or html optional):
 176                         * "author" - human-readable name of the comment author
 177                         * "author_id" - user ID of the comment author
 178                         * "id" - Comment ID
 179                         * "html" - Comment as HTML
 180                         * "text" - Plain text of the comment
 181                         * "timestamp" - UNIX timestamp of comment
 182                         * "parent" - ID of the comment this one is replying to.
 183                                      Set to "root" to indicate that this is a
 184                                      comment to the original video.
 185     age_limit:      Age restriction for the video, as an integer (years)
 186     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 187                     should allow to get the same result again. (It will be set
 188                     by YoutubeDL if it's missing)
 189     categories:     A list of categories that the video falls in, for example
 190                     ["Sports", "Berlin"]
 191     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 192     is_live:        True, False, or None (=unknown). Whether this video is a
 193                     live stream that goes on instead of a fixed-length video.
 194     start_time:     Time in seconds where the reproduction should start, as
 195                     specified in the URL.
 196     end_time:       Time in seconds where the reproduction should end, as
 197                     specified in the URL.
 198
 199     Unless mentioned otherwise, the fields should be Unicode strings.
 200
 201     Unless mentioned otherwise, None is equivalent to absence of information.
 202
 203
 204     _type "playlist" indicates multiple videos.
 205     There must be a key "entries", which is a list, an iterable, or a PagedList
 206     object, each element of which is a valid dictionary by this specification.
 207
 208     Additionally, playlists can have "title", "description" and "id" attributes
 209     with the same semantics as videos (see above).
 210
 211
 212     _type "multi_video" indicates that there are multiple videos that
 213     form a single show, for examples multiple acts of an opera or TV episode.
 214     It must have an entries key like a playlist and contain all the keys
 215     required for a video at the same time.
 216
 217
 218     _type "url" indicates that the video must be extracted from another
 219     location, possibly by a different extractor. Its only required key is:
 220     "url" - the next URL to extract.
 221     The key "ie_key" can be set to the class name (minus the trailing "IE",
 222     e.g. "Youtube") if the extractor class is known in advance.
 223     Additionally, the dictionary may have any properties of the resolved entity
 224     known in advance, for example "title" if the title of the referred video is
 225     known ahead of time.
 226
 227
 228     _type "url_transparent" entities have the same specification as "url", but
 229     indicate that the given additional information is more precise than the one
 230     associated with the resolved URL.
 231     This is useful when a site employs a video service that hosts the video and
 232     its technical metadata, but that video service does not embed a useful
 233     title, description etc.
 234
 235
 236     Subclasses of this one should re-define the _real_initialize() and
 237     _real_extract() methods and define a _VALID_URL regexp.
 238     Probably, they should also be added to the list of extractors.
 239
 240     Finally, the _WORKING attribute should be set to False for broken IEs
 241     in order to warn the users and skip the tests.
 242     """
 243
 244     _ready = False
 245     _downloader = None
 246     _WORKING = True
 247
 248     def __init__(self, downloader=None):
 249         """Constructor. Receives an optional downloader."""
 250         self._ready = False
 251         self.set_downloader(downloader)
 252
 253     @classmethod
 254     def suitable(cls, url):
 255         """Receives a URL and returns True if suitable for this IE."""
 256
 257         # This does not use has/getattr intentionally - we want to know whether
 258         # we have cached the regexp for *this* class, whereas getattr would also
 259         # match the superclass
 260         if '_VALID_URL_RE' not in cls.__dict__:
 261             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 262         return cls._VALID_URL_RE.match(url) is not None
 263
 264     @classmethod
 265     def _match_id(cls, url):
 266         if '_VALID_URL_RE' not in cls.__dict__:
 267             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 268         m = cls._VALID_URL_RE.match(url)
 269         assert m
 270         return m.group('id')
 271
 272     @classmethod
 273     def working(cls):
 274         """Getter method for _WORKING."""
 275         return cls._WORKING
 276
 277     def initialize(self):
 278         """Initializes an instance (authentication, etc)."""
 279         if not self._ready:
 280             self._real_initialize()
 281             self._ready = True
 282
 283     def extract(self, url):
 284         """Extracts URL information and returns it in list of dicts."""
 285         try:
 286             self.initialize()
 287             return self._real_extract(url)
 288         except ExtractorError:
 289             raise
 290         except compat_http_client.IncompleteRead as e:
 291             raise ExtractorError('A network error has occured.', cause=e, expected=True)
 292         except (KeyError, StopIteration) as e:
 293             raise ExtractorError('An extractor error has occured.', cause=e)
 294
 295     def set_downloader(self, downloader):
 296         """Sets the downloader for this IE."""
 297         self._downloader = downloader
 298
 299     def _real_initialize(self):
 300         """Real initialization process. Redefine in subclasses."""
 301         pass
 302
 303     def _real_extract(self, url):
 304         """Real extraction process. Redefine in subclasses."""
 305         pass
 306
 307     @classmethod
 308     def ie_key(cls):
 309         """A string for getting the InfoExtractor with get_info_extractor"""
 310         return cls.__name__[:-2]
 311
 312     @property
 313     def IE_NAME(self):
 314         return type(self).__name__[:-2]
 315
 316     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 317         """ Returns the response handle """
 318         if note is None:
 319             self.report_download_webpage(video_id)
 320         elif note is not False:
 321             if video_id is None:
 322                 self.to_screen('%s' % (note,))
 323             else:
 324                 self.to_screen('%s: %s' % (video_id, note))
 325         try:
 326             return self._downloader.urlopen(url_or_request)
 327         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 328             if errnote is False:
 329                 return False
 330             if errnote is None:
 331                 errnote = 'Unable to download webpage'
 332             errmsg = '%s: %s' % (errnote, compat_str(err))
 333             if fatal:
 334                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 335             else:
 336                 self._downloader.report_warning(errmsg)
 337                 return False
 338
 339     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 340         """ Returns a tuple (page content as string, URL handle) """
 341         # Strip hashes from the URL (#1038)
 342         if isinstance(url_or_request, (compat_str, str)):
 343             url_or_request = url_or_request.partition('#')[0]
 344
 345         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 346         if urlh is False:
 347             assert not fatal
 348             return False
 349         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 350         return (content, urlh)
 351
 352     @staticmethod
 353     def _guess_encoding_from_content(content_type, webpage_bytes):
 354         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 355         if m:
 356             encoding = m.group(1)
 357         else:
 358             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 359                           webpage_bytes[:1024])
 360             if m:
 361                 encoding = m.group(1).decode('ascii')
 362             elif webpage_bytes.startswith(b'\xff\xfe'):
 363                 encoding = 'utf-16'
 364             else:
 365                 encoding = 'utf-8'
 366
 367         return encoding
 368
 369     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 370         content_type = urlh.headers.get('Content-Type', '')
 371         webpage_bytes = urlh.read()
 372         if prefix is not None:
 373             webpage_bytes = prefix + webpage_bytes
 374         if not encoding:
 375             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 376         if self._downloader.params.get('dump_intermediate_pages', False):
 377             try:
 378                 url = url_or_request.get_full_url()
 379             except AttributeError:
 380                 url = url_or_request
 381             self.to_screen('Dumping request to ' + url)
 382             dump = base64.b64encode(webpage_bytes).decode('ascii')
 383             self._downloader.to_screen(dump)
 384         if self._downloader.params.get('write_pages', False):
 385             try:
 386                 url = url_or_request.get_full_url()
 387             except AttributeError:
 388                 url = url_or_request
 389             basen = '%s_%s' % (video_id, url)
 390             if len(basen) > 240:
 391                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 392                 basen = basen[:240 - len(h)] + h
 393             raw_filename = basen + '.dump'
 394             filename = sanitize_filename(raw_filename, restricted=True)
 395             self.to_screen('Saving request to ' + filename)
 396             # Working around MAX_PATH limitation on Windows (see
 397             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 398             if os.name == 'nt':
 399                 absfilepath = os.path.abspath(filename)
 400                 if len(absfilepath) > 259:
 401                     filename = '\\\\?\\' + absfilepath
 402             with open(filename, 'wb') as outf:
 403                 outf.write(webpage_bytes)
 404
 405         try:
 406             content = webpage_bytes.decode(encoding, 'replace')
 407         except LookupError:
 408             content = webpage_bytes.decode('utf-8', 'replace')
 409
 410         if ('<title>Access to this site is blocked</title>' in content and
 411                 'Websense' in content[:512]):
 412             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 413             blocked_iframe = self._html_search_regex(
 414                 r'<iframe src="([^"]+)"', content,
 415                 'Websense information URL', default=None)
 416             if blocked_iframe:
 417                 msg += ' Visit %s for more details' % blocked_iframe
 418             raise ExtractorError(msg, expected=True)
 419         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 420             msg = (
 421                 'Access to this webpage has been blocked by Indian censorship. '
 422                 'Use a VPN or proxy server (with --proxy) to route around it.')
 423             block_msg = self._html_search_regex(
 424                 r'</h1><p>(.*?)</p>',
 425                 content, 'block message', default=None)
 426             if block_msg:
 427                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 428             raise ExtractorError(msg, expected=True)
 429
 430         return content
 431
 432     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 433         """ Returns the data of the page as a string """
 434         success = False
 435         try_count = 0
 436         while success is False:
 437             try:
 438                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 439                 success = True
 440             except compat_http_client.IncompleteRead as e:
 441                 try_count += 1
 442                 if try_count >= tries:
 443                     raise e
 444                 self._sleep(timeout, video_id)
 445         if res is False:
 446             return res
 447         else:
 448             content, _ = res
 449             return content
 450
 451     def _download_xml(self, url_or_request, video_id,
 452                       note='Downloading XML', errnote='Unable to download XML',
 453                       transform_source=None, fatal=True, encoding=None):
 454         """Return the xml as an xml.etree.ElementTree.Element"""
 455         xml_string = self._download_webpage(
 456             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 457         if xml_string is False:
 458             return xml_string
 459         if transform_source:
 460             xml_string = transform_source(xml_string)
 461         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 462
 463     def _download_json(self, url_or_request, video_id,
 464                        note='Downloading JSON metadata',
 465                        errnote='Unable to download JSON metadata',
 466                        transform_source=None,
 467                        fatal=True, encoding=None):
 468         json_string = self._download_webpage(
 469             url_or_request, video_id, note, errnote, fatal=fatal,
 470             encoding=encoding)
 471         if (not fatal) and json_string is False:
 472             return None
 473         return self._parse_json(
 474             json_string, video_id, transform_source=transform_source, fatal=fatal)
 475
 476     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 477         if transform_source:
 478             json_string = transform_source(json_string)
 479         try:
 480             return json.loads(json_string)
 481         except ValueError as ve:
 482             errmsg = '%s: Failed to parse JSON ' % video_id
 483             if fatal:
 484                 raise ExtractorError(errmsg, cause=ve)
 485             else:
 486                 self.report_warning(errmsg + str(ve))
 487
 488     def report_warning(self, msg, video_id=None):
 489         idstr = '' if video_id is None else '%s: ' % video_id
 490         self._downloader.report_warning(
 491             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 492
 493     def to_screen(self, msg):
 494         """Print msg to screen, prefixing it with '[ie_name]'"""
 495         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 496
 497     def report_extraction(self, id_or_name):
 498         """Report information extraction."""
 499         self.to_screen('%s: Extracting information' % id_or_name)
 500
 501     def report_download_webpage(self, video_id):
 502         """Report webpage download."""
 503         self.to_screen('%s: Downloading webpage' % video_id)
 504
 505     def report_age_confirmation(self):
 506         """Report attempt to confirm age."""
 507         self.to_screen('Confirming age')
 508
 509     def report_login(self):
 510         """Report attempt to log in."""
 511         self.to_screen('Logging in')
 512
 513     # Methods for following #608
 514     @staticmethod
 515     def url_result(url, ie=None, video_id=None, video_title=None):
 516         """Returns a URL that points to a page that should be processed"""
 517         # TODO: ie should be the class used for getting the info
 518         video_info = {'_type': 'url',
 519                       'url': url,
 520                       'ie_key': ie}
 521         if video_id is not None:
 522             video_info['id'] = video_id
 523         if video_title is not None:
 524             video_info['title'] = video_title
 525         return video_info
 526
 527     @staticmethod
 528     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 529         """Returns a playlist"""
 530         video_info = {'_type': 'playlist',
 531                       'entries': entries}
 532         if playlist_id:
 533             video_info['id'] = playlist_id
 534         if playlist_title:
 535             video_info['title'] = playlist_title
 536         if playlist_description:
 537             video_info['description'] = playlist_description
 538         return video_info
 539
 540     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 541         """
 542         Perform a regex search on the given string, using a single or a list of
 543         patterns returning the first matching group.
 544         In case of failure return a default value or raise a WARNING or a
 545         RegexNotFoundError, depending on fatal, specifying the field name.
 546         """
 547         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 548             mobj = re.search(pattern, string, flags)
 549         else:
 550             for p in pattern:
 551                 mobj = re.search(p, string, flags)
 552                 if mobj:
 553                     break
 554
 555         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 556             _name = '\033[0;34m%s\033[0m' % name
 557         else:
 558             _name = name
 559
 560         if mobj:
 561             if group is None:
 562                 # return the first matching group
 563                 return next(g for g in mobj.groups() if g is not None)
 564             else:
 565                 return mobj.group(group)
 566         elif default is not NO_DEFAULT:
 567             return default
 568         elif fatal:
 569             raise RegexNotFoundError('Unable to extract %s' % _name)
 570         else:
 571             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 572             return None
 573
 574     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 575         """
 576         Like _search_regex, but strips HTML tags and unescapes entities.
 577         """
 578         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 579         if res:
 580             return clean_html(res).strip()
 581         else:
 582             return res
 583
 584     def _get_login_info(self):
 585         """
 586         Get the login info as (username, password)
 587         It will look in the netrc file using the _NETRC_MACHINE value
 588         If there's no info available, return (None, None)
 589         """
 590         if self._downloader is None:
 591             return (None, None)
 592
 593         username = None
 594         password = None
 595         downloader_params = self._downloader.params
 596
 597         # Attempt to use provided username and password or .netrc data
 598         if downloader_params.get('username', None) is not None:
 599             username = downloader_params['username']
 600             password = downloader_params['password']
 601         elif downloader_params.get('usenetrc', False):
 602             try:
 603                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 604                 if info is not None:
 605                     username = info[0]
 606                     password = info[2]
 607                 else:
 608                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 609             except (IOError, netrc.NetrcParseError) as err:
 610                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 611
 612         return (username, password)
 613
 614     def _get_tfa_info(self, note='two-factor verification code'):
 615         """
 616         Get the two-factor authentication info
 617         TODO - asking the user will be required for sms/phone verify
 618         currently just uses the command line option
 619         If there's no info available, return None
 620         """
 621         if self._downloader is None:
 622             return None
 623         downloader_params = self._downloader.params
 624
 625         if downloader_params.get('twofactor', None) is not None:
 626             return downloader_params['twofactor']
 627
 628         return compat_getpass('Type %s and press [Return]: ' % note)
 629
 630     # Helper functions for extracting OpenGraph info
 631     @staticmethod
 632     def _og_regexes(prop):
 633         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 634         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 635         template = r'<meta[^>]+?%s[^>]+?%s'
 636         return [
 637             template % (property_re, content_re),
 638             template % (content_re, property_re),
 639         ]
 640
 641     @staticmethod
 642     def _meta_regex(prop):
 643         return r'''(?isx)<meta
 644                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 645                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 646
 647     def _og_search_property(self, prop, html, name=None, **kargs):
 648         if name is None:
 649             name = 'OpenGraph %s' % prop
 650         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 651         if escaped is None:
 652             return None
 653         return unescapeHTML(escaped)
 654
 655     def _og_search_thumbnail(self, html, **kargs):
 656         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 657
 658     def _og_search_description(self, html, **kargs):
 659         return self._og_search_property('description', html, fatal=False, **kargs)
 660
 661     def _og_search_title(self, html, **kargs):
 662         return self._og_search_property('title', html, **kargs)
 663
 664     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 665         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 666         if secure:
 667             regexes = self._og_regexes('video:secure_url') + regexes
 668         return self._html_search_regex(regexes, html, name, **kargs)
 669
 670     def _og_search_url(self, html, **kargs):
 671         return self._og_search_property('url', html, **kargs)
 672
 673     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 674         if display_name is None:
 675             display_name = name
 676         return self._html_search_regex(
 677             self._meta_regex(name),
 678             html, display_name, fatal=fatal, group='content', **kwargs)
 679
 680     def _dc_search_uploader(self, html):
 681         return self._html_search_meta('dc.creator', html, 'uploader')
 682
 683     def _rta_search(self, html):
 684         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 685         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 686                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 687                      html):
 688             return 18
 689         return 0
 690
 691     def _media_rating_search(self, html):
 692         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 693         rating = self._html_search_meta('rating', html)
 694
 695         if not rating:
 696             return None
 697
 698         RATING_TABLE = {
 699             'safe for kids': 0,
 700             'general': 8,
 701             '14 years': 14,
 702             'mature': 17,
 703             'restricted': 19,
 704         }
 705         return RATING_TABLE.get(rating.lower(), None)
 706
 707     def _family_friendly_search(self, html):
 708         # See http://schema.org/VideoObject
 709         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 710
 711         if not family_friendly:
 712             return None
 713
 714         RATING_TABLE = {
 715             '1': 0,
 716             'true': 0,
 717             '0': 18,
 718             'false': 18,
 719         }
 720         return RATING_TABLE.get(family_friendly.lower(), None)
 721
 722     def _twitter_search_player(self, html):
 723         return self._html_search_meta('twitter:player', html,
 724                                       'twitter card player')
 725
 726     @staticmethod
 727     def _hidden_inputs(html):
 728         hidden_inputs = {}
 729         for input in re.findall(r'<input([^>]+)>', html):
 730             if not re.search(r'type=(["\'])hidden\1', input):
 731                 continue
 732             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 733             if not name:
 734                 continue
 735             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 736             if not value:
 737                 continue
 738             hidden_inputs[name.group('value')] = value.group('value')
 739         return hidden_inputs
 740
 741     def _form_hidden_inputs(self, form_id, html):
 742         form = self._search_regex(
 743             r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 744             html, '%s form' % form_id, group='form')
 745         return self._hidden_inputs(form)
 746
 747     def _sort_formats(self, formats, field_preference=None):
 748         if not formats:
 749             raise ExtractorError('No video formats found')
 750
 751         def _formats_key(f):
 752             # TODO remove the following workaround
 753             from ..utils import determine_ext
 754             if not f.get('ext') and 'url' in f:
 755                 f['ext'] = determine_ext(f['url'])
 756
 757             if isinstance(field_preference, (list, tuple)):
 758                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 759
 760             preference = f.get('preference')
 761             if preference is None:
 762                 proto = f.get('protocol')
 763                 if proto is None:
 764                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 765
 766                 preference = 0 if proto in ['http', 'https'] else -0.1
 767                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 768                     preference -= 0.5
 769
 770             if f.get('vcodec') == 'none':  # audio only
 771                 if self._downloader.params.get('prefer_free_formats'):
 772                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 773                 else:
 774                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 775                 ext_preference = 0
 776                 try:
 777                     audio_ext_preference = ORDER.index(f['ext'])
 778                 except ValueError:
 779                     audio_ext_preference = -1
 780             else:
 781                 if self._downloader.params.get('prefer_free_formats'):
 782                     ORDER = ['flv', 'mp4', 'webm']
 783                 else:
 784                     ORDER = ['webm', 'flv', 'mp4']
 785                 try:
 786                     ext_preference = ORDER.index(f['ext'])
 787                 except ValueError:
 788                     ext_preference = -1
 789                 audio_ext_preference = 0
 790
 791             return (
 792                 preference,
 793                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 794                 f.get('quality') if f.get('quality') is not None else -1,
 795                 f.get('tbr') if f.get('tbr') is not None else -1,
 796                 f.get('filesize') if f.get('filesize') is not None else -1,
 797                 f.get('vbr') if f.get('vbr') is not None else -1,
 798                 f.get('height') if f.get('height') is not None else -1,
 799                 f.get('width') if f.get('width') is not None else -1,
 800                 ext_preference,
 801                 f.get('abr') if f.get('abr') is not None else -1,
 802                 audio_ext_preference,
 803                 f.get('fps') if f.get('fps') is not None else -1,
 804                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 805                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 806                 f.get('format_id') if f.get('format_id') is not None else '',
 807             )
 808         formats.sort(key=_formats_key)
 809
 810     def _check_formats(self, formats, video_id):
 811         if formats:
 812             formats[:] = filter(
 813                 lambda f: self._is_valid_url(
 814                     f['url'], video_id,
 815                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 816                 formats)
 817
 818     def _is_valid_url(self, url, video_id, item='video'):
 819         url = self._proto_relative_url(url, scheme='http:')
 820         # For now assume non HTTP(S) URLs always valid
 821         if not (url.startswith('http://') or url.startswith('https://')):
 822             return True
 823         try:
 824             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 825             return True
 826         except ExtractorError as e:
 827             if isinstance(e.cause, compat_HTTPError):
 828                 self.to_screen(
 829                     '%s: %s URL is invalid, skipping' % (video_id, item))
 830                 return False
 831             raise
 832
 833     def http_scheme(self):
 834         """ Either "http:" or "https:", depending on the user's preferences """
 835         return (
 836             'http:'
 837             if self._downloader.params.get('prefer_insecure', False)
 838             else 'https:')
 839
 840     def _proto_relative_url(self, url, scheme=None):
 841         if url is None:
 842             return url
 843         if url.startswith('//'):
 844             if scheme is None:
 845                 scheme = self.http_scheme()
 846             return scheme + url
 847         else:
 848             return url
 849
 850     def _sleep(self, timeout, video_id, msg_template=None):
 851         if msg_template is None:
 852             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 853         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 854         self.to_screen(msg)
 855         time.sleep(timeout)
 856
 857     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 858                              transform_source=lambda s: fix_xml_ampersands(s).strip()):
 859         manifest = self._download_xml(
 860             manifest_url, video_id, 'Downloading f4m manifest',
 861             'Unable to download f4m manifest',
 862             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 863             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 864             transform_source=transform_source)
 865
 866         formats = []
 867         manifest_version = '1.0'
 868         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 869         if not media_nodes:
 870             manifest_version = '2.0'
 871             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 872         for i, media_el in enumerate(media_nodes):
 873             if manifest_version == '2.0':
 874                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 875                 if not media_url:
 876                     continue
 877                 manifest_url = (
 878                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 879                     else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
 880                 # If media_url is itself a f4m manifest do the recursive extraction
 881                 # since bitrates in parent manifest (this one) and media_url manifest
 882                 # may differ leading to inability to resolve the format by requested
 883                 # bitrate in f4m downloader
 884                 if determine_ext(manifest_url) == 'f4m':
 885                     formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
 886                     continue
 887             tbr = int_or_none(media_el.attrib.get('bitrate'))
 888             formats.append({
 889                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 890                 'url': manifest_url,
 891                 'ext': 'flv',
 892                 'tbr': tbr,
 893                 'width': int_or_none(media_el.attrib.get('width')),
 894                 'height': int_or_none(media_el.attrib.get('height')),
 895                 'preference': preference,
 896             })
 897         self._sort_formats(formats)
 898
 899         return formats
 900
 901     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 902                               entry_protocol='m3u8', preference=None,
 903                               m3u8_id=None, note=None, errnote=None,
 904                               fatal=True):
 905
 906         formats = [{
 907             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
 908             'url': m3u8_url,
 909             'ext': ext,
 910             'protocol': 'm3u8',
 911             'preference': preference - 1 if preference else -1,
 912             'resolution': 'multiple',
 913             'format_note': 'Quality selection URL',
 914         }]
 915
 916         format_url = lambda u: (
 917             u
 918             if re.match(r'^https?://', u)
 919             else compat_urlparse.urljoin(m3u8_url, u))
 920
 921         m3u8_doc = self._download_webpage(
 922             m3u8_url, video_id,
 923             note=note or 'Downloading m3u8 information',
 924             errnote=errnote or 'Failed to download m3u8 information',
 925             fatal=fatal)
 926         if m3u8_doc is False:
 927             return m3u8_doc
 928         last_info = None
 929         last_media = None
 930         kv_rex = re.compile(
 931             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 932         for line in m3u8_doc.splitlines():
 933             if line.startswith('#EXT-X-STREAM-INF:'):
 934                 last_info = {}
 935                 for m in kv_rex.finditer(line):
 936                     v = m.group('val')
 937                     if v.startswith('"'):
 938                         v = v[1:-1]
 939                     last_info[m.group('key')] = v
 940             elif line.startswith('#EXT-X-MEDIA:'):
 941                 last_media = {}
 942                 for m in kv_rex.finditer(line):
 943                     v = m.group('val')
 944                     if v.startswith('"'):
 945                         v = v[1:-1]
 946                     last_media[m.group('key')] = v
 947             elif line.startswith('#') or not line.strip():
 948                 continue
 949             else:
 950                 if last_info is None:
 951                     formats.append({'url': format_url(line)})
 952                     continue
 953                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 954                 format_id = []
 955                 if m3u8_id:
 956                     format_id.append(m3u8_id)
 957                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
 958                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
 959                 f = {
 960                     'format_id': '-'.join(format_id),
 961                     'url': format_url(line.strip()),
 962                     'tbr': tbr,
 963                     'ext': ext,
 964                     'protocol': entry_protocol,
 965                     'preference': preference,
 966                 }
 967                 codecs = last_info.get('CODECS')
 968                 if codecs:
 969                     # TODO: looks like video codec is not always necessarily goes first
 970                     va_codecs = codecs.split(',')
 971                     if va_codecs[0]:
 972                         f['vcodec'] = va_codecs[0].partition('.')[0]
 973                     if len(va_codecs) > 1 and va_codecs[1]:
 974                         f['acodec'] = va_codecs[1].partition('.')[0]
 975                 resolution = last_info.get('RESOLUTION')
 976                 if resolution:
 977                     width_str, height_str = resolution.split('x')
 978                     f['width'] = int(width_str)
 979                     f['height'] = int(height_str)
 980                 if last_media is not None:
 981                     f['m3u8_media'] = last_media
 982                     last_media = None
 983                 formats.append(f)
 984                 last_info = {}
 985         self._sort_formats(formats)
 986         return formats
 987
 988     @staticmethod
 989     def _xpath_ns(path, namespace=None):
 990         if not namespace:
 991             return path
 992         out = []
 993         for c in path.split('/'):
 994             if not c or c == '.':
 995                 out.append(c)
 996             else:
 997                 out.append('{%s}%s' % (namespace, c))
 998         return '/'.join(out)
 999
1000     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1001         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1002
1003         if smil is False:
1004             assert not fatal
1005             return []
1006
1007         namespace = self._parse_smil_namespace(smil)
1008
1009         return self._parse_smil_formats(
1010             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1011
1012     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1013         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1014         if smil is False:
1015             return {}
1016         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1017
1018     def _download_smil(self, smil_url, video_id, fatal=True):
1019         return self._download_xml(
1020             smil_url, video_id, 'Downloading SMIL file',
1021             'Unable to download SMIL file', fatal=fatal)
1022
1023     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1024         namespace = self._parse_smil_namespace(smil)
1025
1026         formats = self._parse_smil_formats(
1027             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1028         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1029
1030         video_id = os.path.splitext(url_basename(smil_url))[0]
1031         title = None
1032         description = None
1033         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1034             name = meta.attrib.get('name')
1035             content = meta.attrib.get('content')
1036             if not name or not content:
1037                 continue
1038             if not title and name == 'title':
1039                 title = content
1040             elif not description and name in ('description', 'abstract'):
1041                 description = content
1042
1043         return {
1044             'id': video_id,
1045             'title': title or video_id,
1046             'description': description,
1047             'formats': formats,
1048             'subtitles': subtitles,
1049         }
1050
1051     def _parse_smil_namespace(self, smil):
1052         return self._search_regex(
1053             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1054
1055     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1056         base = smil_url
1057         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1058             b = meta.get('base') or meta.get('httpBase')
1059             if b:
1060                 base = b
1061                 break
1062
1063         formats = []
1064         rtmp_count = 0
1065         http_count = 0
1066
1067         videos = smil.findall(self._xpath_ns('.//video', namespace))
1068         for video in videos:
1069             src = video.get('src')
1070             if not src:
1071                 continue
1072
1073             bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1074             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1075             width = int_or_none(video.get('width'))
1076             height = int_or_none(video.get('height'))
1077             proto = video.get('proto')
1078             ext = video.get('ext')
1079             src_ext = determine_ext(src)
1080             streamer = video.get('streamer') or base
1081
1082             if proto == 'rtmp' or streamer.startswith('rtmp'):
1083                 rtmp_count += 1
1084                 formats.append({
1085                     'url': streamer,
1086                     'play_path': src,
1087                     'ext': 'flv',
1088                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1089                     'tbr': bitrate,
1090                     'filesize': filesize,
1091                     'width': width,
1092                     'height': height,
1093                 })
1094                 if transform_rtmp_url:
1095                     streamer, src = transform_rtmp_url(streamer, src)
1096                     formats[-1].update({
1097                         'url': streamer,
1098                         'play_path': src,
1099                     })
1100                 continue
1101
1102             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1103
1104             if proto == 'm3u8' or src_ext == 'm3u8':
1105                 formats.extend(self._extract_m3u8_formats(
1106                     src_url, video_id, ext or 'mp4', m3u8_id='hls'))
1107                 continue
1108
1109             if src_ext == 'f4m':
1110                 f4m_url = src_url
1111                 if not f4m_params:
1112                     f4m_params = {
1113                         'hdcore': '3.2.0',
1114                         'plugin': 'flowplayer-3.2.0.1',
1115                     }
1116                 f4m_url += '&' if '?' in f4m_url else '?'
1117                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1118                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
1119                 continue
1120
1121             if src_url.startswith('http'):
1122                 http_count += 1
1123                 formats.append({
1124                     'url': src_url,
1125                     'ext': ext or src_ext or 'flv',
1126                     'format_id': 'http-%d' % (bitrate or http_count),
1127                     'tbr': bitrate,
1128                     'filesize': filesize,
1129                     'width': width,
1130                     'height': height,
1131                 })
1132                 continue
1133
1134         self._sort_formats(formats)
1135
1136         return formats
1137
1138     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1139         subtitles = {}
1140         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1141             src = textstream.get('src')
1142             if not src:
1143                 continue
1144             ext = textstream.get('ext') or determine_ext(src)
1145             if not ext:
1146                 type_ = textstream.get('type')
1147                 SUBTITLES_TYPES = {
1148                     'text/vtt': 'vtt',
1149                     'text/srt': 'srt',
1150                     'application/smptett+xml': 'tt',
1151                 }
1152                 if type_ in SUBTITLES_TYPES:
1153                     ext = SUBTITLES_TYPES[type_]
1154             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or subtitles_lang
1155             subtitles.setdefault(lang, []).append({
1156                 'url': src,
1157                 'ext': ext,
1158             })
1159         return subtitles
1160
1161     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1162         xspf = self._download_xml(
1163             playlist_url, playlist_id, 'Downloading xpsf playlist',
1164             'Unable to download xspf manifest', fatal=fatal)
1165         if xspf is False:
1166             return []
1167         return self._parse_xspf(xspf, playlist_id)
1168
1169     def _parse_xspf(self, playlist, playlist_id):
1170         NS_MAP = {
1171             'xspf': 'http://xspf.org/ns/0/',
1172             's1': 'http://static.streamone.nl/player/ns/0',
1173         }
1174
1175         entries = []
1176         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1177             title = xpath_text(
1178                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1179             description = xpath_text(
1180                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1181             thumbnail = xpath_text(
1182                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1183             duration = float_or_none(
1184                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1185
1186             formats = [{
1187                 'url': location.text,
1188                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1189                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1190                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1191             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1192             self._sort_formats(formats)
1193
1194             entries.append({
1195                 'id': playlist_id,
1196                 'title': title,
1197                 'description': description,
1198                 'thumbnail': thumbnail,
1199                 'duration': duration,
1200                 'formats': formats,
1201             })
1202         return entries
1203
1204     def _live_title(self, name):
1205         """ Generate the title for a live video """
1206         now = datetime.datetime.now()
1207         now_str = now.strftime("%Y-%m-%d %H:%M")
1208         return name + ' ' + now_str
1209
1210     def _int(self, v, name, fatal=False, **kwargs):
1211         res = int_or_none(v, **kwargs)
1212         if 'get_attr' in kwargs:
1213             print(getattr(v, kwargs['get_attr']))
1214         if res is None:
1215             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1216             if fatal:
1217                 raise ExtractorError(msg)
1218             else:
1219                 self._downloader.report_warning(msg)
1220         return res
1221
1222     def _float(self, v, name, fatal=False, **kwargs):
1223         res = float_or_none(v, **kwargs)
1224         if res is None:
1225             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1226             if fatal:
1227                 raise ExtractorError(msg)
1228             else:
1229                 self._downloader.report_warning(msg)
1230         return res
1231
1232     def _set_cookie(self, domain, name, value, expire_time=None):
1233         cookie = compat_cookiejar.Cookie(
1234             0, name, value, None, None, domain, None,
1235             None, '/', True, False, expire_time, '', None, None, None)
1236         self._downloader.cookiejar.set_cookie(cookie)
1237
1238     def _get_cookies(self, url):
1239         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1240         req = compat_urllib_request.Request(url)
1241         self._downloader.cookiejar.add_cookie_header(req)
1242         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1243
1244     def get_testcases(self, include_onlymatching=False):
1245         t = getattr(self, '_TEST', None)
1246         if t:
1247             assert not hasattr(self, '_TESTS'), \
1248                 '%s has _TEST and _TESTS' % type(self).__name__
1249             tests = [t]
1250         else:
1251             tests = getattr(self, '_TESTS', [])
1252         for t in tests:
1253             if not include_onlymatching and t.get('only_matching', False):
1254                 continue
1255             t['name'] = type(self).__name__[:-len('IE')]
1256             yield t
1257
1258     def is_suitable(self, age_limit):
1259         """ Test whether the extractor is generally suitable for the given
1260         age limit (i.e. pornographic sites are not, all others usually are) """
1261
1262         any_restricted = False
1263         for tc in self.get_testcases(include_onlymatching=False):
1264             if 'playlist' in tc:
1265                 tc = tc['playlist'][0]
1266             is_restricted = age_restricted(
1267                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1268             if not is_restricted:
1269                 return True
1270             any_restricted = any_restricted or is_restricted
1271         return not any_restricted
1272
1273     def extract_subtitles(self, *args, **kwargs):
1274         if (self._downloader.params.get('writesubtitles', False) or
1275                 self._downloader.params.get('listsubtitles')):
1276             return self._get_subtitles(*args, **kwargs)
1277         return {}
1278
1279     def _get_subtitles(self, *args, **kwargs):
1280         raise NotImplementedError("This method must be implemented by subclasses")
1281
1282     def extract_automatic_captions(self, *args, **kwargs):
1283         if (self._downloader.params.get('writeautomaticsub', False) or
1284                 self._downloader.params.get('listsubtitles')):
1285             return self._get_automatic_captions(*args, **kwargs)
1286         return {}
1287
1288     def _get_automatic_captions(self, *args, **kwargs):
1289         raise NotImplementedError("This method must be implemented by subclasses")
1290
1291
1292 class SearchInfoExtractor(InfoExtractor):
1293     """
1294     Base class for paged search queries extractors.
1295     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1296     Instances should define _SEARCH_KEY and _MAX_RESULTS.
1297     """
1298
1299     @classmethod
1300     def _make_valid_url(cls):
1301         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1302
1303     @classmethod
1304     def suitable(cls, url):
1305         return re.match(cls._make_valid_url(), url) is not None
1306
1307     def _real_extract(self, query):
1308         mobj = re.match(self._make_valid_url(), query)
1309         if mobj is None:
1310             raise ExtractorError('Invalid search query "%s"' % query)
1311
1312         prefix = mobj.group('prefix')
1313         query = mobj.group('query')
1314         if prefix == '':
1315             return self._get_n_results(query, 1)
1316         elif prefix == 'all':
1317             return self._get_n_results(query, self._MAX_RESULTS)
1318         else:
1319             n = int(prefix)
1320             if n <= 0:
1321                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1322             elif n > self._MAX_RESULTS:
1323                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1324                 n = self._MAX_RESULTS
1325             return self._get_n_results(query, n)
1326
1327     def _get_n_results(self, query, n):
1328         """Get a specified number of results for a query"""
1329         raise NotImplementedError("This method must be implemented by subclasses")
1330
1331     @property
1332     def SEARCH_KEY(self):
1333         return self._SEARCH_KEY