_ Git - youtube-dl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_getpass,
  19     compat_HTTPError,
  20     compat_http_client,
  21     compat_urllib_error,
  22     compat_urllib_parse,
  23     compat_urllib_parse_urlparse,
  24     compat_urllib_request,
  25     compat_urlparse,
  26     compat_str,
  27 )
  28 from ..utils import (
  29     NO_DEFAULT,
  30     age_restricted,
  31     bug_reports_message,
  32     clean_html,
  33     compiled_regex_type,
  34     determine_ext,
  35     ExtractorError,
  36     fix_xml_ampersands,
  37     float_or_none,
  38     int_or_none,
  39     RegexNotFoundError,
  40     sanitize_filename,
  41     unescapeHTML,
  42     unified_strdate,
  43     url_basename,
  44     xpath_text,
  45     xpath_with_ns,
  46 )
  47
  48
  49 class InfoExtractor(object):
  50     """Information Extractor class.
  51
  52     Information extractors are the classes that, given a URL, extract
  53     information about the video (or videos) the URL refers to. This
  54     information includes the real video URL, the video title, author and
  55     others. The information is stored in a dictionary which is then
  56     passed to the YoutubeDL. The YoutubeDL processes this
  57     information possibly downloading the video to the file system, among
  58     other possible outcomes.
  59
  60     The type field determines the type of the result.
  61     By far the most common value (and the default if _type is missing) is
  62     "video", which indicates a single video.
  63
  64     For a video, the dictionaries must include the following fields:
  65
  66     id:             Video identifier.
  67     title:          Video title, unescaped.
  68
  69     Additionally, it must contain either a formats entry or a url one:
  70
  71     formats:        A list of dictionaries for each format available, ordered
  72                     from worst to best quality.
  73
  74                     Potential fields:
  75                     * url        Mandatory. The URL of the video file
  76                     * ext        Will be calculated from URL if missing
  77                     * format     A human-readable description of the format
  78                                  ("mp4 container with h264/opus").
   79                                  Calculated from the format_id, width, height,
  80                                  and format_note fields if missing.
  81                     * format_id  A short description of the format
  82                                  ("mp4_h264_opus" or "19").
  83                                 Technically optional, but strongly recommended.
  84                     * format_note Additional info about the format
  85                                  ("3D" or "DASH video")
  86                     * width      Width of the video, if known
  87                     * height     Height of the video, if known
  88                     * resolution Textual description of width and height
  89                     * tbr        Average bitrate of audio and video in KBit/s
  90                     * abr        Average audio bitrate in KBit/s
  91                     * acodec     Name of the audio codec in use
  92                     * asr        Audio sampling rate in Hertz
  93                     * vbr        Average video bitrate in KBit/s
  94                     * fps        Frame rate
  95                     * vcodec     Name of the video codec in use
  96                     * container  Name of the container format
  97                     * filesize   The number of bytes, if known in advance
  98                     * filesize_approx  An estimate for the number of bytes
  99                     * player_url SWF Player URL (used for rtmpdump).
 100                     * protocol   The protocol that will be used for the actual
 101                                  download, lower-case.
 102                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 103                                  "m3u8", or "m3u8_native".
 104                     * preference Order number of this format. If this field is
 105                                  present and not None, the formats get sorted
 106                                  by this field, regardless of all other values.
 107                                  -1 for default (order by other properties),
 108                                  -2 or smaller for less than default.
 109                                  < -1000 to hide the format (if there is
 110                                     another one which is strictly better)
 111                     * language_preference  Is this in the correct requested
 112                                  language?
 113                                  10 if it's what the URL is about,
 114                                  -1 for default (don't know),
 115                                  -10 otherwise, other values reserved for now.
 116                     * quality    Order number of the video quality of this
 117                                  format, irrespective of the file format.
 118                                  -1 for default (order by other properties),
 119                                  -2 or smaller for less than default.
 120                     * source_preference  Order number for this video source
 121                                   (quality takes higher priority)
 122                                  -1 for default (order by other properties),
 123                                  -2 or smaller for less than default.
 124                     * http_headers  A dictionary of additional HTTP headers
 125                                  to add to the request.
 126                     * stretched_ratio  If given and not 1, indicates that the
 127                                  video's pixels are not square.
 128                                  width : height ratio as float.
 129                     * no_resume  The server does not support resuming the
 130                                  (HTTP or RTMP) download. Boolean.
 131
 132     url:            Final video URL.
 133     ext:            Video filename extension.
 134     format:         The video format, defaults to ext (used for --get-format)
 135     player_url:     SWF Player URL (used for rtmpdump).
 136
 137     The following fields are optional:
 138
 139     alt_title:      A secondary title of the video.
 140     display_id      An alternative identifier for the video, not necessarily
 141                     unique, but available before title. Typically, id is
 142                     something like "4234987", title "Dancing naked mole rats",
 143                     and display_id "dancing-naked-mole-rats"
 144     thumbnails:     A list of dictionaries, with the following entries:
 145                         * "id" (optional, string) - Thumbnail format ID
 146                         * "url"
 147                         * "preference" (optional, int) - quality of the image
 148                         * "width" (optional, int)
 149                         * "height" (optional, int)
  150                         * "resolution" (optional, string "{width}x{height}",
 151                                         deprecated)
 152     thumbnail:      Full URL to a video thumbnail image.
 153     description:    Full video description.
 154     uploader:       Full name of the video uploader.
 155     creator:        The main artist who created the video.
 156     release_date:   The date (YYYYMMDD) when the video was released.
 157     timestamp:      UNIX timestamp of the moment the video became available.
 158     upload_date:    Video upload date (YYYYMMDD).
 159                     If not explicitly set, calculated from timestamp.
 160     uploader_id:    Nickname or id of the video uploader.
 161     location:       Physical location where the video was filmed.
 162     subtitles:      The available subtitles as a dictionary in the format
 163                     {language: subformats}. "subformats" is a list sorted from
 164                     lower to higher preference, each element is a dictionary
 165                     with the "ext" entry and one of:
 166                         * "data": The subtitles file contents
 167                         * "url": A URL pointing to the subtitles file
 168     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 169                     automatically generated captions
 170     duration:       Length of the video in seconds, as an integer.
 171     view_count:     How many users have watched the video on the platform.
 172     like_count:     Number of positive ratings of the video
 173     dislike_count:  Number of negative ratings of the video
  174     average_rating: Average rating given by users, the scale used depends on the webpage
 175     comment_count:  Number of comments on the video
 176     comments:       A list of comments, each with one or more of the following
 177                     properties (all but one of text or html optional):
 178                         * "author" - human-readable name of the comment author
 179                         * "author_id" - user ID of the comment author
 180                         * "id" - Comment ID
 181                         * "html" - Comment as HTML
 182                         * "text" - Plain text of the comment
 183                         * "timestamp" - UNIX timestamp of comment
 184                         * "parent" - ID of the comment this one is replying to.
 185                                      Set to "root" to indicate that this is a
 186                                      comment to the original video.
 187     age_limit:      Age restriction for the video, as an integer (years)
 188     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 189                     should allow to get the same result again. (It will be set
 190                     by YoutubeDL if it's missing)
 191     categories:     A list of categories that the video falls in, for example
 192                     ["Sports", "Berlin"]
 193     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 194     is_live:        True, False, or None (=unknown). Whether this video is a
 195                     live stream that goes on instead of a fixed-length video.
 196     start_time:     Time in seconds where the reproduction should start, as
 197                     specified in the URL.
 198     end_time:       Time in seconds where the reproduction should end, as
 199                     specified in the URL.
 200
 201     Unless mentioned otherwise, the fields should be Unicode strings.
 202
 203     Unless mentioned otherwise, None is equivalent to absence of information.
 204
 205
 206     _type "playlist" indicates multiple videos.
 207     There must be a key "entries", which is a list, an iterable, or a PagedList
 208     object, each element of which is a valid dictionary by this specification.
 209
 210     Additionally, playlists can have "title", "description" and "id" attributes
 211     with the same semantics as videos (see above).
 212
 213
 214     _type "multi_video" indicates that there are multiple videos that
  215     form a single show, for example multiple acts of an opera or TV episode.
 216     It must have an entries key like a playlist and contain all the keys
 217     required for a video at the same time.
 218
 219
 220     _type "url" indicates that the video must be extracted from another
 221     location, possibly by a different extractor. Its only required key is:
 222     "url" - the next URL to extract.
 223     The key "ie_key" can be set to the class name (minus the trailing "IE",
 224     e.g. "Youtube") if the extractor class is known in advance.
 225     Additionally, the dictionary may have any properties of the resolved entity
 226     known in advance, for example "title" if the title of the referred video is
 227     known ahead of time.
 228
 229
 230     _type "url_transparent" entities have the same specification as "url", but
 231     indicate that the given additional information is more precise than the one
 232     associated with the resolved URL.
 233     This is useful when a site employs a video service that hosts the video and
 234     its technical metadata, but that video service does not embed a useful
 235     title, description etc.
 236
 237
 238     Subclasses of this one should re-define the _real_initialize() and
 239     _real_extract() methods and define a _VALID_URL regexp.
 240     Probably, they should also be added to the list of extractors.
 241
 242     Finally, the _WORKING attribute should be set to False for broken IEs
 243     in order to warn the users and skip the tests.
 244     """
 245
 246     _ready = False
 247     _downloader = None
 248     _WORKING = True
 249
 250     def __init__(self, downloader=None):
 251         """Constructor. Receives an optional downloader."""
 252         self._ready = False
 253         self.set_downloader(downloader)
 254
 255     @classmethod
 256     def suitable(cls, url):
 257         """Receives a URL and returns True if suitable for this IE."""
 258
 259         # This does not use has/getattr intentionally - we want to know whether
 260         # we have cached the regexp for *this* class, whereas getattr would also
 261         # match the superclass
 262         if '_VALID_URL_RE' not in cls.__dict__:
 263             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 264         return cls._VALID_URL_RE.match(url) is not None
 265
 266     @classmethod
 267     def _match_id(cls, url):
 268         if '_VALID_URL_RE' not in cls.__dict__:
 269             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 270         m = cls._VALID_URL_RE.match(url)
 271         assert m
 272         return m.group('id')
 273
 274     @classmethod
 275     def working(cls):
 276         """Getter method for _WORKING."""
 277         return cls._WORKING
 278
 279     def initialize(self):
 280         """Initializes an instance (authentication, etc)."""
 281         if not self._ready:
 282             self._real_initialize()
 283             self._ready = True
 284
 285     def extract(self, url):
 286         """Extracts URL information and returns it in list of dicts."""
 287         try:
 288             self.initialize()
 289             return self._real_extract(url)
 290         except ExtractorError:
 291             raise
 292         except compat_http_client.IncompleteRead as e:
 293             raise ExtractorError('A network error has occured.', cause=e, expected=True)
 294         except (KeyError, StopIteration) as e:
 295             raise ExtractorError('An extractor error has occured.', cause=e)
 296
 297     def set_downloader(self, downloader):
 298         """Sets the downloader for this IE."""
 299         self._downloader = downloader
 300
 301     def _real_initialize(self):
 302         """Real initialization process. Redefine in subclasses."""
 303         pass
 304
 305     def _real_extract(self, url):
 306         """Real extraction process. Redefine in subclasses."""
 307         pass
 308
 309     @classmethod
 310     def ie_key(cls):
 311         """A string for getting the InfoExtractor with get_info_extractor"""
 312         return cls.__name__[:-2]
 313
 314     @property
 315     def IE_NAME(self):
 316         return type(self).__name__[:-2]
 317
 318     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 319         """ Returns the response handle """
 320         if note is None:
 321             self.report_download_webpage(video_id)
 322         elif note is not False:
 323             if video_id is None:
 324                 self.to_screen('%s' % (note,))
 325             else:
 326                 self.to_screen('%s: %s' % (video_id, note))
 327         try:
 328             return self._downloader.urlopen(url_or_request)
 329         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 330             if errnote is False:
 331                 return False
 332             if errnote is None:
 333                 errnote = 'Unable to download webpage'
 334             errmsg = '%s: %s' % (errnote, compat_str(err))
 335             if fatal:
 336                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 337             else:
 338                 self._downloader.report_warning(errmsg)
 339                 return False
 340
 341     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 342         """ Returns a tuple (page content as string, URL handle) """
 343         # Strip hashes from the URL (#1038)
 344         if isinstance(url_or_request, (compat_str, str)):
 345             url_or_request = url_or_request.partition('#')[0]
 346
 347         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 348         if urlh is False:
 349             assert not fatal
 350             return False
 351         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 352         return (content, urlh)
 353
 354     @staticmethod
 355     def _guess_encoding_from_content(content_type, webpage_bytes):
 356         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 357         if m:
 358             encoding = m.group(1)
 359         else:
 360             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 361                           webpage_bytes[:1024])
 362             if m:
 363                 encoding = m.group(1).decode('ascii')
 364             elif webpage_bytes.startswith(b'\xff\xfe'):
 365                 encoding = 'utf-16'
 366             else:
 367                 encoding = 'utf-8'
 368
 369         return encoding
 370
 371     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 372         content_type = urlh.headers.get('Content-Type', '')
 373         webpage_bytes = urlh.read()
 374         if prefix is not None:
 375             webpage_bytes = prefix + webpage_bytes
 376         if not encoding:
 377             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 378         if self._downloader.params.get('dump_intermediate_pages', False):
 379             try:
 380                 url = url_or_request.get_full_url()
 381             except AttributeError:
 382                 url = url_or_request
 383             self.to_screen('Dumping request to ' + url)
 384             dump = base64.b64encode(webpage_bytes).decode('ascii')
 385             self._downloader.to_screen(dump)
 386         if self._downloader.params.get('write_pages', False):
 387             try:
 388                 url = url_or_request.get_full_url()
 389             except AttributeError:
 390                 url = url_or_request
 391             basen = '%s_%s' % (video_id, url)
 392             if len(basen) > 240:
 393                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 394                 basen = basen[:240 - len(h)] + h
 395             raw_filename = basen + '.dump'
 396             filename = sanitize_filename(raw_filename, restricted=True)
 397             self.to_screen('Saving request to ' + filename)
 398             # Working around MAX_PATH limitation on Windows (see
 399             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 400             if os.name == 'nt':
 401                 absfilepath = os.path.abspath(filename)
 402                 if len(absfilepath) > 259:
 403                     filename = '\\\\?\\' + absfilepath
 404             with open(filename, 'wb') as outf:
 405                 outf.write(webpage_bytes)
 406
 407         try:
 408             content = webpage_bytes.decode(encoding, 'replace')
 409         except LookupError:
 410             content = webpage_bytes.decode('utf-8', 'replace')
 411
 412         if ('<title>Access to this site is blocked</title>' in content and
 413                 'Websense' in content[:512]):
 414             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 415             blocked_iframe = self._html_search_regex(
 416                 r'<iframe src="([^"]+)"', content,
 417                 'Websense information URL', default=None)
 418             if blocked_iframe:
 419                 msg += ' Visit %s for more details' % blocked_iframe
 420             raise ExtractorError(msg, expected=True)
 421         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 422             msg = (
 423                 'Access to this webpage has been blocked by Indian censorship. '
 424                 'Use a VPN or proxy server (with --proxy) to route around it.')
 425             block_msg = self._html_search_regex(
 426                 r'</h1><p>(.*?)</p>',
 427                 content, 'block message', default=None)
 428             if block_msg:
 429                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 430             raise ExtractorError(msg, expected=True)
 431
 432         return content
 433
 434     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 435         """ Returns the data of the page as a string """
 436         success = False
 437         try_count = 0
 438         while success is False:
 439             try:
 440                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 441                 success = True
 442             except compat_http_client.IncompleteRead as e:
 443                 try_count += 1
 444                 if try_count >= tries:
 445                     raise e
 446                 self._sleep(timeout, video_id)
 447         if res is False:
 448             return res
 449         else:
 450             content, _ = res
 451             return content
 452
 453     def _download_xml(self, url_or_request, video_id,
 454                       note='Downloading XML', errnote='Unable to download XML',
 455                       transform_source=None, fatal=True, encoding=None):
 456         """Return the xml as an xml.etree.ElementTree.Element"""
 457         xml_string = self._download_webpage(
 458             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 459         if xml_string is False:
 460             return xml_string
 461         if transform_source:
 462             xml_string = transform_source(xml_string)
 463         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 464
 465     def _download_json(self, url_or_request, video_id,
 466                        note='Downloading JSON metadata',
 467                        errnote='Unable to download JSON metadata',
 468                        transform_source=None,
 469                        fatal=True, encoding=None):
 470         json_string = self._download_webpage(
 471             url_or_request, video_id, note, errnote, fatal=fatal,
 472             encoding=encoding)
 473         if (not fatal) and json_string is False:
 474             return None
 475         return self._parse_json(
 476             json_string, video_id, transform_source=transform_source, fatal=fatal)
 477
 478     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 479         if transform_source:
 480             json_string = transform_source(json_string)
 481         try:
 482             return json.loads(json_string)
 483         except ValueError as ve:
 484             errmsg = '%s: Failed to parse JSON ' % video_id
 485             if fatal:
 486                 raise ExtractorError(errmsg, cause=ve)
 487             else:
 488                 self.report_warning(errmsg + str(ve))
 489
 490     def report_warning(self, msg, video_id=None):
 491         idstr = '' if video_id is None else '%s: ' % video_id
 492         self._downloader.report_warning(
 493             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 494
 495     def to_screen(self, msg):
 496         """Print msg to screen, prefixing it with '[ie_name]'"""
 497         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 498
 499     def report_extraction(self, id_or_name):
 500         """Report information extraction."""
 501         self.to_screen('%s: Extracting information' % id_or_name)
 502
 503     def report_download_webpage(self, video_id):
 504         """Report webpage download."""
 505         self.to_screen('%s: Downloading webpage' % video_id)
 506
 507     def report_age_confirmation(self):
 508         """Report attempt to confirm age."""
 509         self.to_screen('Confirming age')
 510
 511     def report_login(self):
 512         """Report attempt to log in."""
 513         self.to_screen('Logging in')
 514
 515     @staticmethod
 516     def raise_login_required(msg='This video is only available for registered users'):
 517         raise ExtractorError(
 518             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 519             expected=True)
 520
 521     @staticmethod
 522     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 523         raise ExtractorError(
 524             '%s. You might want to use --proxy to workaround.' % msg,
 525             expected=True)
 526
 527     # Methods for following #608
 528     @staticmethod
 529     def url_result(url, ie=None, video_id=None, video_title=None):
 530         """Returns a URL that points to a page that should be processed"""
 531         # TODO: ie should be the class used for getting the info
 532         video_info = {'_type': 'url',
 533                       'url': url,
 534                       'ie_key': ie}
 535         if video_id is not None:
 536             video_info['id'] = video_id
 537         if video_title is not None:
 538             video_info['title'] = video_title
 539         return video_info
 540
 541     @staticmethod
 542     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 543         """Returns a playlist"""
 544         video_info = {'_type': 'playlist',
 545                       'entries': entries}
 546         if playlist_id:
 547             video_info['id'] = playlist_id
 548         if playlist_title:
 549             video_info['title'] = playlist_title
 550         if playlist_description:
 551             video_info['description'] = playlist_description
 552         return video_info
 553
 554     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 555         """
 556         Perform a regex search on the given string, using a single or a list of
 557         patterns returning the first matching group.
 558         In case of failure return a default value or raise a WARNING or a
 559         RegexNotFoundError, depending on fatal, specifying the field name.
 560         """
 561         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 562             mobj = re.search(pattern, string, flags)
 563         else:
 564             for p in pattern:
 565                 mobj = re.search(p, string, flags)
 566                 if mobj:
 567                     break
 568
 569         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 570             _name = '\033[0;34m%s\033[0m' % name
 571         else:
 572             _name = name
 573
 574         if mobj:
 575             if group is None:
 576                 # return the first matching group
 577                 return next(g for g in mobj.groups() if g is not None)
 578             else:
 579                 return mobj.group(group)
 580         elif default is not NO_DEFAULT:
 581             return default
 582         elif fatal:
 583             raise RegexNotFoundError('Unable to extract %s' % _name)
 584         else:
 585             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 586             return None
 587
 588     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 589         """
 590         Like _search_regex, but strips HTML tags and unescapes entities.
 591         """
 592         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 593         if res:
 594             return clean_html(res).strip()
 595         else:
 596             return res
 597
 598     def _get_login_info(self):
 599         """
 600         Get the login info as (username, password)
 601         It will look in the netrc file using the _NETRC_MACHINE value
 602         If there's no info available, return (None, None)
 603         """
 604         if self._downloader is None:
 605             return (None, None)
 606
 607         username = None
 608         password = None
 609         downloader_params = self._downloader.params
 610
 611         # Attempt to use provided username and password or .netrc data
 612         if downloader_params.get('username', None) is not None:
 613             username = downloader_params['username']
 614             password = downloader_params['password']
 615         elif downloader_params.get('usenetrc', False):
 616             try:
 617                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 618                 if info is not None:
 619                     username = info[0]
 620                     password = info[2]
 621                 else:
 622                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 623             except (IOError, netrc.NetrcParseError) as err:
 624                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 625
 626         return (username, password)
 627
 628     def _get_tfa_info(self, note='two-factor verification code'):
 629         """
 630         Get the two-factor authentication info
 631         TODO - asking the user will be required for sms/phone verify
 632         currently just uses the command line option
 633         If there's no info available, return None
 634         """
 635         if self._downloader is None:
 636             return None
 637         downloader_params = self._downloader.params
 638
 639         if downloader_params.get('twofactor', None) is not None:
 640             return downloader_params['twofactor']
 641
 642         return compat_getpass('Type %s and press [Return]: ' % note)
 643
 644     # Helper functions for extracting OpenGraph info
 645     @staticmethod
 646     def _og_regexes(prop):
 647         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 648         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 649         template = r'<meta[^>]+?%s[^>]+?%s'
 650         return [
 651             template % (property_re, content_re),
 652             template % (content_re, property_re),
 653         ]
 654
 655     @staticmethod
 656     def _meta_regex(prop):
 657         return r'''(?isx)<meta
 658                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 659                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 660
 661     def _og_search_property(self, prop, html, name=None, **kargs):
 662         if name is None:
 663             name = 'OpenGraph %s' % prop
 664         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 665         if escaped is None:
 666             return None
 667         return unescapeHTML(escaped)
 668
 669     def _og_search_thumbnail(self, html, **kargs):
 670         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 671
 672     def _og_search_description(self, html, **kargs):
 673         return self._og_search_property('description', html, fatal=False, **kargs)
 674
 675     def _og_search_title(self, html, **kargs):
 676         return self._og_search_property('title', html, **kargs)
 677
 678     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 679         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 680         if secure:
 681             regexes = self._og_regexes('video:secure_url') + regexes
 682         return self._html_search_regex(regexes, html, name, **kargs)
 683
 684     def _og_search_url(self, html, **kargs):
 685         return self._og_search_property('url', html, **kargs)
 686
 687     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 688         if display_name is None:
 689             display_name = name
 690         return self._html_search_regex(
 691             self._meta_regex(name),
 692             html, display_name, fatal=fatal, group='content', **kwargs)
 693
 694     def _dc_search_uploader(self, html):
 695         return self._html_search_meta('dc.creator', html, 'uploader')
 696
 697     def _rta_search(self, html):
 698         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 699         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 700                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 701                      html):
 702             return 18
 703         return 0
 704
 705     def _media_rating_search(self, html):
 706         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 707         rating = self._html_search_meta('rating', html)
 708
 709         if not rating:
 710             return None
 711
 712         RATING_TABLE = {
 713             'safe for kids': 0,
 714             'general': 8,
 715             '14 years': 14,
 716             'mature': 17,
 717             'restricted': 19,
 718         }
 719         return RATING_TABLE.get(rating.lower(), None)
 720
 721     def _family_friendly_search(self, html):
 722         # See http://schema.org/VideoObject
 723         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 724
 725         if not family_friendly:
 726             return None
 727
 728         RATING_TABLE = {
 729             '1': 0,
 730             'true': 0,
 731             '0': 18,
 732             'false': 18,
 733         }
 734         return RATING_TABLE.get(family_friendly.lower(), None)
 735
 736     def _twitter_search_player(self, html):
 737         return self._html_search_meta('twitter:player', html,
 738                                       'twitter card player')
 739
 740     @staticmethod
 741     def _hidden_inputs(html):
 742         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 743         hidden_inputs = {}
 744         for input in re.findall(r'(?i)<input([^>]+)>', html):
 745             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 746                 continue
 747             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 748             if not name:
 749                 continue
 750             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 751             if not value:
 752                 continue
 753             hidden_inputs[name.group('value')] = value.group('value')
 754         return hidden_inputs
 755
 756     def _form_hidden_inputs(self, form_id, html):
 757         form = self._search_regex(
 758             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 759             html, '%s form' % form_id, group='form')
 760         return self._hidden_inputs(form)
 761
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort ``formats`` in place, in ascending order of the key built below.

        formats:          non-empty list of format dictionaries.
        field_preference: optional list/tuple of field names; when given, the
                          sort key is just the values of those fields (in
                          that order, -1 standing in for a missing/None
                          value) instead of the default key.

        Raises ExtractorError when ``formats`` is empty or None.

        Side effect: a format lacking 'ext' but having a 'url' gets its 'ext'
        filled in from determine_ext(url) while the key is computed.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # A caller supplied field list replaces the default key entirely
            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                # No explicit preference: derive one from the protocol, which
                # itself falls back to the scheme of the URL
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Rank the extension by its position in ORDER (-1 when unknown).
            # Audio only formats are ranked through audio_ext_preference and
            # get a neutral ext_preference; all other formats the reverse.
            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Default key: the criteria are compared in the order listed; a
            # missing/None numeric field counts as -1 and a missing format_id
            # as ''
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 824
 825     def _check_formats(self, formats, video_id):
 826         if formats:
 827             formats[:] = filter(
 828                 lambda f: self._is_valid_url(
 829                     f['url'], video_id,
 830                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 831                 formats)
 832
 833     def _is_valid_url(self, url, video_id, item='video'):
 834         url = self._proto_relative_url(url, scheme='http:')
 835         # For now assume non HTTP(S) URLs always valid
 836         if not (url.startswith('http://') or url.startswith('https://')):
 837             return True
 838         try:
 839             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 840             return True
 841         except ExtractorError as e:
 842             if isinstance(e.cause, compat_HTTPError):
 843                 self.to_screen(
 844                     '%s: %s URL is invalid, skipping' % (video_id, item))
 845                 return False
 846             raise
 847
 848     def http_scheme(self):
 849         """ Either "http:" or "https:", depending on the user's preferences """
 850         return (
 851             'http:'
 852             if self._downloader.params.get('prefer_insecure', False)
 853             else 'https:')
 854
 855     def _proto_relative_url(self, url, scheme=None):
 856         if url is None:
 857             return url
 858         if url.startswith('//'):
 859             if scheme is None:
 860                 scheme = self.http_scheme()
 861             return scheme + url
 862         else:
 863             return url
 864
 865     def _sleep(self, timeout, video_id, msg_template=None):
 866         if msg_template is None:
 867             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 868         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 869         self.to_screen(msg)
 870         time.sleep(timeout)
 871
 872     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 873                              transform_source=lambda s: fix_xml_ampersands(s).strip()):
 874         manifest = self._download_xml(
 875             manifest_url, video_id, 'Downloading f4m manifest',
 876             'Unable to download f4m manifest',
 877             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 878             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 879             transform_source=transform_source)
 880
 881         formats = []
 882         manifest_version = '1.0'
 883         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 884         if not media_nodes:
 885             manifest_version = '2.0'
 886             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 887         for i, media_el in enumerate(media_nodes):
 888             if manifest_version == '2.0':
 889                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 890                 if not media_url:
 891                     continue
 892                 manifest_url = (
 893                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 894                     else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
 895                 # If media_url is itself a f4m manifest do the recursive extraction
 896                 # since bitrates in parent manifest (this one) and media_url manifest
 897                 # may differ leading to inability to resolve the format by requested
 898                 # bitrate in f4m downloader
 899                 if determine_ext(manifest_url) == 'f4m':
 900                     formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
 901                     continue
 902             tbr = int_or_none(media_el.attrib.get('bitrate'))
 903             formats.append({
 904                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 905                 'url': manifest_url,
 906                 'ext': 'flv',
 907                 'tbr': tbr,
 908                 'width': int_or_none(media_el.attrib.get('width')),
 909                 'height': int_or_none(media_el.attrib.get('height')),
 910                 'preference': preference,
 911             })
 912         self._sort_formats(formats)
 913
 914         return formats
 915
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        """
        Download an m3u8 playlist and build format dictionaries from it.

        m3u8_url:       URL of the playlist.
        video_id:       identifier handed to _download_webpage.
        ext:            value stored as 'ext' of the generated formats.
        entry_protocol: value stored as 'protocol' of the per-stream formats.
        preference:     value stored as 'preference' of the per-stream
                        formats; the playlist entry gets a lower one.
        m3u8_id:        optional prefix of the generated format ids.
        note, errnote:  override the default download messages.
        fatal:          forwarded to _download_webpage.

        Besides one format per stream URL found in the playlist, the list
        holds a 'meta' format pointing at the playlist itself. The list is
        sorted with _sort_formats before it is returned. When
        _download_webpage returns False that value is returned instead.
        """

        # Entry for the playlist itself
        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # Absolute http(s) URLs are kept, anything else is joined to m3u8_url
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if m3u8_doc is False:
            return m3u8_doc
        # Attributes of the most recent #EXT-X-STREAM-INF / #EXT-X-MEDIA tags
        last_info = None
        last_media = None
        # KEY=value pairs separated by commas; the value may be double quoted
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Any other tag, a comment or a blank line
                continue
            else:
                # Everything else is taken as the URL of a stream
                if last_info is None:
                    # No #EXT-X-STREAM-INF seen so far: only the URL is known
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # Name the format after the pending non-subtitle media tag if
                # there is one, else after its bitrate, else after the number
                # of formats collected so far
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                if last_media is not None:
                    # A media tag is attached to the first stream following it
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                # Reset to an empty dict (not None), so a later URL without
                # its own #EXT-X-STREAM-INF still yields a full entry
                last_info = {}
        self._sort_formats(formats)
        return formats
1002
1003     @staticmethod
1004     def _xpath_ns(path, namespace=None):
1005         if not namespace:
1006             return path
1007         out = []
1008         for c in path.split('/'):
1009             if not c or c == '.':
1010                 out.append(c)
1011             else:
1012                 out.append('{%s}%s' % (namespace, c))
1013         return '/'.join(out)
1014
1015     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1016         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1017
1018         if smil is False:
1019             assert not fatal
1020             return []
1021
1022         namespace = self._parse_smil_namespace(smil)
1023
1024         return self._parse_smil_formats(
1025             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1026
1027     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1028         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1029         if smil is False:
1030             return {}
1031         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1032
1033     def _download_smil(self, smil_url, video_id, fatal=True):
1034         return self._download_xml(
1035             smil_url, video_id, 'Downloading SMIL file',
1036             'Unable to download SMIL file', fatal=fatal)
1037
1038     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1039         namespace = self._parse_smil_namespace(smil)
1040
1041         formats = self._parse_smil_formats(
1042             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1043         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1044
1045         video_id = os.path.splitext(url_basename(smil_url))[0]
1046         title = None
1047         description = None
1048         upload_date = None
1049         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1050             name = meta.attrib.get('name')
1051             content = meta.attrib.get('content')
1052             if not name or not content:
1053                 continue
1054             if not title and name == 'title':
1055                 title = content
1056             elif not description and name in ('description', 'abstract'):
1057                 description = content
1058             elif not upload_date and name == 'date':
1059                 upload_date = unified_strdate(content)
1060
1061         thumbnails = [{
1062             'id': image.get('type'),
1063             'url': image.get('src'),
1064             'width': int_or_none(image.get('width')),
1065             'height': int_or_none(image.get('height')),
1066         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1067
1068         return {
1069             'id': video_id,
1070             'title': title or video_id,
1071             'description': description,
1072             'upload_date': upload_date,
1073             'thumbnails': thumbnails,
1074             'formats': formats,
1075             'subtitles': subtitles,
1076         }
1077
1078     def _parse_smil_namespace(self, smil):
1079         return self._search_regex(
1080             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1081
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """
        Build the sorted list of formats described by a parsed SMIL document.

        smil:               root element of the SMIL document.
        smil_url:           URL of the document; default base of the sources.
        video_id:           identifier handed to the m3u8/f4m extraction.
        namespace:          XML namespace used to qualify element lookups.
        f4m_params:         query parameters appended to f4m manifest URLs;
                            a default set is used when none is given.
        transform_rtmp_url: optional callable taking (streamer, src) and
                            returning the pair to use for an rtmp format.

        Every <video> element with a src yields an rtmp format, the formats
        of an m3u8 playlist, the formats of an f4m manifest or a plain http
        format, checked in that order. The result goes through
        _sort_formats.
        """
        # The first <meta> with a base/httpBase attribute overrides the base
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        # Counters used to number the formats that have no bitrate
        rtmp_count = 0
        http_count = 0

        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            if not src:
                continue

            # NOTE(review): the 1000 is presumably a scale turning bit/s into
            # kbit/s - confirm against float_or_none
            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base

            # rtmp: the streamer is the URL and src the play path
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    # Let the caller rewrite the URL and play path just added
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            # Anything not starting with 'http' is resolved against the base
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)

            if proto == 'm3u8' or src_ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # Once set, these defaults are reused for the remaining
                    # <video> elements as well
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
                continue

            if src_url.startswith('http'):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        self._sort_formats(formats)

        return formats
1164
1165     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1166         subtitles = {}
1167         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1168             src = textstream.get('src')
1169             if not src:
1170                 continue
1171             ext = textstream.get('ext') or determine_ext(src)
1172             if not ext:
1173                 type_ = textstream.get('type')
1174                 SUBTITLES_TYPES = {
1175                     'text/vtt': 'vtt',
1176                     'text/srt': 'srt',
1177                     'application/smptett+xml': 'tt',
1178                 }
1179                 if type_ in SUBTITLES_TYPES:
1180                     ext = SUBTITLES_TYPES[type_]
1181             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1182             subtitles.setdefault(lang, []).append({
1183                 'url': src,
1184                 'ext': ext,
1185             })
1186         return subtitles
1187
1188     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1189         xspf = self._download_xml(
1190             playlist_url, playlist_id, 'Downloading xpsf playlist',
1191             'Unable to download xspf manifest', fatal=fatal)
1192         if xspf is False:
1193             return []
1194         return self._parse_xspf(xspf, playlist_id)
1195
1196     def _parse_xspf(self, playlist, playlist_id):
1197         NS_MAP = {
1198             'xspf': 'http://xspf.org/ns/0/',
1199             's1': 'http://static.streamone.nl/player/ns/0',
1200         }
1201
1202         entries = []
1203         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1204             title = xpath_text(
1205                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1206             description = xpath_text(
1207                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1208             thumbnail = xpath_text(
1209                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1210             duration = float_or_none(
1211                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1212
1213             formats = [{
1214                 'url': location.text,
1215                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1216                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1217                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1218             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1219             self._sort_formats(formats)
1220
1221             entries.append({
1222                 'id': playlist_id,
1223                 'title': title,
1224                 'description': description,
1225                 'thumbnail': thumbnail,
1226                 'duration': duration,
1227                 'formats': formats,
1228             })
1229         return entries
1230
1231     def _live_title(self, name):
1232         """ Generate the title for a live video """
1233         now = datetime.datetime.now()
1234         now_str = now.strftime("%Y-%m-%d %H:%M")
1235         return name + ' ' + now_str
1236
1237     def _int(self, v, name, fatal=False, **kwargs):
1238         res = int_or_none(v, **kwargs)
1239         if 'get_attr' in kwargs:
1240             print(getattr(v, kwargs['get_attr']))
1241         if res is None:
1242             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1243             if fatal:
1244                 raise ExtractorError(msg)
1245             else:
1246                 self._downloader.report_warning(msg)
1247         return res
1248
1249     def _float(self, v, name, fatal=False, **kwargs):
1250         res = float_or_none(v, **kwargs)
1251         if res is None:
1252             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1253             if fatal:
1254                 raise ExtractorError(msg)
1255             else:
1256                 self._downloader.report_warning(msg)
1257         return res
1258
1259     def _set_cookie(self, domain, name, value, expire_time=None):
1260         cookie = compat_cookiejar.Cookie(
1261             0, name, value, None, None, domain, None,
1262             None, '/', True, False, expire_time, '', None, None, None)
1263         self._downloader.cookiejar.set_cookie(cookie)
1264
1265     def _get_cookies(self, url):
1266         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1267         req = compat_urllib_request.Request(url)
1268         self._downloader.cookiejar.add_cookie_header(req)
1269         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1270
1271     def get_testcases(self, include_onlymatching=False):
1272         t = getattr(self, '_TEST', None)
1273         if t:
1274             assert not hasattr(self, '_TESTS'), \
1275                 '%s has _TEST and _TESTS' % type(self).__name__
1276             tests = [t]
1277         else:
1278             tests = getattr(self, '_TESTS', [])
1279         for t in tests:
1280             if not include_onlymatching and t.get('only_matching', False):
1281                 continue
1282             t['name'] = type(self).__name__[:-len('IE')]
1283             yield t
1284
1285     def is_suitable(self, age_limit):
1286         """ Test whether the extractor is generally suitable for the given
1287         age limit (i.e. pornographic sites are not, all others usually are) """
1288
1289         any_restricted = False
1290         for tc in self.get_testcases(include_onlymatching=False):
1291             if 'playlist' in tc:
1292                 tc = tc['playlist'][0]
1293             is_restricted = age_restricted(
1294                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1295             if not is_restricted:
1296                 return True
1297             any_restricted = any_restricted or is_restricted
1298         return not any_restricted
1299
1300     def extract_subtitles(self, *args, **kwargs):
1301         if (self._downloader.params.get('writesubtitles', False) or
1302                 self._downloader.params.get('listsubtitles')):
1303             return self._get_subtitles(*args, **kwargs)
1304         return {}
1305
1306     def _get_subtitles(self, *args, **kwargs):
1307         raise NotImplementedError("This method must be implemented by subclasses")
1308
1309     @staticmethod
1310     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1311         """ Merge subtitle items for one language. Items with duplicated URLs
1312         will be dropped. """
1313         list1_urls = set([item['url'] for item in subtitle_list1])
1314         ret = list(subtitle_list1)
1315         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1316         return ret
1317
1318     @classmethod
1319     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1320         """ Merge two subtitle dictionaries, language by language. """
1321         ret = dict(subtitle_dict1)
1322         for lang in subtitle_dict2:
1323             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1324         return ret
1325
1326     def extract_automatic_captions(self, *args, **kwargs):
1327         if (self._downloader.params.get('writeautomaticsub', False) or
1328                 self._downloader.params.get('listsubtitles')):
1329             return self._get_automatic_captions(*args, **kwargs)
1330         return {}
1331
1332     def _get_automatic_captions(self, *args, **kwargs):
1333         raise NotImplementedError("This method must be implemented by subclasses")
1334
1335
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """ Return the regex of the queries handled: key, optional count or 'all', ':' and the query """
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """ Tell whether ``url`` is a search query for this extractor """
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """
        Parse the search query and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS and a
        number for that many results, capped to _MAX_RESULTS with a warning.
        Raises ExtractorError when the query is malformed.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            n = 1
        elif prefix == 'all':
            n = self._MAX_RESULTS
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            if n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """ Read-only access to _SEARCH_KEY """
        return self._SEARCH_KEY