_ Git - youtube-dl/blob - youtube_dl/YoutubeDL.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import, unicode_literals
   5
   6 import collections
   7 import contextlib
   8 import datetime
   9 import errno
  10 import fileinput
  11 import io
  12 import itertools
  13 import json
  14 import locale
  15 import operator
  16 import os
  17 import platform
  18 import re
  19 import shutil
  20 import subprocess
  21 import socket
  22 import sys
  23 import time
  24 import tokenize
  25 import traceback
  26
  27 from .compat import (
  28     compat_basestring,
  29     compat_cookiejar,
  30     compat_expanduser,
  31     compat_get_terminal_size,
  32     compat_http_client,
  33     compat_kwargs,
  34     compat_os_name,
  35     compat_str,
  36     compat_tokenize_tokenize,
  37     compat_urllib_error,
  38     compat_urllib_request,
  39     compat_urllib_request_DataHandler,
  40 )
  41 from .utils import (
  42     age_restricted,
  43     args_to_str,
  44     ContentTooShortError,
  45     date_from_str,
  46     DateRange,
  47     DEFAULT_OUTTMPL,
  48     determine_ext,
  49     determine_protocol,
  50     DownloadError,
  51     encode_compat_str,
  52     encodeFilename,
  53     error_to_compat_str,
  54     ExtractorError,
  55     format_bytes,
  56     formatSeconds,
  57     locked_file,
  58     make_HTTPS_handler,
  59     MaxDownloadsReached,
  60     PagedList,
  61     parse_filesize,
  62     PerRequestProxyHandler,
  63     platform_name,
  64     PostProcessingError,
  65     preferredencoding,
  66     prepend_extension,
  67     register_socks_protocols,
  68     render_table,
  69     replace_extension,
  70     SameFileError,
  71     sanitize_filename,
  72     sanitize_path,
  73     sanitize_url,
  74     sanitized_Request,
  75     std_headers,
  76     subtitles_filename,
  77     UnavailableVideoError,
  78     url_basename,
  79     version_tuple,
  80     write_json_file,
  81     write_string,
  82     YoutubeDLCookieProcessor,
  83     YoutubeDLHandler,
  84 )
  85 from .cache import Cache
  86 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
  87 from .downloader import get_suitable_downloader
  88 from .downloader.rtmp import rtmpdump_version
  89 from .postprocessor import (
  90     FFmpegFixupM3u8PP,
  91     FFmpegFixupM4aPP,
  92     FFmpegFixupStretchedPP,
  93     FFmpegMergerPP,
  94     FFmpegPostProcessor,
  95     get_postprocessor,
  96 )
  97 from .version import __version__
  98
  99 if compat_os_name == 'nt':
 100     import ctypes
 101
 102
 103 class YoutubeDL(object):
 104     """YoutubeDL class.
 105
  106     YoutubeDL objects are the ones responsible for downloading the
 107     actual video file and writing it to disk if the user has requested
 108     it, among some other tasks. In most cases there should be one per
 109     program. As, given a video URL, the downloader doesn't know how to
 110     extract all the needed information, task that InfoExtractors do, it
 111     has to pass the URL to one of them.
 112
 113     For this, YoutubeDL objects have a method that allows
 114     InfoExtractors to be registered in a given order. When it is passed
  115     a URL, the YoutubeDL object hands it to the first InfoExtractor it
 116     finds that reports being able to handle it. The InfoExtractor extracts
 117     all the information about the video or videos the URL refers to, and
  118     YoutubeDL processes the extracted information, possibly using a File
 119     Downloader to download the video.
 120
 121     YoutubeDL objects accept a lot of parameters. In order not to saturate
 122     the object constructor with arguments, it receives a dictionary of
 123     options instead. These options are available through the params
 124     attribute for the InfoExtractors to use. The YoutubeDL also
 125     registers itself as the downloader in charge for the InfoExtractors
 126     that are added to it, so this is a "mutual registration".
 127
 128     Available options:
 129
 130     username:          Username for authentication purposes.
 131     password:          Password for authentication purposes.
 132     videopassword:     Password for accessing a video.
 133     usenetrc:          Use netrc for authentication instead.
 134     verbose:           Print additional info to stdout.
 135     quiet:             Do not print messages to stdout.
 136     no_warnings:       Do not print out anything for warnings.
 137     forceurl:          Force printing final URL.
 138     forcetitle:        Force printing title.
 139     forceid:           Force printing ID.
 140     forcethumbnail:    Force printing thumbnail URL.
 141     forcedescription:  Force printing description.
 142     forcefilename:     Force printing final filename.
 143     forceduration:     Force printing duration.
 144     forcejson:         Force printing info_dict as JSON.
 145     dump_single_json:  Force printing the info_dict of the whole playlist
 146                        (or video) as a single JSON line.
 147     simulate:          Do not download the video files.
 148     format:            Video format code. See options.py for more information.
 149     outtmpl:           Template for output names.
 150     restrictfilenames: Do not allow "&" and spaces in file names
 151     ignoreerrors:      Do not stop on download errors.
 152     force_generic_extractor: Force downloader to use the generic extractor
 153     nooverwrites:      Prevent overwriting files.
 154     playliststart:     Playlist item to start at.
 155     playlistend:       Playlist item to end at.
 156     playlist_items:    Specific indices of playlist to download.
 157     playlistreverse:   Download playlist items in reverse order.
 158     matchtitle:        Download only matching titles.
 159     rejecttitle:       Reject downloads for matching titles.
 160     logger:            Log messages to a logging.Logger instance.
 161     logtostderr:       Log messages to stderr instead of stdout.
 162     writedescription:  Write the video description to a .description file
 163     writeinfojson:     Write the video description to a .info.json file
 164     writeannotations:  Write the video annotations to a .annotations.xml file
 165     writethumbnail:    Write the thumbnail image to a file
 166     write_all_thumbnails:  Write all thumbnail formats to files
 167     writesubtitles:    Write the video subtitles to a file
 168     writeautomaticsub: Write the automatically generated subtitles to a file
 169     allsubtitles:      Downloads all the subtitles of the video
 170                        (requires writesubtitles or writeautomaticsub)
 171     listsubtitles:     Lists all available subtitles for the video
 172     subtitlesformat:   The format code for subtitles
 173     subtitleslangs:    List of languages of the subtitles to download
 174     keepvideo:         Keep the video file after post-processing
 175     daterange:         A DateRange object, download only if the upload_date is in the range.
 176     skip_download:     Skip the actual download of the video file
 177     cachedir:          Location of the cache files in the filesystem.
 178                        False to disable filesystem cache.
 179     noplaylist:        Download single video instead of a playlist if in doubt.
 180     age_limit:         An integer representing the user's age in years.
 181                        Unsuitable videos for the given age are skipped.
 182     min_views:         An integer representing the minimum view count the video
 183                        must have in order to not be skipped.
 184                        Videos without view count information are always
 185                        downloaded. None for no limit.
 186     max_views:         An integer representing the maximum view count.
 187                        Videos that are more popular than that are not
 188                        downloaded.
 189                        Videos without view count information are always
 190                        downloaded. None for no limit.
 191     download_archive:  File name of a file where all downloads are recorded.
 192                        Videos already present in the file are not downloaded
 193                        again.
 194     cookiefile:        File name where cookies should be read from and dumped to.
 195     nocheckcertificate:Do not verify SSL certificates
 196     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
 197                        At the moment, this is only supported by YouTube.
 198     proxy:             URL of the proxy server to use
 199     cn_verification_proxy:  URL of the proxy to use for IP address verification
 200                        on Chinese sites. (Experimental)
 201     socket_timeout:    Time to wait for unresponsive hosts, in seconds
 202     bidi_workaround:   Work around buggy terminals without bidirectional text
  203                        support, using fribidi
 204     debug_printtraffic:Print out sent and received HTTP traffic
 205     include_ads:       Download ads as well
 206     default_search:    Prepend this string if an input url is not valid.
 207                        'auto' for elaborate guessing
 208     encoding:          Use this encoding instead of the system-specified.
 209     extract_flat:      Do not resolve URLs, return the immediate result.
 210                        Pass in 'in_playlist' to only show this behavior for
 211                        playlist items.
 212     postprocessors:    A list of dictionaries, each with an entry
 213                        * key:  The name of the postprocessor. See
 214                                youtube_dl/postprocessor/__init__.py for a list.
 215                        as well as any further keyword arguments for the
 216                        postprocessor.
 217     progress_hooks:    A list of functions that get called on download
 218                        progress, with a dictionary with the entries
 219                        * status: One of "downloading", "error", or "finished".
 220                                  Check this first and ignore unknown values.
 221
 222                        If status is one of "downloading", or "finished", the
 223                        following properties may also be present:
 224                        * filename: The final filename (always present)
 225                        * tmpfilename: The filename we're currently writing to
 226                        * downloaded_bytes: Bytes on disk
 227                        * total_bytes: Size of the whole file, None if unknown
 228                        * total_bytes_estimate: Guess of the eventual file size,
 229                                                None if unavailable.
 230                        * elapsed: The number of seconds since download started.
 231                        * eta: The estimated time in seconds, None if unknown
 232                        * speed: The download speed in bytes/second, None if
 233                                 unknown
 234                        * fragment_index: The counter of the currently
 235                                          downloaded video fragment.
 236                        * fragment_count: The number of fragments (= individual
 237                                          files that will be merged)
 238
 239                        Progress hooks are guaranteed to be called at least once
 240                        (with status "finished") if the download is successful.
 241     merge_output_format: Extension to use when merging formats.
 242     fixup:             Automatically correct known faults of the file.
 243                        One of:
 244                        - "never": do nothing
 245                        - "warn": only emit a warning
 246                        - "detect_or_warn": check whether we can do anything
 247                                            about it, warn otherwise (default)
 248     source_address:    (Experimental) Client-side IP address to bind to.
 249     call_home:         Boolean, true iff we are allowed to contact the
 250                        youtube-dl servers for debugging.
 251     sleep_interval:    Number of seconds to sleep before each download.
 252     listformats:       Print an overview of available video formats and exit.
 253     list_thumbnails:   Print a table of all thumbnails and exit.
 254     match_filter:      A function that gets called with the info_dict of
 255                        every video.
 256                        If it returns a message, the video is ignored.
 257                        If it returns None, the video is downloaded.
 258                        match_filter_func in utils.py is one example for this.
 259     no_color:          Do not emit color codes in output.
 260
 261     The following options determine which downloader is picked:
 262     external_downloader: Executable of the external downloader to call.
 263                        None or unset for standard (built-in) downloader.
 264     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
 265                        if True, otherwise use ffmpeg/avconv if False, otherwise
 266                        use downloader suggested by extractor if None.
 267
 268     The following parameters are not used by YoutubeDL itself, they are used by
 269     the downloader (see youtube_dl/downloader/common.py):
 270     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
 271     noresizebuffer, retries, continuedl, noprogress, consoletitle,
 272     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
 273
 274     The following options are used by the post processors:
 275     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
 276                        otherwise prefer avconv.
 277     postprocessor_args: A list of additional command-line arguments for the
 278                         postprocessor.
 279     """
 280
 281     params = None
 282     _ies = []
 283     _pps = []
 284     _download_retcode = None
 285     _num_downloads = None
 286     _screen_file = None
 287
 288     def __init__(self, params=None, auto_init=True):
 289         """Create a FileDownloader object with the given options."""
 290         if params is None:
 291             params = {}
 292         self._ies = []
 293         self._ies_instances = {}
 294         self._pps = []
 295         self._progress_hooks = []
 296         self._download_retcode = 0
 297         self._num_downloads = 0
 298         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 299         self._err_file = sys.stderr
 300         self.params = {
 301             # Default parameters
 302             'nocheckcertificate': False,
 303         }
 304         self.params.update(params)
 305         self.cache = Cache(self)
 306
 307         if params.get('bidi_workaround', False):
 308             try:
 309                 import pty
 310                 master, slave = pty.openpty()
 311                 width = compat_get_terminal_size().columns
 312                 if width is None:
 313                     width_args = []
 314                 else:
 315                     width_args = ['-w', str(width)]
 316                 sp_kwargs = dict(
 317                     stdin=subprocess.PIPE,
 318                     stdout=slave,
 319                     stderr=self._err_file)
 320                 try:
 321                     self._output_process = subprocess.Popen(
 322                         ['bidiv'] + width_args, **sp_kwargs
 323                     )
 324                 except OSError:
 325                     self._output_process = subprocess.Popen(
 326                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
 327                 self._output_channel = os.fdopen(master, 'rb')
 328             except OSError as ose:
 329                 if ose.errno == 2:
 330                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
 331                 else:
 332                     raise
 333
 334         if (sys.version_info >= (3,) and sys.platform != 'win32' and
 335                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
 336                 not params.get('restrictfilenames', False)):
 337             # On Python 3, the Unicode filesystem API will throw errors (#1474)
 338             self.report_warning(
 339                 'Assuming --restrict-filenames since file system encoding '
 340                 'cannot encode all characters. '
 341                 'Set the LC_ALL environment variable to fix this.')
 342             self.params['restrictfilenames'] = True
 343
 344         if isinstance(params.get('outtmpl'), bytes):
 345             self.report_warning(
 346                 'Parameter outtmpl is bytes, but should be a unicode string. '
 347                 'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
 348
 349         self._setup_opener()
 350
 351         if auto_init:
 352             self.print_debug_header()
 353             self.add_default_info_extractors()
 354
 355         for pp_def_raw in self.params.get('postprocessors', []):
 356             pp_class = get_postprocessor(pp_def_raw['key'])
 357             pp_def = dict(pp_def_raw)
 358             del pp_def['key']
 359             pp = pp_class(self, **compat_kwargs(pp_def))
 360             self.add_post_processor(pp)
 361
 362         for ph in self.params.get('progress_hooks', []):
 363             self.add_progress_hook(ph)
 364
 365         register_socks_protocols()
 366
 367     def warn_if_short_id(self, argv):
 368         # short YouTube ID starting with dash?
 369         idxs = [
 370             i for i, a in enumerate(argv)
 371             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
 372         if idxs:
 373             correct_argv = (
 374                 ['youtube-dl'] +
 375                 [a for i, a in enumerate(argv) if i not in idxs] +
 376                 ['--'] + [argv[i] for i in idxs]
 377             )
 378             self.report_warning(
 379                 'Long argument string detected. '
 380                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
 381                 args_to_str(correct_argv))
 382
 383     def add_info_extractor(self, ie):
 384         """Add an InfoExtractor object to the end of the list."""
 385         self._ies.append(ie)
 386         if not isinstance(ie, type):
 387             self._ies_instances[ie.ie_key()] = ie
 388             ie.set_downloader(self)
 389
 390     def get_info_extractor(self, ie_key):
 391         """
 392         Get an instance of an IE with name ie_key, it will try to get one from
 393         the _ies list, if there's no instance it will create a new one and add
 394         it to the extractor list.
 395         """
 396         ie = self._ies_instances.get(ie_key)
 397         if ie is None:
 398             ie = get_info_extractor(ie_key)()
 399             self.add_info_extractor(ie)
 400         return ie
 401
 402     def add_default_info_extractors(self):
 403         """
 404         Add the InfoExtractors returned by gen_extractors to the end of the list
 405         """
 406         for ie in gen_extractor_classes():
 407             self.add_info_extractor(ie)
 408
 409     def add_post_processor(self, pp):
 410         """Add a PostProcessor object to the end of the chain."""
 411         self._pps.append(pp)
 412         pp.set_downloader(self)
 413
 414     def add_progress_hook(self, ph):
 415         """Add the progress hook (currently only for the file downloader)"""
 416         self._progress_hooks.append(ph)
 417
 418     def _bidi_workaround(self, message):
 419         if not hasattr(self, '_output_channel'):
 420             return message
 421
 422         assert hasattr(self, '_output_process')
 423         assert isinstance(message, compat_str)
 424         line_count = message.count('\n') + 1
 425         self._output_process.stdin.write((message + '\n').encode('utf-8'))
 426         self._output_process.stdin.flush()
 427         res = ''.join(self._output_channel.readline().decode('utf-8')
 428                       for _ in range(line_count))
 429         return res[:-len('\n')]
 430
 431     def to_screen(self, message, skip_eol=False):
 432         """Print message to stdout if not in quiet mode."""
 433         return self.to_stdout(message, skip_eol, check_quiet=True)
 434
 435     def _write_string(self, s, out=None):
 436         write_string(s, out=out, encoding=self.params.get('encoding'))
 437
 438     def to_stdout(self, message, skip_eol=False, check_quiet=False):
 439         """Print message to stdout if not in quiet mode."""
 440         if self.params.get('logger'):
 441             self.params['logger'].debug(message)
 442         elif not check_quiet or not self.params.get('quiet', False):
 443             message = self._bidi_workaround(message)
 444             terminator = ['\n', ''][skip_eol]
 445             output = message + terminator
 446
 447             self._write_string(output, self._screen_file)
 448
 449     def to_stderr(self, message):
 450         """Print message to stderr."""
 451         assert isinstance(message, compat_str)
 452         if self.params.get('logger'):
 453             self.params['logger'].error(message)
 454         else:
 455             message = self._bidi_workaround(message)
 456             output = message + '\n'
 457             self._write_string(output, self._err_file)
 458
 459     def to_console_title(self, message):
 460         if not self.params.get('consoletitle', False):
 461             return
 462         if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 463             # c_wchar_p() might not be necessary if `message` is
 464             # already of type unicode()
 465             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 466         elif 'TERM' in os.environ:
 467             self._write_string('\033]0;%s\007' % message, self._screen_file)
 468
 469     def save_console_title(self):
 470         if not self.params.get('consoletitle', False):
 471             return
 472         if 'TERM' in os.environ:
 473             # Save the title on stack
 474             self._write_string('\033[22;0t', self._screen_file)
 475
 476     def restore_console_title(self):
 477         if not self.params.get('consoletitle', False):
 478             return
 479         if 'TERM' in os.environ:
 480             # Restore the title from stack
 481             self._write_string('\033[23;0t', self._screen_file)
 482
 483     def __enter__(self):
 484         self.save_console_title()
 485         return self
 486
 487     def __exit__(self, *args):
 488         self.restore_console_title()
 489
 490         if self.params.get('cookiefile') is not None:
 491             self.cookiejar.save()
 492
 493     def trouble(self, message=None, tb=None):
 494         """Determine action to take when a download problem appears.
 495
 496         Depending on if the downloader has been configured to ignore
 497         download errors or not, this method may throw an exception or
 498         not when errors are found, after printing the message.
 499
 500         tb, if given, is additional traceback information.
 501         """
 502         if message is not None:
 503             self.to_stderr(message)
 504         if self.params.get('verbose'):
 505             if tb is None:
 506                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
 507                     tb = ''
 508                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 509                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
 510                     tb += encode_compat_str(traceback.format_exc())
 511                 else:
 512                     tb_data = traceback.format_list(traceback.extract_stack())
 513                     tb = ''.join(tb_data)
 514             self.to_stderr(tb)
 515         if not self.params.get('ignoreerrors', False):
 516             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 517                 exc_info = sys.exc_info()[1].exc_info
 518             else:
 519                 exc_info = sys.exc_info()
 520             raise DownloadError(message, exc_info)
 521         self._download_retcode = 1
 522
 523     def report_warning(self, message):
 524         '''
 525         Print the message to stderr, it will be prefixed with 'WARNING:'
 526         If stderr is a tty file the 'WARNING:' will be colored
 527         '''
 528         if self.params.get('logger') is not None:
 529             self.params['logger'].warning(message)
 530         else:
 531             if self.params.get('no_warnings'):
 532                 return
 533             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
 534                 _msg_header = '\033[0;33mWARNING:\033[0m'
 535             else:
 536                 _msg_header = 'WARNING:'
 537             warning_message = '%s %s' % (_msg_header, message)
 538             self.to_stderr(warning_message)
 539
 540     def report_error(self, message, tb=None):
 541         '''
 542         Do the same as trouble, but prefixes the message with 'ERROR:', colored
 543         in red if stderr is a tty file.
 544         '''
 545         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
 546             _msg_header = '\033[0;31mERROR:\033[0m'
 547         else:
 548             _msg_header = 'ERROR:'
 549         error_message = '%s %s' % (_msg_header, message)
 550         self.trouble(error_message, tb)
 551
 552     def report_file_already_downloaded(self, file_name):
 553         """Report file has already been fully downloaded."""
 554         try:
 555             self.to_screen('[download] %s has already been downloaded' % file_name)
 556         except UnicodeEncodeError:
 557             self.to_screen('[download] The file has already been downloaded')
 558
 559     def prepare_filename(self, info_dict):
 560         """Generate the output filename."""
 561         try:
 562             template_dict = dict(info_dict)
 563
 564             template_dict['epoch'] = int(time.time())
 565             autonumber_size = self.params.get('autonumber_size')
 566             if autonumber_size is None:
 567                 autonumber_size = 5
 568             autonumber_templ = '%0' + str(autonumber_size) + 'd'
 569             template_dict['autonumber'] = autonumber_templ % self._num_downloads
 570             if template_dict.get('playlist_index') is not None:
 571                 template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
 572             if template_dict.get('resolution') is None:
 573                 if template_dict.get('width') and template_dict.get('height'):
 574                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
 575                 elif template_dict.get('height'):
 576                     template_dict['resolution'] = '%sp' % template_dict['height']
 577                 elif template_dict.get('width'):
 578                     template_dict['resolution'] = '%dx?' % template_dict['width']
 579
 580             sanitize = lambda k, v: sanitize_filename(
 581                 compat_str(v),
 582                 restricted=self.params.get('restrictfilenames'),
 583                 is_id=(k == 'id'))
 584             template_dict = dict((k, sanitize(k, v))
 585                                  for k, v in template_dict.items()
 586                                  if v is not None and not isinstance(v, (list, tuple, dict)))
 587             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
 588
 589             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
 590             tmpl = compat_expanduser(outtmpl)
 591             filename = tmpl % template_dict
 592             # Temporary fix for #4787
 593             # 'Treat' all problem characters by passing filename through preferredencoding
 594             # to workaround encoding issues with subprocess on python2 @ Windows
 595             if sys.version_info < (3, 0) and sys.platform == 'win32':
 596                 filename = encodeFilename(filename, True).decode(preferredencoding())
 597             return sanitize_path(filename)
 598         except ValueError as err:
 599             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
 600             return None
 601
 602     def _match_entry(self, info_dict, incomplete):
 603         """ Returns None iff the file should be downloaded """
 604
 605         video_title = info_dict.get('title', info_dict.get('id', 'video'))
 606         if 'title' in info_dict:
 607             # This can happen when we're just evaluating the playlist
 608             title = info_dict['title']
 609             matchtitle = self.params.get('matchtitle', False)
 610             if matchtitle:
 611                 if not re.search(matchtitle, title, re.IGNORECASE):
 612                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
 613             rejecttitle = self.params.get('rejecttitle', False)
 614             if rejecttitle:
 615                 if re.search(rejecttitle, title, re.IGNORECASE):
 616                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 617         date = info_dict.get('upload_date')
 618         if date is not None:
 619             dateRange = self.params.get('daterange', DateRange())
 620             if date not in dateRange:
 621                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
 622         view_count = info_dict.get('view_count')
 623         if view_count is not None:
 624             min_views = self.params.get('min_views')
 625             if min_views is not None and view_count < min_views:
 626                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
 627             max_views = self.params.get('max_views')
 628             if max_views is not None and view_count > max_views:
 629                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
 630         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
 631             return 'Skipping "%s" because it is age restricted' % video_title
 632         if self.in_download_archive(info_dict):
 633             return '%s has already been recorded in archive' % video_title
 634
 635         if not incomplete:
 636             match_filter = self.params.get('match_filter')
 637             if match_filter is not None:
 638                 ret = match_filter(info_dict)
 639                 if ret is not None:
 640                     return ret
 641
 642         return None
 643
 644     @staticmethod
 645     def add_extra_info(info_dict, extra_info):
 646         '''Set the keys from extra_info in info dict if they are missing'''
 647         for key, value in extra_info.items():
 648             info_dict.setdefault(key, value)
 649
 650     def extract_info(self, url, download=True, ie_key=None, extra_info={},
 651                      process=True, force_generic_extractor=False):
 652         '''
 653         Returns a list with a dictionary for each video we find.
 654         If 'download', also downloads the videos.
 655         extra_info is a dict containing the extra values to add to each result
 656         '''
 657
 658         if not ie_key and force_generic_extractor:
 659             ie_key = 'Generic'
 660
 661         if ie_key:
 662             ies = [self.get_info_extractor(ie_key)]
 663         else:
 664             ies = self._ies
 665
 666         for ie in ies:
 667             if not ie.suitable(url):
 668                 continue
 669
 670             ie = self.get_info_extractor(ie.ie_key())
 671             if not ie.working():
 672                 self.report_warning('The program functionality for this site has been marked as broken, '
 673                                     'and will probably not work.')
 674
 675             try:
 676                 ie_result = ie.extract(url)
 677                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
 678                     break
 679                 if isinstance(ie_result, list):
 680                     # Backwards compatibility: old IE result format
 681                     ie_result = {
 682                         '_type': 'compat_list',
 683                         'entries': ie_result,
 684                     }
 685                 self.add_default_extra_info(ie_result, ie, url)
 686                 if process:
 687                     return self.process_ie_result(ie_result, download, extra_info)
 688                 else:
 689                     return ie_result
 690             except ExtractorError as e:  # An error we somewhat expected
 691                 self.report_error(compat_str(e), e.format_traceback())
 692                 break
 693             except MaxDownloadsReached:
 694                 raise
 695             except Exception as e:
 696                 if self.params.get('ignoreerrors', False):
 697                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
 698                     break
 699                 else:
 700                     raise
 701         else:
 702             self.report_error('no suitable InfoExtractor for URL %s' % url)
 703
 704     def add_default_extra_info(self, ie_result, ie, url):
 705         self.add_extra_info(ie_result, {
 706             'extractor': ie.IE_NAME,
 707             'webpage_url': url,
 708             'webpage_url_basename': url_basename(url),
 709             'extractor_key': ie.ie_key(),
 710         })
 711
 712     def process_ie_result(self, ie_result, download=True, extra_info={}):
 713         """
 714         Take the result of the ie(may be modified) and resolve all unresolved
 715         references (URLs, playlist items).
 716
 717         It will also download the videos if 'download'.
 718         Returns the resolved ie_result.
 719         """
 720         result_type = ie_result.get('_type', 'video')
 721
 722         if result_type in ('url', 'url_transparent'):
 723             extract_flat = self.params.get('extract_flat', False)
 724             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
 725                     extract_flat is True):
 726                 if self.params.get('forcejson', False):
 727                     self.to_stdout(json.dumps(ie_result))
 728                 return ie_result
 729
 730         if result_type == 'video':
 731             self.add_extra_info(ie_result, extra_info)
 732             return self.process_video_result(ie_result, download=download)
 733         elif result_type == 'url':
 734             # We have to add extra_info to the results because it may be
 735             # contained in a playlist
 736             return self.extract_info(ie_result['url'],
 737                                      download,
 738                                      ie_key=ie_result.get('ie_key'),
 739                                      extra_info=extra_info)
 740         elif result_type == 'url_transparent':
 741             # Use the information from the embedding page
 742             info = self.extract_info(
 743                 ie_result['url'], ie_key=ie_result.get('ie_key'),
 744                 extra_info=extra_info, download=False, process=False)
 745
 746             force_properties = dict(
 747                 (k, v) for k, v in ie_result.items() if v is not None)
 748             for f in ('_type', 'url', 'ie_key'):
 749                 if f in force_properties:
 750                     del force_properties[f]
 751             new_result = info.copy()
 752             new_result.update(force_properties)
 753
 754             assert new_result.get('_type') != 'url_transparent'
 755
 756             return self.process_ie_result(
 757                 new_result, download=download, extra_info=extra_info)
 758         elif result_type == 'playlist' or result_type == 'multi_video':
 759             # We process each entry in the playlist
 760             playlist = ie_result.get('title') or ie_result.get('id')
 761             self.to_screen('[download] Downloading playlist: %s' % playlist)
 762
 763             playlist_results = []
 764
 765             playliststart = self.params.get('playliststart', 1) - 1
 766             playlistend = self.params.get('playlistend')
 767             # For backwards compatibility, interpret -1 as whole list
 768             if playlistend == -1:
 769                 playlistend = None
 770
 771             playlistitems_str = self.params.get('playlist_items')
 772             playlistitems = None
 773             if playlistitems_str is not None:
 774                 def iter_playlistitems(format):
 775                     for string_segment in format.split(','):
 776                         if '-' in string_segment:
 777                             start, end = string_segment.split('-')
 778                             for item in range(int(start), int(end) + 1):
 779                                 yield int(item)
 780                         else:
 781                             yield int(string_segment)
 782                 playlistitems = iter_playlistitems(playlistitems_str)
 783
 784             ie_entries = ie_result['entries']
 785             if isinstance(ie_entries, list):
 786                 n_all_entries = len(ie_entries)
 787                 if playlistitems:
 788                     entries = [
 789                         ie_entries[i - 1] for i in playlistitems
 790                         if -n_all_entries <= i - 1 < n_all_entries]
 791                 else:
 792                     entries = ie_entries[playliststart:playlistend]
 793                 n_entries = len(entries)
 794                 self.to_screen(
 795                     '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
 796                     (ie_result['extractor'], playlist, n_all_entries, n_entries))
 797             elif isinstance(ie_entries, PagedList):
 798                 if playlistitems:
 799                     entries = []
 800                     for item in playlistitems:
 801                         entries.extend(ie_entries.getslice(
 802                             item - 1, item
 803                         ))
 804                 else:
 805                     entries = ie_entries.getslice(
 806                         playliststart, playlistend)
 807                 n_entries = len(entries)
 808                 self.to_screen(
 809                     '[%s] playlist %s: Downloading %d videos' %
 810                     (ie_result['extractor'], playlist, n_entries))
 811             else:  # iterable
 812                 if playlistitems:
 813                     entry_list = list(ie_entries)
 814                     entries = [entry_list[i - 1] for i in playlistitems]
 815                 else:
 816                     entries = list(itertools.islice(
 817                         ie_entries, playliststart, playlistend))
 818                 n_entries = len(entries)
 819                 self.to_screen(
 820                     '[%s] playlist %s: Downloading %d videos' %
 821                     (ie_result['extractor'], playlist, n_entries))
 822
 823             if self.params.get('playlistreverse', False):
 824                 entries = entries[::-1]
 825
 826             for i, entry in enumerate(entries, 1):
 827                 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
 828                 extra = {
 829                     'n_entries': n_entries,
 830                     'playlist': playlist,
 831                     'playlist_id': ie_result.get('id'),
 832                     'playlist_title': ie_result.get('title'),
 833                     'playlist_index': i + playliststart,
 834                     'extractor': ie_result['extractor'],
 835                     'webpage_url': ie_result['webpage_url'],
 836                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
 837                     'extractor_key': ie_result['extractor_key'],
 838                 }
 839
 840                 reason = self._match_entry(entry, incomplete=True)
 841                 if reason is not None:
 842                     self.to_screen('[download] ' + reason)
 843                     continue
 844
 845                 entry_result = self.process_ie_result(entry,
 846                                                       download=download,
 847                                                       extra_info=extra)
 848                 playlist_results.append(entry_result)
 849             ie_result['entries'] = playlist_results
 850             self.to_screen('[download] Finished downloading playlist: %s' % playlist)
 851             return ie_result
 852         elif result_type == 'compat_list':
 853             self.report_warning(
 854                 'Extractor %s returned a compat_list result. '
 855                 'It needs to be updated.' % ie_result.get('extractor'))
 856
 857             def _fixup(r):
 858                 self.add_extra_info(
 859                     r,
 860                     {
 861                         'extractor': ie_result['extractor'],
 862                         'webpage_url': ie_result['webpage_url'],
 863                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
 864                         'extractor_key': ie_result['extractor_key'],
 865                     }
 866                 )
 867                 return r
 868             ie_result['entries'] = [
 869                 self.process_ie_result(_fixup(r), download, extra_info)
 870                 for r in ie_result['entries']
 871             ]
 872             return ie_result
 873         else:
 874             raise Exception('Invalid result type: %s' % result_type)
 875
 876     def _build_format_filter(self, filter_spec):
 877         " Returns a function to filter the formats according to the filter_spec "
 878
 879         OPERATORS = {
 880             '<': operator.lt,
 881             '<=': operator.le,
 882             '>': operator.gt,
 883             '>=': operator.ge,
 884             '=': operator.eq,
 885             '!=': operator.ne,
 886         }
 887         operator_rex = re.compile(r'''(?x)\s*
 888             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
 889             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
 890             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
 891             $
 892             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
 893         m = operator_rex.search(filter_spec)
 894         if m:
 895             try:
 896                 comparison_value = int(m.group('value'))
 897             except ValueError:
 898                 comparison_value = parse_filesize(m.group('value'))
 899                 if comparison_value is None:
 900                     comparison_value = parse_filesize(m.group('value') + 'B')
 901                 if comparison_value is None:
 902                     raise ValueError(
 903                         'Invalid value %r in format specification %r' % (
 904                             m.group('value'), filter_spec))
 905             op = OPERATORS[m.group('op')]
 906
 907         if not m:
 908             STR_OPERATORS = {
 909                 '=': operator.eq,
 910                 '!=': operator.ne,
 911                 '^=': lambda attr, value: attr.startswith(value),
 912                 '$=': lambda attr, value: attr.endswith(value),
 913                 '*=': lambda attr, value: value in attr,
 914             }
 915             str_operator_rex = re.compile(r'''(?x)
 916                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
 917                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
 918                 \s*(?P<value>[a-zA-Z0-9._-]+)
 919                 \s*$
 920                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
 921             m = str_operator_rex.search(filter_spec)
 922             if m:
 923                 comparison_value = m.group('value')
 924                 op = STR_OPERATORS[m.group('op')]
 925
 926         if not m:
 927             raise ValueError('Invalid filter specification %r' % filter_spec)
 928
 929         def _filter(f):
 930             actual_value = f.get(m.group('key'))
 931             if actual_value is None:
 932                 return m.group('none_inclusive')
 933             return op(actual_value, comparison_value)
 934         return _filter
 935
    def build_format_selector(self, format_spec):
        '''
        Compile the format selection expression format_spec into a function.

        format_spec is tokenized with compat_tokenize_tokenize and parsed
        into a tree of FormatSelector tuples. As implemented by the parser
        below:
        ',' separates selectors whose results are all produced, in order
        '/' produces the results of the first alternative that yields any
        '+' merges a video format with an audio format
        '(...)' groups selectors
        '[...]' attaches a filter (see _build_format_filter) to the
                selector in front of it, or to 'best' if there is none

        Returns a function that takes the available formats and returns an
        iterable of the selected format dicts.
        Raises SyntaxError if format_spec is malformed; a ValueError from
        _build_format_filter propagates for an invalid filter.
        '''
        def syntax_error(note, start):
            # Build (without raising) a SyntaxError whose message shows
            # format_spec with a caret placed at column start[1]
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Node types of the parsed selector tree
        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        def _parse_filter(tokens):
            # Consume tokens up to the closing ']' and return their strings
            # joined together. Falls off the end (returning None) if the
            # tokens run out before a ']' is seen.
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            # NOTE: last_line is never reassigned below, so the line field of
            # every joined NAME token that is yielded is None
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Accumulate names, numbers and unused operators into a
                    # single pending string
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive descent parser: returns a list of FormatSelector
            # tuples. The inside_* flags tell a nested call which tokens end
            # it; such a token is pushed back with restore_last_token() so
            # that the caller sees it again.
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        # A filter without a preceding selector applies to 'best'
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        video_selector = current_selector
                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
                        if not video_selector or not audio_selector:
                            raise syntax_error('"+" must be between two format selectors', start)
                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _build_selector_function(selector):
            # Turn a parsed selector (a FormatSelector or a list of them)
            # into a function mapping the available formats to the selected
            # ones. A list is returned directly, without a filter stage.
            if isinstance(selector, list):
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(formats):
                    for f in fs:
                        for format in f(formats):
                            yield format
                return selector_function
            elif selector.type == GROUP:
                selector_function = _build_selector_function(selector.selector)
            elif selector.type == PICKFIRST:
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(formats):
                    # The first alternative that selects anything wins
                    for f in fs:
                        picked_formats = list(f(formats))
                        if picked_formats:
                            return picked_formats
                    return []
            elif selector.type == SINGLE:
                format_spec = selector.selector

                def selector_function(formats):
                    # NOTE(review): index 0 is used for 'worst' and -1 for
                    # 'best', so formats is presumably ordered from worst to
                    # best - confirm against the caller
                    formats = list(formats)
                    if not formats:
                        return
                    if format_spec == 'all':
                        for f in formats:
                            yield f
                    elif format_spec in ['best', 'worst', None]:
                        format_idx = 0 if format_spec == 'worst' else -1
                        audiovideo_formats = [
                            f for f in formats
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            yield audiovideo_formats[format_idx]
                        # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
                        elif (all(f.get('acodec') != 'none' for f in formats) or
                              all(f.get('vcodec') != 'none' for f in formats)):
                            yield formats[format_idx]
                    elif format_spec == 'bestaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[-1]
                    elif format_spec == 'worstaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[0]
                    elif format_spec == 'bestvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[-1]
                    elif format_spec == 'worstvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[0]
                    else:
                        # A known extension selects by 'ext', anything else
                        # is taken as a format_id; the last match is yielded
                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                        if format_spec in extensions:
                            filter_f = lambda f: f['ext'] == format_spec
                        else:
                            filter_f = lambda f: f['format_id'] == format_spec
                        matches = list(filter(filter_f, formats))
                        if matches:
                            yield matches[-1]
            elif selector.type == MERGE:
                def _merge(formats_info):
                    # Combine a (video, audio) pair of formats into a single
                    # format dict. Returns None after reporting an error if
                    # the pair is not acceptable.
                    format_1, format_2 = [f['format_id'] for f in formats_info]
                    # The first format must contain the video and the
                    # second the audio
                    if formats_info[0].get('vcodec') == 'none':
                        self.report_error('The first format must '
                                          'contain the video, try using '
                                          '"-f %s+%s"' % (format_2, format_1))
                        return
                    # Formats must be opposite (video+audio)
                    if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
                        self.report_error(
                            'Both formats %s and %s are video-only, you must specify "-f video+audio"'
                            % (format_1, format_2))
                        return
                    # The 'merge_output_format' param overrides the
                    # extension of the video format
                    output_ext = (
                        formats_info[0]['ext']
                        if self.params.get('merge_output_format') is None
                        else self.params['merge_output_format'])
                    return {
                        'requested_formats': formats_info,
                        'format': '%s+%s' % (formats_info[0].get('format'),
                                             formats_info[1].get('format')),
                        'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                formats_info[1].get('format_id')),
                        'width': formats_info[0].get('width'),
                        'height': formats_info[0].get('height'),
                        'resolution': formats_info[0].get('resolution'),
                        'fps': formats_info[0].get('fps'),
                        'vcodec': formats_info[0].get('vcodec'),
                        'vbr': formats_info[0].get('vbr'),
                        'stretched_ratio': formats_info[0].get('stretched_ratio'),
                        'acodec': formats_info[1].get('acodec'),
                        'abr': formats_info[1].get('abr'),
                        'ext': output_ext,
                    }
                video_selector, audio_selector = map(_build_selector_function, selector.selector)

                def selector_function(formats):
                    # Every video/audio combination is merged; note that the
                    # None returned by a rejected _merge is yielded as well
                    formats = list(formats)
                    for pair in itertools.product(video_selector(formats), audio_selector(formats)):
                        yield _merge(pair)

            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(formats):
                # Apply the selector's filters first, then select from what
                # is left
                for _filter in filters:
                    formats = list(filter(_filter, formats))
                return selector_function(formats)
            return final_selector

        # The tokenizer reads lines of bytes, so feed it the encoded spec
        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Iterator over the token list that can step back one token, so
            # that a nested parser call can hand a token back to its caller
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            # Python 2 spelling of the iterator protocol
            next = __next__

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
1198
1199     def _calc_headers(self, info_dict):
1200         res = std_headers.copy()
1201
1202         add_headers = info_dict.get('http_headers')
1203         if add_headers:
1204             res.update(add_headers)
1205
1206         cookies = self._calc_cookies(info_dict)
1207         if cookies:
1208             res['Cookie'] = cookies
1209
1210         return res
1211
1212     def _calc_cookies(self, info_dict):
1213         pr = sanitized_Request(info_dict['url'])
1214         self.cookiejar.add_cookie_header(pr)
1215         return pr.get_header('Cookie')
1216
1217     def process_video_result(self, info_dict, download=True):
1218         assert info_dict.get('_type', 'video') == 'video'
1219
1220         if 'id' not in info_dict:
1221             raise ExtractorError('Missing "id" field in extractor result')
1222         if 'title' not in info_dict:
1223             raise ExtractorError('Missing "title" field in extractor result')
1224
1225         if 'playlist' not in info_dict:
1226             # It isn't part of a playlist
1227             info_dict['playlist'] = None
1228             info_dict['playlist_index'] = None
1229
1230         thumbnails = info_dict.get('thumbnails')
1231         if thumbnails is None:
1232             thumbnail = info_dict.get('thumbnail')
1233             if thumbnail:
1234                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1235         if thumbnails:
1236             thumbnails.sort(key=lambda t: (
1237                 t.get('preference'), t.get('width'), t.get('height'),
1238                 t.get('id'), t.get('url')))
1239             for i, t in enumerate(thumbnails):
1240                 t['url'] = sanitize_url(t['url'])
1241                 if t.get('width') and t.get('height'):
1242                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
1243                 if t.get('id') is None:
1244                     t['id'] = '%d' % i
1245
1246         if self.params.get('list_thumbnails'):
1247             self.list_thumbnails(info_dict)
1248             return
1249
1250         thumbnail = info_dict.get('thumbnail')
1251         if thumbnail:
1252             info_dict['thumbnail'] = sanitize_url(thumbnail)
1253         elif thumbnails:
1254             info_dict['thumbnail'] = thumbnails[-1]['url']
1255
1256         if 'display_id' not in info_dict and 'id' in info_dict:
1257             info_dict['display_id'] = info_dict['id']
1258
1259         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1260             # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1261             # see http://bugs.python.org/issue1646728)
1262             try:
1263                 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1264                 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1265             except (ValueError, OverflowError, OSError):
1266                 pass
1267
1268         # Auto generate title fields corresponding to the *_number fields when missing
1269         # in order to always have clean titles. This is very common for TV series.
1270         for field in ('chapter', 'season', 'episode'):
1271             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1272                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1273
1274         subtitles = info_dict.get('subtitles')
1275         if subtitles:
1276             for _, subtitle in subtitles.items():
1277                 for subtitle_format in subtitle:
1278                     if subtitle_format.get('url'):
1279                         subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1280                     if 'ext' not in subtitle_format:
1281                         subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1282
1283         if self.params.get('listsubtitles', False):
1284             if 'automatic_captions' in info_dict:
1285                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1286             self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1287             return
1288         info_dict['requested_subtitles'] = self.process_subtitles(
1289             info_dict['id'], subtitles,
1290             info_dict.get('automatic_captions'))
1291
1292         # We now pick which formats have to be downloaded
1293         if info_dict.get('formats') is None:
1294             # There's only one format available
1295             formats = [info_dict]
1296         else:
1297             formats = info_dict['formats']
1298
1299         if not formats:
1300             raise ExtractorError('No video formats found!')
1301
1302         formats_dict = {}
1303
1304         # We check that all the formats have the format and format_id fields
1305         for i, format in enumerate(formats):
1306             if 'url' not in format:
1307                 raise ExtractorError('Missing "url" key in result (index %d)' % i)
1308
1309             format['url'] = sanitize_url(format['url'])
1310
1311             if format.get('format_id') is None:
1312                 format['format_id'] = compat_str(i)
1313             else:
1314                 # Sanitize format_id from characters used in format selector expression
1315                 format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id'])
1316             format_id = format['format_id']
1317             if format_id not in formats_dict:
1318                 formats_dict[format_id] = []
1319             formats_dict[format_id].append(format)
1320
1321         # Make sure all formats have unique format_id
1322         for format_id, ambiguous_formats in formats_dict.items():
1323             if len(ambiguous_formats) > 1:
1324                 for i, format in enumerate(ambiguous_formats):
1325                     format['format_id'] = '%s-%d' % (format_id, i)
1326
1327         for i, format in enumerate(formats):
1328             if format.get('format') is None:
1329                 format['format'] = '{id} - {res}{note}'.format(
1330                     id=format['format_id'],
1331                     res=self.format_resolution(format),
1332                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1333                 )
1334             # Automatically determine file extension if missing
1335             if 'ext' not in format:
1336                 format['ext'] = determine_ext(format['url']).lower()
1337             # Automatically determine protocol if missing (useful for format
1338             # selection purposes)
1339             if 'protocol' not in format:
1340                 format['protocol'] = determine_protocol(format)
1341             # Add HTTP headers, so that external programs can use them from the
1342             # json output
1343             full_format_info = info_dict.copy()
1344             full_format_info.update(format)
1345             format['http_headers'] = self._calc_headers(full_format_info)
1346
1347         # TODO Central sorting goes here
1348
1349         if formats[0] is not info_dict:
1350             # only set the 'formats' fields if the original info_dict list them
1351             # otherwise we end up with a circular reference, the first (and unique)
1352             # element in the 'formats' field in info_dict is info_dict itself,
1353             # which can't be exported to json
1354             info_dict['formats'] = formats
1355         if self.params.get('listformats'):
1356             self.list_formats(info_dict)
1357             return
1358
1359         req_format = self.params.get('format')
1360         if req_format is None:
1361             req_format_list = []
1362             if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
1363                     not info_dict.get('is_live')):
1364                 merger = FFmpegMergerPP(self)
1365                 if merger.available and merger.can_merge():
1366                     req_format_list.append('bestvideo+bestaudio')
1367             req_format_list.append('best')
1368             req_format = '/'.join(req_format_list)
1369         format_selector = self.build_format_selector(req_format)
1370         formats_to_download = list(format_selector(formats))
1371         if not formats_to_download:
1372             raise ExtractorError('requested format not available',
1373                                  expected=True)
1374
1375         if download:
1376             if len(formats_to_download) > 1:
1377                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1378             for format in formats_to_download:
1379                 new_info = dict(info_dict)
1380                 new_info.update(format)
1381                 self.process_info(new_info)
1382         # We update the info dict with the best quality format (backwards compatibility)
1383         info_dict.update(formats_to_download[-1])
1384         return info_dict
1385
1386     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1387         """Select the requested subtitles and their format"""
1388         available_subs = {}
1389         if normal_subtitles and self.params.get('writesubtitles'):
1390             available_subs.update(normal_subtitles)
1391         if automatic_captions and self.params.get('writeautomaticsub'):
1392             for lang, cap_info in automatic_captions.items():
1393                 if lang not in available_subs:
1394                     available_subs[lang] = cap_info
1395
1396         if (not self.params.get('writesubtitles') and not
1397                 self.params.get('writeautomaticsub') or not
1398                 available_subs):
1399             return None
1400
1401         if self.params.get('allsubtitles', False):
1402             requested_langs = available_subs.keys()
1403         else:
1404             if self.params.get('subtitleslangs', False):
1405                 requested_langs = self.params.get('subtitleslangs')
1406             elif 'en' in available_subs:
1407                 requested_langs = ['en']
1408             else:
1409                 requested_langs = [list(available_subs.keys())[0]]
1410
1411         formats_query = self.params.get('subtitlesformat', 'best')
1412         formats_preference = formats_query.split('/') if formats_query else []
1413         subs = {}
1414         for lang in requested_langs:
1415             formats = available_subs.get(lang)
1416             if formats is None:
1417                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1418                 continue
1419             for ext in formats_preference:
1420                 if ext == 'best':
1421                     f = formats[-1]
1422                     break
1423                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1424                 if matches:
1425                     f = matches[-1]
1426                     break
1427             else:
1428                 f = formats[-1]
1429                 self.report_warning(
1430                     'No subtitle format found matching "%s" for language %s, '
1431                     'using %s' % (formats_query, lang, f['ext']))
1432             subs[lang] = f
1433         return subs
1434
1435     def process_info(self, info_dict):
1436         """Process a single resolved IE result."""
1437
1438         assert info_dict.get('_type', 'video') == 'video'
1439
1440         max_downloads = self.params.get('max_downloads')
1441         if max_downloads is not None:
1442             if self._num_downloads >= int(max_downloads):
1443                 raise MaxDownloadsReached()
1444
1445         info_dict['fulltitle'] = info_dict['title']
1446         if len(info_dict['title']) > 200:
1447             info_dict['title'] = info_dict['title'][:197] + '...'
1448
1449         if 'format' not in info_dict:
1450             info_dict['format'] = info_dict['ext']
1451
1452         reason = self._match_entry(info_dict, incomplete=False)
1453         if reason is not None:
1454             self.to_screen('[download] ' + reason)
1455             return
1456
1457         self._num_downloads += 1
1458
1459         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1460
1461         # Forced printings
1462         if self.params.get('forcetitle', False):
1463             self.to_stdout(info_dict['fulltitle'])
1464         if self.params.get('forceid', False):
1465             self.to_stdout(info_dict['id'])
1466         if self.params.get('forceurl', False):
1467             if info_dict.get('requested_formats') is not None:
1468                 for f in info_dict['requested_formats']:
1469                     self.to_stdout(f['url'] + f.get('play_path', ''))
1470             else:
1471                 # For RTMP URLs, also include the playpath
1472                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1473         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1474             self.to_stdout(info_dict['thumbnail'])
1475         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1476             self.to_stdout(info_dict['description'])
1477         if self.params.get('forcefilename', False) and filename is not None:
1478             self.to_stdout(filename)
1479         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1480             self.to_stdout(formatSeconds(info_dict['duration']))
1481         if self.params.get('forceformat', False):
1482             self.to_stdout(info_dict['format'])
1483         if self.params.get('forcejson', False):
1484             self.to_stdout(json.dumps(info_dict))
1485
1486         # Do nothing else if in simulate mode
1487         if self.params.get('simulate', False):
1488             return
1489
1490         if filename is None:
1491             return
1492
1493         try:
1494             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1495             if dn and not os.path.exists(dn):
1496                 os.makedirs(dn)
1497         except (OSError, IOError) as err:
1498             self.report_error('unable to create directory ' + error_to_compat_str(err))
1499             return
1500
1501         if self.params.get('writedescription', False):
1502             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1503             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1504                 self.to_screen('[info] Video description is already present')
1505             elif info_dict.get('description') is None:
1506                 self.report_warning('There\'s no description to write.')
1507             else:
1508                 try:
1509                     self.to_screen('[info] Writing video description to: ' + descfn)
1510                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1511                         descfile.write(info_dict['description'])
1512                 except (OSError, IOError):
1513                     self.report_error('Cannot write description file ' + descfn)
1514                     return
1515
1516         if self.params.get('writeannotations', False):
1517             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1518             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1519                 self.to_screen('[info] Video annotations are already present')
1520             else:
1521                 try:
1522                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1523                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1524                         annofile.write(info_dict['annotations'])
1525                 except (KeyError, TypeError):
1526                     self.report_warning('There are no annotations to write.')
1527                 except (OSError, IOError):
1528                     self.report_error('Cannot write annotations file: ' + annofn)
1529                     return
1530
1531         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1532                                        self.params.get('writeautomaticsub')])
1533
1534         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1535             # subtitles download errors are already managed as troubles in relevant IE
1536             # that way it will silently go on when used with unsupporting IE
1537             subtitles = info_dict['requested_subtitles']
1538             ie = self.get_info_extractor(info_dict['extractor_key'])
1539             for sub_lang, sub_info in subtitles.items():
1540                 sub_format = sub_info['ext']
1541                 if sub_info.get('data') is not None:
1542                     sub_data = sub_info['data']
1543                 else:
1544                     try:
1545                         sub_data = ie._download_webpage(
1546                             sub_info['url'], info_dict['id'], note=False)
1547                     except ExtractorError as err:
1548                         self.report_warning('Unable to download subtitle for "%s": %s' %
1549                                             (sub_lang, error_to_compat_str(err.cause)))
1550                         continue
1551                 try:
1552                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1553                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1554                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1555                     else:
1556                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1557                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
1558                             subfile.write(sub_data)
1559                 except (OSError, IOError):
1560                     self.report_error('Cannot write subtitles file ' + sub_filename)
1561                     return
1562
1563         if self.params.get('writeinfojson', False):
1564             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1565             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1566                 self.to_screen('[info] Video description metadata is already present')
1567             else:
1568                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1569                 try:
1570                     write_json_file(self.filter_requested_info(info_dict), infofn)
1571                 except (OSError, IOError):
1572                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1573                     return
1574
1575         self._write_thumbnails(info_dict, filename)
1576
1577         if not self.params.get('skip_download', False):
1578             try:
1579                 def dl(name, info):
1580                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1581                     for ph in self._progress_hooks:
1582                         fd.add_progress_hook(ph)
1583                     if self.params.get('verbose'):
1584                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1585                     return fd.download(name, info)
1586
1587                 if info_dict.get('requested_formats') is not None:
1588                     downloaded = []
1589                     success = True
1590                     merger = FFmpegMergerPP(self)
1591                     if not merger.available:
1592                         postprocessors = []
1593                         self.report_warning('You have requested multiple '
1594                                             'formats but ffmpeg or avconv are not installed.'
1595                                             ' The formats won\'t be merged.')
1596                     else:
1597                         postprocessors = [merger]
1598
1599                     def compatible_formats(formats):
1600                         video, audio = formats
1601                         # Check extension
1602                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1603                         if video_ext and audio_ext:
1604                             COMPATIBLE_EXTS = (
1605                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
1606                                 ('webm')
1607                             )
1608                             for exts in COMPATIBLE_EXTS:
1609                                 if video_ext in exts and audio_ext in exts:
1610                                     return True
1611                         # TODO: Check acodec/vcodec
1612                         return False
1613
1614                     filename_real_ext = os.path.splitext(filename)[1][1:]
1615                     filename_wo_ext = (
1616                         os.path.splitext(filename)[0]
1617                         if filename_real_ext == info_dict['ext']
1618                         else filename)
1619                     requested_formats = info_dict['requested_formats']
1620                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1621                         info_dict['ext'] = 'mkv'
1622                         self.report_warning(
1623                             'Requested formats are incompatible for merge and will be merged into mkv.')
1624                     # Ensure filename always has a correct extension for successful merge
1625                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1626                     if os.path.exists(encodeFilename(filename)):
1627                         self.to_screen(
1628                             '[download] %s has already been downloaded and '
1629                             'merged' % filename)
1630                     else:
1631                         for f in requested_formats:
1632                             new_info = dict(info_dict)
1633                             new_info.update(f)
1634                             fname = self.prepare_filename(new_info)
1635                             fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1636                             downloaded.append(fname)
1637                             partial_success = dl(fname, new_info)
1638                             success = success and partial_success
1639                         info_dict['__postprocessors'] = postprocessors
1640                         info_dict['__files_to_merge'] = downloaded
1641                 else:
1642                     # Just a single file
1643                     success = dl(filename, info_dict)
1644             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1645                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1646                 return
1647             except (OSError, IOError) as err:
1648                 raise UnavailableVideoError(err)
1649             except (ContentTooShortError, ) as err:
1650                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1651                 return
1652
1653             if success and filename != '-':
1654                 # Fixup content
1655                 fixup_policy = self.params.get('fixup')
1656                 if fixup_policy is None:
1657                     fixup_policy = 'detect_or_warn'
1658
1659                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1660
1661                 stretched_ratio = info_dict.get('stretched_ratio')
1662                 if stretched_ratio is not None and stretched_ratio != 1:
1663                     if fixup_policy == 'warn':
1664                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1665                             info_dict['id'], stretched_ratio))
1666                     elif fixup_policy == 'detect_or_warn':
1667                         stretched_pp = FFmpegFixupStretchedPP(self)
1668                         if stretched_pp.available:
1669                             info_dict.setdefault('__postprocessors', [])
1670                             info_dict['__postprocessors'].append(stretched_pp)
1671                         else:
1672                             self.report_warning(
1673                                 '%s: Non-uniform pixel ratio (%s). %s'
1674                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1675                     else:
1676                         assert fixup_policy in ('ignore', 'never')
1677
1678                 if (info_dict.get('requested_formats') is None and
1679                         info_dict.get('container') == 'm4a_dash'):
1680                     if fixup_policy == 'warn':
1681                         self.report_warning(
1682                             '%s: writing DASH m4a. '
1683                             'Only some players support this container.'
1684                             % info_dict['id'])
1685                     elif fixup_policy == 'detect_or_warn':
1686                         fixup_pp = FFmpegFixupM4aPP(self)
1687                         if fixup_pp.available:
1688                             info_dict.setdefault('__postprocessors', [])
1689                             info_dict['__postprocessors'].append(fixup_pp)
1690                         else:
1691                             self.report_warning(
1692                                 '%s: writing DASH m4a. '
1693                                 'Only some players support this container. %s'
1694                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1695                     else:
1696                         assert fixup_policy in ('ignore', 'never')
1697
1698                 if (info_dict.get('protocol') == 'm3u8_native' or
1699                         info_dict.get('protocol') == 'm3u8' and
1700                         self.params.get('hls_prefer_native')):
1701                     if fixup_policy == 'warn':
1702                         self.report_warning('%s: malformated aac bitstream.' % (
1703                             info_dict['id']))
1704                     elif fixup_policy == 'detect_or_warn':
1705                         fixup_pp = FFmpegFixupM3u8PP(self)
1706                         if fixup_pp.available:
1707                             info_dict.setdefault('__postprocessors', [])
1708                             info_dict['__postprocessors'].append(fixup_pp)
1709                         else:
1710                             self.report_warning(
1711                                 '%s: malformated aac bitstream. %s'
1712                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1713                     else:
1714                         assert fixup_policy in ('ignore', 'never')
1715
1716                 try:
1717                     self.post_process(filename, info_dict)
1718                 except (PostProcessingError) as err:
1719                     self.report_error('postprocessing: %s' % str(err))
1720                     return
1721                 self.record_download_archive(info_dict)
1722
1723     def download(self, url_list):
1724         """Download a given list of URLs."""
1725         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1726         if (len(url_list) > 1 and
1727                 '%' not in outtmpl and
1728                 self.params.get('max_downloads') != 1):
1729             raise SameFileError(outtmpl)
1730
1731         for url in url_list:
1732             try:
1733                 # It also downloads the videos
1734                 res = self.extract_info(
1735                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1736             except UnavailableVideoError:
1737                 self.report_error('unable to download video')
1738             except MaxDownloadsReached:
1739                 self.to_screen('[info] Maximum number of downloaded files reached.')
1740                 raise
1741             else:
1742                 if self.params.get('dump_single_json', False):
1743                     self.to_stdout(json.dumps(res))
1744
1745         return self._download_retcode
1746
1747     def download_with_info_file(self, info_filename):
1748         with contextlib.closing(fileinput.FileInput(
1749                 [info_filename], mode='r',
1750                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1751             # FileInput doesn't have a read method, we can't call json.load
1752             info = self.filter_requested_info(json.loads('\n'.join(f)))
1753         try:
1754             self.process_ie_result(info, download=True)
1755         except DownloadError:
1756             webpage_url = info.get('webpage_url')
1757             if webpage_url is not None:
1758                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1759                 return self.download([webpage_url])
1760             else:
1761                 raise
1762         return self._download_retcode
1763
1764     @staticmethod
1765     def filter_requested_info(info_dict):
1766         return dict(
1767             (k, v) for k, v in info_dict.items()
1768             if k not in ['requested_formats', 'requested_subtitles'])
1769
1770     def post_process(self, filename, ie_info):
1771         """Run all the postprocessors on the given file."""
1772         info = dict(ie_info)
1773         info['filepath'] = filename
1774         pps_chain = []
1775         if ie_info.get('__postprocessors') is not None:
1776             pps_chain.extend(ie_info['__postprocessors'])
1777         pps_chain.extend(self._pps)
1778         for pp in pps_chain:
1779             files_to_delete = []
1780             try:
1781                 files_to_delete, info = pp.run(info)
1782             except PostProcessingError as e:
1783                 self.report_error(e.msg)
1784             if files_to_delete and not self.params.get('keepvideo', False):
1785                 for old_filename in files_to_delete:
1786                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1787                     try:
1788                         os.remove(encodeFilename(old_filename))
1789                     except (IOError, OSError):
1790                         self.report_warning('Unable to remove downloaded original file')
1791
1792     def _make_archive_id(self, info_dict):
1793         # Future-proof against any change in case
1794         # and backwards compatibility with prior versions
1795         extractor = info_dict.get('extractor_key')
1796         if extractor is None:
1797             if 'id' in info_dict:
1798                 extractor = info_dict.get('ie_key')  # key in a playlist
1799         if extractor is None:
1800             return None  # Incomplete video information
1801         return extractor.lower() + ' ' + info_dict['id']
1802
1803     def in_download_archive(self, info_dict):
1804         fn = self.params.get('download_archive')
1805         if fn is None:
1806             return False
1807
1808         vid_id = self._make_archive_id(info_dict)
1809         if vid_id is None:
1810             return False  # Incomplete video information
1811
1812         try:
1813             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1814                 for line in archive_file:
1815                     if line.strip() == vid_id:
1816                         return True
1817         except IOError as ioe:
1818             if ioe.errno != errno.ENOENT:
1819                 raise
1820         return False
1821
1822     def record_download_archive(self, info_dict):
1823         fn = self.params.get('download_archive')
1824         if fn is None:
1825             return
1826         vid_id = self._make_archive_id(info_dict)
1827         assert vid_id
1828         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1829             archive_file.write(vid_id + '\n')
1830
1831     @staticmethod
1832     def format_resolution(format, default='unknown'):
1833         if format.get('vcodec') == 'none':
1834             return 'audio only'
1835         if format.get('resolution') is not None:
1836             return format['resolution']
1837         if format.get('height') is not None:
1838             if format.get('width') is not None:
1839                 res = '%sx%s' % (format['width'], format['height'])
1840             else:
1841                 res = '%sp' % format['height']
1842         elif format.get('width') is not None:
1843             res = '%dx?' % format['width']
1844         else:
1845             res = default
1846         return res
1847
1848     def _format_note(self, fdict):
1849         res = ''
1850         if fdict.get('ext') in ['f4f', 'f4m']:
1851             res += '(unsupported) '
1852         if fdict.get('language'):
1853             if res:
1854                 res += ' '
1855             res += '[%s] ' % fdict['language']
1856         if fdict.get('format_note') is not None:
1857             res += fdict['format_note'] + ' '
1858         if fdict.get('tbr') is not None:
1859             res += '%4dk ' % fdict['tbr']
1860         if fdict.get('container') is not None:
1861             if res:
1862                 res += ', '
1863             res += '%s container' % fdict['container']
1864         if (fdict.get('vcodec') is not None and
1865                 fdict.get('vcodec') != 'none'):
1866             if res:
1867                 res += ', '
1868             res += fdict['vcodec']
1869             if fdict.get('vbr') is not None:
1870                 res += '@'
1871         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1872             res += 'video@'
1873         if fdict.get('vbr') is not None:
1874             res += '%4dk' % fdict['vbr']
1875         if fdict.get('fps') is not None:
1876             if res:
1877                 res += ', '
1878             res += '%sfps' % fdict['fps']
1879         if fdict.get('acodec') is not None:
1880             if res:
1881                 res += ', '
1882             if fdict['acodec'] == 'none':
1883                 res += 'video only'
1884             else:
1885                 res += '%-5s' % fdict['acodec']
1886         elif fdict.get('abr') is not None:
1887             if res:
1888                 res += ', '
1889             res += 'audio'
1890         if fdict.get('abr') is not None:
1891             res += '@%3dk' % fdict['abr']
1892         if fdict.get('asr') is not None:
1893             res += ' (%5dHz)' % fdict['asr']
1894         if fdict.get('filesize') is not None:
1895             if res:
1896                 res += ', '
1897             res += format_bytes(fdict['filesize'])
1898         elif fdict.get('filesize_approx') is not None:
1899             if res:
1900                 res += ', '
1901             res += '~' + format_bytes(fdict['filesize_approx'])
1902         return res
1903
1904     def list_formats(self, info_dict):
1905         formats = info_dict.get('formats', [info_dict])
1906         table = [
1907             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
1908             for f in formats
1909             if f.get('preference') is None or f['preference'] >= -1000]
1910         if len(formats) > 1:
1911             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
1912
1913         header_line = ['format code', 'extension', 'resolution', 'note']
1914         self.to_screen(
1915             '[info] Available formats for %s:\n%s' %
1916             (info_dict['id'], render_table(header_line, table)))
1917
1918     def list_thumbnails(self, info_dict):
1919         thumbnails = info_dict.get('thumbnails')
1920         if not thumbnails:
1921             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
1922             return
1923
1924         self.to_screen(
1925             '[info] Thumbnails for %s:' % info_dict['id'])
1926         self.to_screen(render_table(
1927             ['ID', 'width', 'height', 'URL'],
1928             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1929
1930     def list_subtitles(self, video_id, subtitles, name='subtitles'):
1931         if not subtitles:
1932             self.to_screen('%s has no %s' % (video_id, name))
1933             return
1934         self.to_screen(
1935             'Available %s for %s:' % (name, video_id))
1936         self.to_screen(render_table(
1937             ['Language', 'formats'],
1938             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
1939                 for lang, formats in subtitles.items()]))
1940
1941     def urlopen(self, req):
1942         """ Start an HTTP download """
1943         if isinstance(req, compat_basestring):
1944             req = sanitized_Request(req)
1945         return self._opener.open(req, timeout=self._socket_timeout)
1946
1947     def print_debug_header(self):
1948         if not self.params.get('verbose'):
1949             return
1950
1951         if type('') is not compat_str:
1952             # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
1953             self.report_warning(
1954                 'Your Python is broken! Update to a newer and supported version')
1955
1956         stdout_encoding = getattr(
1957             sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
1958         encoding_str = (
1959             '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
1960                 locale.getpreferredencoding(),
1961                 sys.getfilesystemencoding(),
1962                 stdout_encoding,
1963                 self.get_encoding()))
1964         write_string(encoding_str, encoding=None)
1965
1966         self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
1967         if _LAZY_LOADER:
1968             self._write_string('[debug] Lazy loading extractors enabled' + '\n')
1969         try:
1970             sp = subprocess.Popen(
1971                 ['git', 'rev-parse', '--short', 'HEAD'],
1972                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1973                 cwd=os.path.dirname(os.path.abspath(__file__)))
1974             out, err = sp.communicate()
1975             out = out.decode().strip()
1976             if re.match('[0-9a-f]+', out):
1977                 self._write_string('[debug] Git HEAD: ' + out + '\n')
1978         except Exception:
1979             try:
1980                 sys.exc_clear()
1981             except Exception:
1982                 pass
1983         self._write_string('[debug] Python version %s - %s\n' % (
1984             platform.python_version(), platform_name()))
1985
1986         exe_versions = FFmpegPostProcessor.get_versions(self)
1987         exe_versions['rtmpdump'] = rtmpdump_version()
1988         exe_str = ', '.join(
1989             '%s %s' % (exe, v)
1990             for exe, v in sorted(exe_versions.items())
1991             if v
1992         )
1993         if not exe_str:
1994             exe_str = 'none'
1995         self._write_string('[debug] exe versions: %s\n' % exe_str)
1996
1997         proxy_map = {}
1998         for handler in self._opener.handlers:
1999             if hasattr(handler, 'proxies'):
2000                 proxy_map.update(handler.proxies)
2001         self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
2002
2003         if self.params.get('call_home', False):
2004             ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
2005             self._write_string('[debug] Public IP address: %s\n' % ipaddr)
2006             latest_version = self.urlopen(
2007                 'https://yt-dl.org/latest/version').read().decode('utf-8')
2008             if version_tuple(latest_version) > version_tuple(__version__):
2009                 self.report_warning(
2010                     'You are using an outdated version (newest version: %s)! '
2011                     'See https://yt-dl.org/update if you need help updating.' %
2012                     latest_version)
2013
2014     def _setup_opener(self):
2015         timeout_val = self.params.get('socket_timeout')
2016         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2017
2018         opts_cookiefile = self.params.get('cookiefile')
2019         opts_proxy = self.params.get('proxy')
2020
2021         if opts_cookiefile is None:
2022             self.cookiejar = compat_cookiejar.CookieJar()
2023         else:
2024             opts_cookiefile = compat_expanduser(opts_cookiefile)
2025             self.cookiejar = compat_cookiejar.MozillaCookieJar(
2026                 opts_cookiefile)
2027             if os.access(opts_cookiefile, os.R_OK):
2028                 self.cookiejar.load()
2029
2030         cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2031         if opts_proxy is not None:
2032             if opts_proxy == '':
2033                 proxies = {}
2034             else:
2035                 proxies = {'http': opts_proxy, 'https': opts_proxy}
2036         else:
2037             proxies = compat_urllib_request.getproxies()
2038             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
2039             if 'http' in proxies and 'https' not in proxies:
2040                 proxies['https'] = proxies['http']
2041         proxy_handler = PerRequestProxyHandler(proxies)
2042
2043         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2044         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2045         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2046         data_handler = compat_urllib_request_DataHandler()
2047
2048         # When passing our own FileHandler instance, build_opener won't add the
2049         # default FileHandler and allows us to disable the file protocol, which
2050         # can be used for malicious purposes (see
2051         # https://github.com/rg3/youtube-dl/issues/8227)
2052         file_handler = compat_urllib_request.FileHandler()
2053
2054         def file_open(*args, **kwargs):
2055             raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
2056         file_handler.file_open = file_open
2057
2058         opener = compat_urllib_request.build_opener(
2059             proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
2060
2061         # Delete the default user-agent header, which would otherwise apply in
2062         # cases where our custom HTTP handler doesn't come into play
2063         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
2064         opener.addheaders = []
2065         self._opener = opener
2066
2067     def encode(self, s):
2068         if isinstance(s, bytes):
2069             return s  # Already encoded
2070
2071         try:
2072             return s.encode(self.get_encoding())
2073         except UnicodeEncodeError as err:
2074             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2075             raise
2076
2077     def get_encoding(self):
2078         encoding = self.params.get('encoding')
2079         if encoding is None:
2080             encoding = preferredencoding()
2081         return encoding
2082
2083     def _write_thumbnails(self, info_dict, filename):
2084         if self.params.get('writethumbnail', False):
2085             thumbnails = info_dict.get('thumbnails')
2086             if thumbnails:
2087                 thumbnails = [thumbnails[-1]]
2088         elif self.params.get('write_all_thumbnails', False):
2089             thumbnails = info_dict.get('thumbnails')
2090         else:
2091             return
2092
2093         if not thumbnails:
2094             # No thumbnails present, so return immediately
2095             return
2096
2097         for t in thumbnails:
2098             thumb_ext = determine_ext(t['url'], 'jpg')
2099             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2100             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2101             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2102
2103             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2104                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2105                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2106             else:
2107                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2108                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2109                 try:
2110                     uf = self.urlopen(t['url'])
2111                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2112                         shutil.copyfileobj(uf, thumbf)
2113                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2114                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2115                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2116                     self.report_warning('Unable to download thumbnail "%s": %s' %
2117                                         (t['url'], error_to_compat_str(err)))