_ Git - youtube-dl/blob - youtube_dl/YoutubeDL.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3
   4 from __future__ import absolute_import, unicode_literals
   5
   6 import collections
   7 import contextlib
   8 import copy
   9 import datetime
  10 import errno
  11 import fileinput
  12 import io
  13 import itertools
  14 import json
  15 import locale
  16 import operator
  17 import os
  18 import platform
  19 import re
  20 import shutil
  21 import subprocess
  22 import socket
  23 import sys
  24 import time
  25 import tokenize
  26 import traceback
  27 import random
  28
  29 from .compat import (
  30     compat_basestring,
  31     compat_cookiejar,
  32     compat_get_terminal_size,
  33     compat_http_client,
  34     compat_kwargs,
  35     compat_numeric_types,
  36     compat_os_name,
  37     compat_str,
  38     compat_tokenize_tokenize,
  39     compat_urllib_error,
  40     compat_urllib_request,
  41     compat_urllib_request_DataHandler,
  42 )
  43 from .utils import (
  44     age_restricted,
  45     args_to_str,
  46     ContentTooShortError,
  47     date_from_str,
  48     DateRange,
  49     DEFAULT_OUTTMPL,
  50     determine_ext,
  51     determine_protocol,
  52     DownloadError,
  53     encode_compat_str,
  54     encodeFilename,
  55     error_to_compat_str,
  56     expand_path,
  57     ExtractorError,
  58     format_bytes,
  59     formatSeconds,
  60     GeoRestrictedError,
  61     ISO3166Utils,
  62     locked_file,
  63     make_HTTPS_handler,
  64     MaxDownloadsReached,
  65     PagedList,
  66     parse_filesize,
  67     PerRequestProxyHandler,
  68     platform_name,
  69     PostProcessingError,
  70     preferredencoding,
  71     prepend_extension,
  72     register_socks_protocols,
  73     render_table,
  74     replace_extension,
  75     SameFileError,
  76     sanitize_filename,
  77     sanitize_path,
  78     sanitize_url,
  79     sanitized_Request,
  80     std_headers,
  81     subtitles_filename,
  82     UnavailableVideoError,
  83     url_basename,
  84     version_tuple,
  85     write_json_file,
  86     write_string,
  87     YoutubeDLCookieProcessor,
  88     YoutubeDLHandler,
  89     PhantomJSwrapper,
  90 )
  91 from .cache import Cache
  92 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
  93 from .downloader import get_suitable_downloader
  94 from .downloader.rtmp import rtmpdump_version
  95 from .postprocessor import (
  96     FFmpegFixupM3u8PP,
  97     FFmpegFixupM4aPP,
  98     FFmpegFixupStretchedPP,
  99     FFmpegMergerPP,
 100     FFmpegPostProcessor,
 101     get_postprocessor,
 102 )
 103 from .version import __version__
 104
 105 if compat_os_name == 'nt':
 106     import ctypes
 107
 108
 109 class YoutubeDL(object):
 110     """YoutubeDL class.
 111
    YoutubeDL objects are the ones responsible for downloading the
 113     actual video file and writing it to disk if the user has requested
 114     it, among some other tasks. In most cases there should be one per
 115     program. As, given a video URL, the downloader doesn't know how to
 116     extract all the needed information, task that InfoExtractors do, it
 117     has to pass the URL to one of them.
 118
 119     For this, YoutubeDL objects have a method that allows
 120     InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
 125     Downloader to download the video.
 126
 127     YoutubeDL objects accept a lot of parameters. In order not to saturate
 128     the object constructor with arguments, it receives a dictionary of
 129     options instead. These options are available through the params
 130     attribute for the InfoExtractors to use. The YoutubeDL also
 131     registers itself as the downloader in charge for the InfoExtractors
 132     that are added to it, so this is a "mutual registration".
 133
 134     Available options:
 135
 136     username:          Username for authentication purposes.
 137     password:          Password for authentication purposes.
 138     videopassword:     Password for accessing a video.
 139     ap_mso:            Adobe Pass multiple-system operator identifier.
 140     ap_username:       Multiple-system operator account username.
 141     ap_password:       Multiple-system operator account password.
 142     usenetrc:          Use netrc for authentication instead.
 143     verbose:           Print additional info to stdout.
 144     quiet:             Do not print messages to stdout.
 145     no_warnings:       Do not print out anything for warnings.
 146     forceurl:          Force printing final URL.
 147     forcetitle:        Force printing title.
 148     forceid:           Force printing ID.
 149     forcethumbnail:    Force printing thumbnail URL.
 150     forcedescription:  Force printing description.
 151     forcefilename:     Force printing final filename.
 152     forceduration:     Force printing duration.
 153     forcejson:         Force printing info_dict as JSON.
 154     dump_single_json:  Force printing the info_dict of the whole playlist
 155                        (or video) as a single JSON line.
 156     simulate:          Do not download the video files.
 157     format:            Video format code. See options.py for more information.
 158     outtmpl:           Template for output names.
 159     restrictfilenames: Do not allow "&" and spaces in file names
 160     ignoreerrors:      Do not stop on download errors.
 161     force_generic_extractor: Force downloader to use the generic extractor
 162     nooverwrites:      Prevent overwriting files.
 163     playliststart:     Playlist item to start at.
 164     playlistend:       Playlist item to end at.
 165     playlist_items:    Specific indices of playlist to download.
 166     playlistreverse:   Download playlist items in reverse order.
 167     playlistrandom:    Download playlist items in random order.
 168     matchtitle:        Download only matching titles.
 169     rejecttitle:       Reject downloads for matching titles.
 170     logger:            Log messages to a logging.Logger instance.
 171     logtostderr:       Log messages to stderr instead of stdout.
 172     writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video metadata to a .info.json file
 174     writeannotations:  Write the video annotations to a .annotations.xml file
 175     writethumbnail:    Write the thumbnail image to a file
 176     write_all_thumbnails:  Write all thumbnail formats to files
 177     writesubtitles:    Write the video subtitles to a file
 178     writeautomaticsub: Write the automatically generated subtitles to a file
 179     allsubtitles:      Downloads all the subtitles of the video
 180                        (requires writesubtitles or writeautomaticsub)
 181     listsubtitles:     Lists all available subtitles for the video
 182     subtitlesformat:   The format code for subtitles
 183     subtitleslangs:    List of languages of the subtitles to download
 184     keepvideo:         Keep the video file after post-processing
 185     daterange:         A DateRange object, download only if the upload_date is in the range.
 186     skip_download:     Skip the actual download of the video file
 187     cachedir:          Location of the cache files in the filesystem.
 188                        False to disable filesystem cache.
 189     noplaylist:        Download single video instead of a playlist if in doubt.
 190     age_limit:         An integer representing the user's age in years.
 191                        Unsuitable videos for the given age are skipped.
 192     min_views:         An integer representing the minimum view count the video
 193                        must have in order to not be skipped.
 194                        Videos without view count information are always
 195                        downloaded. None for no limit.
 196     max_views:         An integer representing the maximum view count.
 197                        Videos that are more popular than that are not
 198                        downloaded.
 199                        Videos without view count information are always
 200                        downloaded. None for no limit.
 201     download_archive:  File name of a file where all downloads are recorded.
 202                        Videos already present in the file are not downloaded
 203                        again.
 204     cookiefile:        File name where cookies should be read from and dumped to.
 205     nocheckcertificate:Do not verify SSL certificates
 206     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
 207                        At the moment, this is only supported by YouTube.
 208     proxy:             URL of the proxy server to use
 209     geo_verification_proxy:  URL of the proxy to use for IP address verification
 210                        on geo-restricted sites. (Experimental)
 211     socket_timeout:    Time to wait for unresponsive hosts, in seconds
 212     bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
 214     debug_printtraffic:Print out sent and received HTTP traffic
 215     include_ads:       Download ads as well
 216     default_search:    Prepend this string if an input url is not valid.
 217                        'auto' for elaborate guessing
 218     encoding:          Use this encoding instead of the system-specified.
 219     extract_flat:      Do not resolve URLs, return the immediate result.
 220                        Pass in 'in_playlist' to only show this behavior for
 221                        playlist items.
 222     postprocessors:    A list of dictionaries, each with an entry
 223                        * key:  The name of the postprocessor. See
 224                                youtube_dl/postprocessor/__init__.py for a list.
 225                        as well as any further keyword arguments for the
 226                        postprocessor.
 227     progress_hooks:    A list of functions that get called on download
 228                        progress, with a dictionary with the entries
 229                        * status: One of "downloading", "error", or "finished".
 230                                  Check this first and ignore unknown values.
 231
 232                        If status is one of "downloading", or "finished", the
 233                        following properties may also be present:
 234                        * filename: The final filename (always present)
 235                        * tmpfilename: The filename we're currently writing to
 236                        * downloaded_bytes: Bytes on disk
 237                        * total_bytes: Size of the whole file, None if unknown
 238                        * total_bytes_estimate: Guess of the eventual file size,
 239                                                None if unavailable.
 240                        * elapsed: The number of seconds since download started.
 241                        * eta: The estimated time in seconds, None if unknown
 242                        * speed: The download speed in bytes/second, None if
 243                                 unknown
 244                        * fragment_index: The counter of the currently
 245                                          downloaded video fragment.
 246                        * fragment_count: The number of fragments (= individual
 247                                          files that will be merged)
 248
 249                        Progress hooks are guaranteed to be called at least once
 250                        (with status "finished") if the download is successful.
 251     merge_output_format: Extension to use when merging formats.
 252     fixup:             Automatically correct known faults of the file.
 253                        One of:
 254                        - "never": do nothing
 255                        - "warn": only emit a warning
 256                        - "detect_or_warn": check whether we can do anything
 257                                            about it, warn otherwise (default)
 258     source_address:    (Experimental) Client-side IP address to bind to.
 259     call_home:         Boolean, true iff we are allowed to contact the
 260                        youtube-dl servers for debugging.
 261     sleep_interval:    Number of seconds to sleep before each download when
 262                        used alone or a lower bound of a range for randomized
 263                        sleep before each download (minimum possible number
 264                        of seconds to sleep) when used along with
 265                        max_sleep_interval.
 266     max_sleep_interval:Upper bound of a range for randomized sleep before each
 267                        download (maximum possible number of seconds to sleep).
 268                        Must only be used along with sleep_interval.
 269                        Actual sleep time will be a random float from range
 270                        [sleep_interval; max_sleep_interval].
 271     listformats:       Print an overview of available video formats and exit.
 272     list_thumbnails:   Print a table of all thumbnails and exit.
 273     match_filter:      A function that gets called with the info_dict of
 274                        every video.
 275                        If it returns a message, the video is ignored.
 276                        If it returns None, the video is downloaded.
 277                        match_filter_func in utils.py is one example for this.
 278     no_color:          Do not emit color codes in output.
 279     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
 280                        HTTP header (experimental)
 281     geo_bypass_country:
 282                        Two-letter ISO 3166-2 country code that will be used for
 283                        explicit geographic restriction bypassing via faking
 284                        X-Forwarded-For HTTP header (experimental)
 285
 286     The following options determine which downloader is picked:
 287     external_downloader: Executable of the external downloader to call.
 288                        None or unset for standard (built-in) downloader.
 289     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
 290                        if True, otherwise use ffmpeg/avconv if False, otherwise
 291                        use downloader suggested by extractor if None.
 292
 293     The following parameters are not used by YoutubeDL itself, they are used by
 294     the downloader (see youtube_dl/downloader/common.py):
 295     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
 296     noresizebuffer, retries, continuedl, noprogress, consoletitle,
 297     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
 298
 299     The following options are used by the post processors:
 300     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
 301                        otherwise prefer avconv.
 302     postprocessor_args: A list of additional command-line arguments for the
 303                         postprocessor.
 304     """
 305
 306     params = None
 307     _ies = []
 308     _pps = []
 309     _download_retcode = None
 310     _num_downloads = None
 311     _screen_file = None
 312
 313     def __init__(self, params=None, auto_init=True):
 314         """Create a FileDownloader object with the given options."""
 315         if params is None:
 316             params = {}
 317         self._ies = []
 318         self._ies_instances = {}
 319         self._pps = []
 320         self._progress_hooks = []
 321         self._download_retcode = 0
 322         self._num_downloads = 0
 323         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 324         self._err_file = sys.stderr
 325         self.params = {
 326             # Default parameters
 327             'nocheckcertificate': False,
 328         }
 329         self.params.update(params)
 330         self.cache = Cache(self)
 331
 332         def check_deprecated(param, option, suggestion):
 333             if self.params.get(param) is not None:
 334                 self.report_warning(
 335                     '%s is deprecated. Use %s instead.' % (option, suggestion))
 336                 return True
 337             return False
 338
 339         if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
 340             if self.params.get('geo_verification_proxy') is None:
 341                 self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
 342
 343         check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
 344         check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
 345         check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
 346
 347         if params.get('bidi_workaround', False):
 348             try:
 349                 import pty
 350                 master, slave = pty.openpty()
 351                 width = compat_get_terminal_size().columns
 352                 if width is None:
 353                     width_args = []
 354                 else:
 355                     width_args = ['-w', str(width)]
 356                 sp_kwargs = dict(
 357                     stdin=subprocess.PIPE,
 358                     stdout=slave,
 359                     stderr=self._err_file)
 360                 try:
 361                     self._output_process = subprocess.Popen(
 362                         ['bidiv'] + width_args, **sp_kwargs
 363                     )
 364                 except OSError:
 365                     self._output_process = subprocess.Popen(
 366                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
 367                 self._output_channel = os.fdopen(master, 'rb')
 368             except OSError as ose:
 369                 if ose.errno == errno.ENOENT:
 370                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
 371                 else:
 372                     raise
 373
 374         if (sys.version_info >= (3,) and sys.platform != 'win32' and
 375                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
 376                 not params.get('restrictfilenames', False)):
 377             # On Python 3, the Unicode filesystem API will throw errors (#1474)
 378             self.report_warning(
 379                 'Assuming --restrict-filenames since file system encoding '
 380                 'cannot encode all characters. '
 381                 'Set the LC_ALL environment variable to fix this.')
 382             self.params['restrictfilenames'] = True
 383
 384         if isinstance(params.get('outtmpl'), bytes):
 385             self.report_warning(
 386                 'Parameter outtmpl is bytes, but should be a unicode string. '
 387                 'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
 388
 389         self._setup_opener()
 390
 391         if auto_init:
 392             self.print_debug_header()
 393             self.add_default_info_extractors()
 394
 395         for pp_def_raw in self.params.get('postprocessors', []):
 396             pp_class = get_postprocessor(pp_def_raw['key'])
 397             pp_def = dict(pp_def_raw)
 398             del pp_def['key']
 399             pp = pp_class(self, **compat_kwargs(pp_def))
 400             self.add_post_processor(pp)
 401
 402         for ph in self.params.get('progress_hooks', []):
 403             self.add_progress_hook(ph)
 404
 405         register_socks_protocols()
 406
 407     def warn_if_short_id(self, argv):
 408         # short YouTube ID starting with dash?
 409         idxs = [
 410             i for i, a in enumerate(argv)
 411             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
 412         if idxs:
 413             correct_argv = (
 414                 ['youtube-dl'] +
 415                 [a for i, a in enumerate(argv) if i not in idxs] +
 416                 ['--'] + [argv[i] for i in idxs]
 417             )
 418             self.report_warning(
 419                 'Long argument string detected. '
 420                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
 421                 args_to_str(correct_argv))
 422
 423     def add_info_extractor(self, ie):
 424         """Add an InfoExtractor object to the end of the list."""
 425         self._ies.append(ie)
 426         if not isinstance(ie, type):
 427             self._ies_instances[ie.ie_key()] = ie
 428             ie.set_downloader(self)
 429
 430     def get_info_extractor(self, ie_key):
 431         """
 432         Get an instance of an IE with name ie_key, it will try to get one from
 433         the _ies list, if there's no instance it will create a new one and add
 434         it to the extractor list.
 435         """
 436         ie = self._ies_instances.get(ie_key)
 437         if ie is None:
 438             ie = get_info_extractor(ie_key)()
 439             self.add_info_extractor(ie)
 440         return ie
 441
 442     def add_default_info_extractors(self):
 443         """
 444         Add the InfoExtractors returned by gen_extractors to the end of the list
 445         """
 446         for ie in gen_extractor_classes():
 447             self.add_info_extractor(ie)
 448
 449     def add_post_processor(self, pp):
 450         """Add a PostProcessor object to the end of the chain."""
 451         self._pps.append(pp)
 452         pp.set_downloader(self)
 453
 454     def add_progress_hook(self, ph):
 455         """Add the progress hook (currently only for the file downloader)"""
 456         self._progress_hooks.append(ph)
 457
 458     def _bidi_workaround(self, message):
 459         if not hasattr(self, '_output_channel'):
 460             return message
 461
 462         assert hasattr(self, '_output_process')
 463         assert isinstance(message, compat_str)
 464         line_count = message.count('\n') + 1
 465         self._output_process.stdin.write((message + '\n').encode('utf-8'))
 466         self._output_process.stdin.flush()
 467         res = ''.join(self._output_channel.readline().decode('utf-8')
 468                       for _ in range(line_count))
 469         return res[:-len('\n')]
 470
 471     def to_screen(self, message, skip_eol=False):
 472         """Print message to stdout if not in quiet mode."""
 473         return self.to_stdout(message, skip_eol, check_quiet=True)
 474
 475     def _write_string(self, s, out=None):
 476         write_string(s, out=out, encoding=self.params.get('encoding'))
 477
 478     def to_stdout(self, message, skip_eol=False, check_quiet=False):
 479         """Print message to stdout if not in quiet mode."""
 480         if self.params.get('logger'):
 481             self.params['logger'].debug(message)
 482         elif not check_quiet or not self.params.get('quiet', False):
 483             message = self._bidi_workaround(message)
 484             terminator = ['\n', ''][skip_eol]
 485             output = message + terminator
 486
 487             self._write_string(output, self._screen_file)
 488
 489     def to_stderr(self, message):
 490         """Print message to stderr."""
 491         assert isinstance(message, compat_str)
 492         if self.params.get('logger'):
 493             self.params['logger'].error(message)
 494         else:
 495             message = self._bidi_workaround(message)
 496             output = message + '\n'
 497             self._write_string(output, self._err_file)
 498
 499     def to_console_title(self, message):
 500         if not self.params.get('consoletitle', False):
 501             return
 502         if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 503             # c_wchar_p() might not be necessary if `message` is
 504             # already of type unicode()
 505             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 506         elif 'TERM' in os.environ:
 507             self._write_string('\033]0;%s\007' % message, self._screen_file)
 508
 509     def save_console_title(self):
 510         if not self.params.get('consoletitle', False):
 511             return
 512         if 'TERM' in os.environ:
 513             # Save the title on stack
 514             self._write_string('\033[22;0t', self._screen_file)
 515
 516     def restore_console_title(self):
 517         if not self.params.get('consoletitle', False):
 518             return
 519         if 'TERM' in os.environ:
 520             # Restore the title from stack
 521             self._write_string('\033[23;0t', self._screen_file)
 522
 523     def __enter__(self):
 524         self.save_console_title()
 525         return self
 526
 527     def __exit__(self, *args):
 528         self.restore_console_title()
 529
 530         if self.params.get('cookiefile') is not None:
 531             self.cookiejar.save()
 532
 533     def trouble(self, message=None, tb=None):
 534         """Determine action to take when a download problem appears.
 535
 536         Depending on if the downloader has been configured to ignore
 537         download errors or not, this method may throw an exception or
 538         not when errors are found, after printing the message.
 539
 540         tb, if given, is additional traceback information.
 541         """
 542         if message is not None:
 543             self.to_stderr(message)
 544         if self.params.get('verbose'):
 545             if tb is None:
 546                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
 547                     tb = ''
 548                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 549                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
 550                     tb += encode_compat_str(traceback.format_exc())
 551                 else:
 552                     tb_data = traceback.format_list(traceback.extract_stack())
 553                     tb = ''.join(tb_data)
 554             self.to_stderr(tb)
 555         if not self.params.get('ignoreerrors', False):
 556             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 557                 exc_info = sys.exc_info()[1].exc_info
 558             else:
 559                 exc_info = sys.exc_info()
 560             raise DownloadError(message, exc_info)
 561         self._download_retcode = 1
 562
 563     def report_warning(self, message):
 564         '''
 565         Print the message to stderr, it will be prefixed with 'WARNING:'
 566         If stderr is a tty file the 'WARNING:' will be colored
 567         '''
 568         if self.params.get('logger') is not None:
 569             self.params['logger'].warning(message)
 570         else:
 571             if self.params.get('no_warnings'):
 572                 return
 573             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
 574                 _msg_header = '\033[0;33mWARNING:\033[0m'
 575             else:
 576                 _msg_header = 'WARNING:'
 577             warning_message = '%s %s' % (_msg_header, message)
 578             self.to_stderr(warning_message)
 579
 580     def report_error(self, message, tb=None):
 581         '''
 582         Do the same as trouble, but prefixes the message with 'ERROR:', colored
 583         in red if stderr is a tty file.
 584         '''
 585         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
 586             _msg_header = '\033[0;31mERROR:\033[0m'
 587         else:
 588             _msg_header = 'ERROR:'
 589         error_message = '%s %s' % (_msg_header, message)
 590         self.trouble(error_message, tb)
 591
 592     def report_file_already_downloaded(self, file_name):
 593         """Report file has already been fully downloaded."""
 594         try:
 595             self.to_screen('[download] %s has already been downloaded' % file_name)
 596         except UnicodeEncodeError:
 597             self.to_screen('[download] The file has already been downloaded')
 598
 599     def prepare_filename(self, info_dict):
 600         """Generate the output filename."""
 601         try:
 602             template_dict = dict(info_dict)
 603
 604             template_dict['epoch'] = int(time.time())
 605             autonumber_size = self.params.get('autonumber_size')
 606             if autonumber_size is None:
 607                 autonumber_size = 5
 608             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
 609             if template_dict.get('resolution') is None:
 610                 if template_dict.get('width') and template_dict.get('height'):
 611                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
 612                 elif template_dict.get('height'):
 613                     template_dict['resolution'] = '%sp' % template_dict['height']
 614                 elif template_dict.get('width'):
 615                     template_dict['resolution'] = '%dx?' % template_dict['width']
 616
 617             sanitize = lambda k, v: sanitize_filename(
 618                 compat_str(v),
 619                 restricted=self.params.get('restrictfilenames'),
 620                 is_id=(k == 'id' or k.endswith('_id')))
 621             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
 622                                  for k, v in template_dict.items()
 623                                  if v is not None and not isinstance(v, (list, tuple, dict)))
 624             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
 625
 626             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
 627
 628             # For fields playlist_index and autonumber convert all occurrences
 629             # of %(field)s to %(field)0Nd for backward compatibility
 630             field_size_compat_map = {
 631                 'playlist_index': len(str(template_dict['n_entries'])),
 632                 'autonumber': autonumber_size,
 633             }
 634             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
 635             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
 636             if mobj:
 637                 outtmpl = re.sub(
 638                     FIELD_SIZE_COMPAT_RE,
 639                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
 640                     outtmpl)
 641
 642             NUMERIC_FIELDS = set((
 643                 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
 644                 'timestamp', 'upload_year', 'upload_month', 'upload_day',
 645                 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
 646                 'average_rating', 'comment_count', 'age_limit',
 647                 'start_time', 'end_time',
 648                 'chapter_number', 'season_number', 'episode_number',
 649                 'track_number', 'disc_number', 'release_year',
 650                 'playlist_index',
 651             ))
 652
 653             # Missing numeric fields used together with integer presentation types
 654             # in format specification will break the argument substitution since
 655             # string 'NA' is returned for missing fields. We will patch output
 656             # template for missing fields to meet string presentation type.
 657             for numeric_field in NUMERIC_FIELDS:
 658                 if numeric_field not in template_dict:
 659                     # As of [1] format syntax is:
 660                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
 661                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
 662                     FORMAT_RE = r'''(?x)
 663                         (?<!%)
 664                         %
 665                         \({0}\)  # mapping key
 666                         (?:[#0\-+ ]+)?  # conversion flags (optional)
 667                         (?:\d+)?  # minimum field width (optional)
 668                         (?:\.\d+)?  # precision (optional)
 669                         [hlL]?  # length modifier (optional)
 670                         [diouxXeEfFgGcrs%]  # conversion type
 671                     '''
 672                     outtmpl = re.sub(
 673                         FORMAT_RE.format(numeric_field),
 674                         r'%({0})s'.format(numeric_field), outtmpl)
 675
 676             filename = expand_path(outtmpl % template_dict)
 677             # Temporary fix for #4787
 678             # 'Treat' all problem characters by passing filename through preferredencoding
 679             # to workaround encoding issues with subprocess on python2 @ Windows
 680             if sys.version_info < (3, 0) and sys.platform == 'win32':
 681                 filename = encodeFilename(filename, True).decode(preferredencoding())
 682             return sanitize_path(filename)
 683         except ValueError as err:
 684             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
 685             return None
 686
 687     def _match_entry(self, info_dict, incomplete):
 688         """ Returns None iff the file should be downloaded """
 689
 690         video_title = info_dict.get('title', info_dict.get('id', 'video'))
 691         if 'title' in info_dict:
 692             # This can happen when we're just evaluating the playlist
 693             title = info_dict['title']
 694             matchtitle = self.params.get('matchtitle', False)
 695             if matchtitle:
 696                 if not re.search(matchtitle, title, re.IGNORECASE):
 697                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
 698             rejecttitle = self.params.get('rejecttitle', False)
 699             if rejecttitle:
 700                 if re.search(rejecttitle, title, re.IGNORECASE):
 701                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 702         date = info_dict.get('upload_date')
 703         if date is not None:
 704             dateRange = self.params.get('daterange', DateRange())
 705             if date not in dateRange:
 706                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
 707         view_count = info_dict.get('view_count')
 708         if view_count is not None:
 709             min_views = self.params.get('min_views')
 710             if min_views is not None and view_count < min_views:
 711                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
 712             max_views = self.params.get('max_views')
 713             if max_views is not None and view_count > max_views:
 714                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
 715         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
 716             return 'Skipping "%s" because it is age restricted' % video_title
 717         if self.in_download_archive(info_dict):
 718             return '%s has already been recorded in archive' % video_title
 719
 720         if not incomplete:
 721             match_filter = self.params.get('match_filter')
 722             if match_filter is not None:
 723                 ret = match_filter(info_dict)
 724                 if ret is not None:
 725                     return ret
 726
 727         return None
 728
 729     @staticmethod
 730     def add_extra_info(info_dict, extra_info):
 731         '''Set the keys from extra_info in info dict if they are missing'''
 732         for key, value in extra_info.items():
 733             info_dict.setdefault(key, value)
 734
 735     def extract_info(self, url, download=True, ie_key=None, extra_info={},
 736                      process=True, force_generic_extractor=False):
 737         '''
 738         Returns a list with a dictionary for each video we find.
 739         If 'download', also downloads the videos.
 740         extra_info is a dict containing the extra values to add to each result
 741         '''
 742
 743         if not ie_key and force_generic_extractor:
 744             ie_key = 'Generic'
 745
 746         if ie_key:
 747             ies = [self.get_info_extractor(ie_key)]
 748         else:
 749             ies = self._ies
 750
 751         for ie in ies:
 752             if not ie.suitable(url):
 753                 continue
 754
 755             ie = self.get_info_extractor(ie.ie_key())
 756             if not ie.working():
 757                 self.report_warning('The program functionality for this site has been marked as broken, '
 758                                     'and will probably not work.')
 759
 760             try:
 761                 ie_result = ie.extract(url)
 762                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
 763                     break
 764                 if isinstance(ie_result, list):
 765                     # Backwards compatibility: old IE result format
 766                     ie_result = {
 767                         '_type': 'compat_list',
 768                         'entries': ie_result,
 769                     }
 770                 self.add_default_extra_info(ie_result, ie, url)
 771                 if process:
 772                     return self.process_ie_result(ie_result, download, extra_info)
 773                 else:
 774                     return ie_result
 775             except GeoRestrictedError as e:
 776                 msg = e.msg
 777                 if e.countries:
 778                     msg += '\nThis video is available in %s.' % ', '.join(
 779                         map(ISO3166Utils.short2full, e.countries))
 780                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
 781                 self.report_error(msg)
 782                 break
 783             except ExtractorError as e:  # An error we somewhat expected
 784                 self.report_error(compat_str(e), e.format_traceback())
 785                 break
 786             except MaxDownloadsReached:
 787                 raise
 788             except Exception as e:
 789                 if self.params.get('ignoreerrors', False):
 790                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
 791                     break
 792                 else:
 793                     raise
 794         else:
 795             self.report_error('no suitable InfoExtractor for URL %s' % url)
 796
 797     def add_default_extra_info(self, ie_result, ie, url):
 798         self.add_extra_info(ie_result, {
 799             'extractor': ie.IE_NAME,
 800             'webpage_url': url,
 801             'webpage_url_basename': url_basename(url),
 802             'extractor_key': ie.ie_key(),
 803         })
 804
 805     def process_ie_result(self, ie_result, download=True, extra_info={}):
 806         """
 807         Take the result of the ie(may be modified) and resolve all unresolved
 808         references (URLs, playlist items).
 809
 810         It will also download the videos if 'download'.
 811         Returns the resolved ie_result.
 812         """
 813         result_type = ie_result.get('_type', 'video')
 814
 815         if result_type in ('url', 'url_transparent'):
 816             ie_result['url'] = sanitize_url(ie_result['url'])
 817             extract_flat = self.params.get('extract_flat', False)
 818             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
 819                     extract_flat is True):
 820                 if self.params.get('forcejson', False):
 821                     self.to_stdout(json.dumps(ie_result))
 822                 return ie_result
 823
 824         if result_type == 'video':
 825             self.add_extra_info(ie_result, extra_info)
 826             return self.process_video_result(ie_result, download=download)
 827         elif result_type == 'url':
 828             # We have to add extra_info to the results because it may be
 829             # contained in a playlist
 830             return self.extract_info(ie_result['url'],
 831                                      download,
 832                                      ie_key=ie_result.get('ie_key'),
 833                                      extra_info=extra_info)
 834         elif result_type == 'url_transparent':
 835             # Use the information from the embedding page
 836             info = self.extract_info(
 837                 ie_result['url'], ie_key=ie_result.get('ie_key'),
 838                 extra_info=extra_info, download=False, process=False)
 839
 840             # extract_info may return None when ignoreerrors is enabled and
 841             # extraction failed with an error, don't crash and return early
 842             # in this case
 843             if not info:
 844                 return info
 845
 846             force_properties = dict(
 847                 (k, v) for k, v in ie_result.items() if v is not None)
 848             for f in ('_type', 'url', 'ie_key'):
 849                 if f in force_properties:
 850                     del force_properties[f]
 851             new_result = info.copy()
 852             new_result.update(force_properties)
 853
 854             # Extracted info may not be a video result (i.e.
 855             # info.get('_type', 'video') != video) but rather an url or
 856             # url_transparent. In such cases outer metadata (from ie_result)
 857             # should be propagated to inner one (info). For this to happen
 858             # _type of info should be overridden with url_transparent. This
 859             # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
 860             if new_result.get('_type') == 'url':
 861                 new_result['_type'] = 'url_transparent'
 862
 863             return self.process_ie_result(
 864                 new_result, download=download, extra_info=extra_info)
 865         elif result_type in ('playlist', 'multi_video'):
 866             # We process each entry in the playlist
 867             playlist = ie_result.get('title') or ie_result.get('id')
 868             self.to_screen('[download] Downloading playlist: %s' % playlist)
 869
 870             playlist_results = []
 871
 872             playliststart = self.params.get('playliststart', 1) - 1
 873             playlistend = self.params.get('playlistend')
 874             # For backwards compatibility, interpret -1 as whole list
 875             if playlistend == -1:
 876                 playlistend = None
 877
 878             playlistitems_str = self.params.get('playlist_items')
 879             playlistitems = None
 880             if playlistitems_str is not None:
 881                 def iter_playlistitems(format):
 882                     for string_segment in format.split(','):
 883                         if '-' in string_segment:
 884                             start, end = string_segment.split('-')
 885                             for item in range(int(start), int(end) + 1):
 886                                 yield int(item)
 887                         else:
 888                             yield int(string_segment)
 889                 playlistitems = iter_playlistitems(playlistitems_str)
 890
 891             ie_entries = ie_result['entries']
 892             if isinstance(ie_entries, list):
 893                 n_all_entries = len(ie_entries)
 894                 if playlistitems:
 895                     entries = [
 896                         ie_entries[i - 1] for i in playlistitems
 897                         if -n_all_entries <= i - 1 < n_all_entries]
 898                 else:
 899                     entries = ie_entries[playliststart:playlistend]
 900                 n_entries = len(entries)
 901                 self.to_screen(
 902                     '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
 903                     (ie_result['extractor'], playlist, n_all_entries, n_entries))
 904             elif isinstance(ie_entries, PagedList):
 905                 if playlistitems:
 906                     entries = []
 907                     for item in playlistitems:
 908                         entries.extend(ie_entries.getslice(
 909                             item - 1, item
 910                         ))
 911                 else:
 912                     entries = ie_entries.getslice(
 913                         playliststart, playlistend)
 914                 n_entries = len(entries)
 915                 self.to_screen(
 916                     '[%s] playlist %s: Downloading %d videos' %
 917                     (ie_result['extractor'], playlist, n_entries))
 918             else:  # iterable
 919                 if playlistitems:
 920                     entry_list = list(ie_entries)
 921                     entries = [entry_list[i - 1] for i in playlistitems]
 922                 else:
 923                     entries = list(itertools.islice(
 924                         ie_entries, playliststart, playlistend))
 925                 n_entries = len(entries)
 926                 self.to_screen(
 927                     '[%s] playlist %s: Downloading %d videos' %
 928                     (ie_result['extractor'], playlist, n_entries))
 929
 930             if self.params.get('playlistreverse', False):
 931                 entries = entries[::-1]
 932
 933             if self.params.get('playlistrandom', False):
 934                 random.shuffle(entries)
 935
 936             x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
 937
 938             for i, entry in enumerate(entries, 1):
 939                 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
 940                 # This __x_forwarded_for_ip thing is a bit ugly but requires
 941                 # minimal changes
 942                 if x_forwarded_for:
 943                     entry['__x_forwarded_for_ip'] = x_forwarded_for
 944                 extra = {
 945                     'n_entries': n_entries,
 946                     'playlist': playlist,
 947                     'playlist_id': ie_result.get('id'),
 948                     'playlist_title': ie_result.get('title'),
 949                     'playlist_index': i + playliststart,
 950                     'extractor': ie_result['extractor'],
 951                     'webpage_url': ie_result['webpage_url'],
 952                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
 953                     'extractor_key': ie_result['extractor_key'],
 954                 }
 955
 956                 reason = self._match_entry(entry, incomplete=True)
 957                 if reason is not None:
 958                     self.to_screen('[download] ' + reason)
 959                     continue
 960
 961                 entry_result = self.process_ie_result(entry,
 962                                                       download=download,
 963                                                       extra_info=extra)
 964                 playlist_results.append(entry_result)
 965             ie_result['entries'] = playlist_results
 966             self.to_screen('[download] Finished downloading playlist: %s' % playlist)
 967             return ie_result
 968         elif result_type == 'compat_list':
 969             self.report_warning(
 970                 'Extractor %s returned a compat_list result. '
 971                 'It needs to be updated.' % ie_result.get('extractor'))
 972
 973             def _fixup(r):
 974                 self.add_extra_info(
 975                     r,
 976                     {
 977                         'extractor': ie_result['extractor'],
 978                         'webpage_url': ie_result['webpage_url'],
 979                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
 980                         'extractor_key': ie_result['extractor_key'],
 981                     }
 982                 )
 983                 return r
 984             ie_result['entries'] = [
 985                 self.process_ie_result(_fixup(r), download, extra_info)
 986                 for r in ie_result['entries']
 987             ]
 988             return ie_result
 989         else:
 990             raise Exception('Invalid result type: %s' % result_type)
 991
 992     def _build_format_filter(self, filter_spec):
 993         " Returns a function to filter the formats according to the filter_spec "
 994
 995         OPERATORS = {
 996             '<': operator.lt,
 997             '<=': operator.le,
 998             '>': operator.gt,
 999             '>=': operator.ge,
1000             '=': operator.eq,
1001             '!=': operator.ne,
1002         }
1003         operator_rex = re.compile(r'''(?x)\s*
1004             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
1005             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1006             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1007             $
1008             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1009         m = operator_rex.search(filter_spec)
1010         if m:
1011             try:
1012                 comparison_value = int(m.group('value'))
1013             except ValueError:
1014                 comparison_value = parse_filesize(m.group('value'))
1015                 if comparison_value is None:
1016                     comparison_value = parse_filesize(m.group('value') + 'B')
1017                 if comparison_value is None:
1018                     raise ValueError(
1019                         'Invalid value %r in format specification %r' % (
1020                             m.group('value'), filter_spec))
1021             op = OPERATORS[m.group('op')]
1022
1023         if not m:
1024             STR_OPERATORS = {
1025                 '=': operator.eq,
1026                 '!=': operator.ne,
1027                 '^=': lambda attr, value: attr.startswith(value),
1028                 '$=': lambda attr, value: attr.endswith(value),
1029                 '*=': lambda attr, value: value in attr,
1030             }
1031             str_operator_rex = re.compile(r'''(?x)
1032                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1033                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1034                 \s*(?P<value>[a-zA-Z0-9._-]+)
1035                 \s*$
1036                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1037             m = str_operator_rex.search(filter_spec)
1038             if m:
1039                 comparison_value = m.group('value')
1040                 op = STR_OPERATORS[m.group('op')]
1041
1042         if not m:
1043             raise ValueError('Invalid filter specification %r' % filter_spec)
1044
1045         def _filter(f):
1046             actual_value = f.get(m.group('key'))
1047             if actual_value is None:
1048                 return m.group('none_inclusive')
1049             return op(actual_value, comparison_value)
1050         return _filter
1051
1052     def build_format_selector(self, format_spec):
1053         def syntax_error(note, start):
1054             message = (
1055                 'Invalid format specification: '
1056                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1057             return SyntaxError(message)
1058
1059         PICKFIRST = 'PICKFIRST'
1060         MERGE = 'MERGE'
1061         SINGLE = 'SINGLE'
1062         GROUP = 'GROUP'
1063         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1064
1065         def _parse_filter(tokens):
1066             filter_parts = []
1067             for type, string, start, _, _ in tokens:
1068                 if type == tokenize.OP and string == ']':
1069                     return ''.join(filter_parts)
1070                 else:
1071                     filter_parts.append(string)
1072
1073         def _remove_unused_ops(tokens):
1074             # Remove operators that we don't use and join them with the surrounding strings
1075             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1076             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1077             last_string, last_start, last_end, last_line = None, None, None, None
1078             for type, string, start, end, line in tokens:
1079                 if type == tokenize.OP and string == '[':
1080                     if last_string:
1081                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1082                         last_string = None
1083                     yield type, string, start, end, line
1084                     # everything inside brackets will be handled by _parse_filter
1085                     for type, string, start, end, line in tokens:
1086                         yield type, string, start, end, line
1087                         if type == tokenize.OP and string == ']':
1088                             break
1089                 elif type == tokenize.OP and string in ALLOWED_OPS:
1090                     if last_string:
1091                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1092                         last_string = None
1093                     yield type, string, start, end, line
1094                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1095                     if not last_string:
1096                         last_string = string
1097                         last_start = start
1098                         last_end = end
1099                     else:
1100                         last_string += string
1101             if last_string:
1102                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1103
1104         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1105             selectors = []
1106             current_selector = None
1107             for type, string, start, _, _ in tokens:
1108                 # ENCODING is only defined in python 3.x
1109                 if type == getattr(tokenize, 'ENCODING', None):
1110                     continue
1111                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1112                     current_selector = FormatSelector(SINGLE, string, [])
1113                 elif type == tokenize.OP:
1114                     if string == ')':
1115                         if not inside_group:
1116                             # ')' will be handled by the parentheses group
1117                             tokens.restore_last_token()
1118                         break
1119                     elif inside_merge and string in ['/', ',']:
1120                         tokens.restore_last_token()
1121                         break
1122                     elif inside_choice and string == ',':
1123                         tokens.restore_last_token()
1124                         break
1125                     elif string == ',':
1126                         if not current_selector:
1127                             raise syntax_error('"," must follow a format selector', start)
1128                         selectors.append(current_selector)
1129                         current_selector = None
1130                     elif string == '/':
1131                         if not current_selector:
1132                             raise syntax_error('"/" must follow a format selector', start)
1133                         first_choice = current_selector
1134                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1135                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1136                     elif string == '[':
1137                         if not current_selector:
1138                             current_selector = FormatSelector(SINGLE, 'best', [])
1139                         format_filter = _parse_filter(tokens)
1140                         current_selector.filters.append(format_filter)
1141                     elif string == '(':
1142                         if current_selector:
1143                             raise syntax_error('Unexpected "("', start)
1144                         group = _parse_format_selection(tokens, inside_group=True)
1145                         current_selector = FormatSelector(GROUP, group, [])
1146                     elif string == '+':
1147                         video_selector = current_selector
1148                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1149                         if not video_selector or not audio_selector:
1150                             raise syntax_error('"+" must be between two format selectors', start)
1151                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1152                     else:
1153                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1154                 elif type == tokenize.ENDMARKER:
1155                     break
1156             if current_selector:
1157                 selectors.append(current_selector)
1158             return selectors
1159
1160         def _build_selector_function(selector):
1161             if isinstance(selector, list):
1162                 fs = [_build_selector_function(s) for s in selector]
1163
1164                 def selector_function(ctx):
1165                     for f in fs:
1166                         for format in f(ctx):
1167                             yield format
1168                 return selector_function
1169             elif selector.type == GROUP:
1170                 selector_function = _build_selector_function(selector.selector)
1171             elif selector.type == PICKFIRST:
1172                 fs = [_build_selector_function(s) for s in selector.selector]
1173
1174                 def selector_function(ctx):
1175                     for f in fs:
1176                         picked_formats = list(f(ctx))
1177                         if picked_formats:
1178                             return picked_formats
1179                     return []
1180             elif selector.type == SINGLE:
1181                 format_spec = selector.selector
1182
1183                 def selector_function(ctx):
1184                     formats = list(ctx['formats'])
1185                     if not formats:
1186                         return
1187                     if format_spec == 'all':
1188                         for f in formats:
1189                             yield f
1190                     elif format_spec in ['best', 'worst', None]:
1191                         format_idx = 0 if format_spec == 'worst' else -1
1192                         audiovideo_formats = [
1193                             f for f in formats
1194                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1195                         if audiovideo_formats:
1196                             yield audiovideo_formats[format_idx]
1197                         # for extractors with incomplete formats (audio only (soundcloud)
1198                         # or video only (imgur)) we will fallback to best/worst
1199                         # {video,audio}-only format
1200                         elif ctx['incomplete_formats']:
1201                             yield formats[format_idx]
1202                     elif format_spec == 'bestaudio':
1203                         audio_formats = [
1204                             f for f in formats
1205                             if f.get('vcodec') == 'none']
1206                         if audio_formats:
1207                             yield audio_formats[-1]
1208                     elif format_spec == 'worstaudio':
1209                         audio_formats = [
1210                             f for f in formats
1211                             if f.get('vcodec') == 'none']
1212                         if audio_formats:
1213                             yield audio_formats[0]
1214                     elif format_spec == 'bestvideo':
1215                         video_formats = [
1216                             f for f in formats
1217                             if f.get('acodec') == 'none']
1218                         if video_formats:
1219                             yield video_formats[-1]
1220                     elif format_spec == 'worstvideo':
1221                         video_formats = [
1222                             f for f in formats
1223                             if f.get('acodec') == 'none']
1224                         if video_formats:
1225                             yield video_formats[0]
1226                     else:
1227                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1228                         if format_spec in extensions:
1229                             filter_f = lambda f: f['ext'] == format_spec
1230                         else:
1231                             filter_f = lambda f: f['format_id'] == format_spec
1232                         matches = list(filter(filter_f, formats))
1233                         if matches:
1234                             yield matches[-1]
1235             elif selector.type == MERGE:
1236                 def _merge(formats_info):
1237                     format_1, format_2 = [f['format_id'] for f in formats_info]
1238                     # The first format must contain the video and the
1239                     # second the audio
1240                     if formats_info[0].get('vcodec') == 'none':
1241                         self.report_error('The first format must '
1242                                           'contain the video, try using '
1243                                           '"-f %s+%s"' % (format_2, format_1))
1244                         return
1245                     # Formats must be opposite (video+audio)
1246                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1247                         self.report_error(
1248                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1249                             % (format_1, format_2))
1250                         return
1251                     output_ext = (
1252                         formats_info[0]['ext']
1253                         if self.params.get('merge_output_format') is None
1254                         else self.params['merge_output_format'])
1255                     return {
1256                         'requested_formats': formats_info,
1257                         'format': '%s+%s' % (formats_info[0].get('format'),
1258                                              formats_info[1].get('format')),
1259                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1260                                                 formats_info[1].get('format_id')),
1261                         'width': formats_info[0].get('width'),
1262                         'height': formats_info[0].get('height'),
1263                         'resolution': formats_info[0].get('resolution'),
1264                         'fps': formats_info[0].get('fps'),
1265                         'vcodec': formats_info[0].get('vcodec'),
1266                         'vbr': formats_info[0].get('vbr'),
1267                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1268                         'acodec': formats_info[1].get('acodec'),
1269                         'abr': formats_info[1].get('abr'),
1270                         'ext': output_ext,
1271                     }
1272                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1273
1274                 def selector_function(ctx):
1275                     for pair in itertools.product(
1276                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1277                         yield _merge(pair)
1278
1279             filters = [self._build_format_filter(f) for f in selector.filters]
1280
1281             def final_selector(ctx):
1282                 ctx_copy = copy.deepcopy(ctx)
1283                 for _filter in filters:
1284                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1285                 return selector_function(ctx_copy)
1286             return final_selector
1287
1288         stream = io.BytesIO(format_spec.encode('utf-8'))
1289         try:
1290             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1291         except tokenize.TokenError:
1292             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1293
1294         class TokenIterator(object):
1295             def __init__(self, tokens):
1296                 self.tokens = tokens
1297                 self.counter = 0
1298
1299             def __iter__(self):
1300                 return self
1301
1302             def __next__(self):
1303                 if self.counter >= len(self.tokens):
1304                     raise StopIteration()
1305                 value = self.tokens[self.counter]
1306                 self.counter += 1
1307                 return value
1308
1309             next = __next__
1310
1311             def restore_last_token(self):
1312                 self.counter -= 1
1313
1314         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1315         return _build_selector_function(parsed_selector)
1316
1317     def _calc_headers(self, info_dict):
1318         res = std_headers.copy()
1319
1320         add_headers = info_dict.get('http_headers')
1321         if add_headers:
1322             res.update(add_headers)
1323
1324         cookies = self._calc_cookies(info_dict)
1325         if cookies:
1326             res['Cookie'] = cookies
1327
1328         if 'X-Forwarded-For' not in res:
1329             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1330             if x_forwarded_for_ip:
1331                 res['X-Forwarded-For'] = x_forwarded_for_ip
1332
1333         return res
1334
1335     def _calc_cookies(self, info_dict):
1336         pr = sanitized_Request(info_dict['url'])
1337         self.cookiejar.add_cookie_header(pr)
1338         return pr.get_header('Cookie')
1339
    def process_video_result(self, info_dict, download=True):
        """Normalise a single video result, select its formats and optionally download them.

        info_dict is modified in place: missing fields (playlist info,
        thumbnails, display_id, upload_date, chapter/season/episode titles,
        per-format format_id/format/ext/protocol/http_headers) are filled in
        and URLs are sanitized.

        When one of the 'list_thumbnails', 'listsubtitles' or 'listformats'
        params is set, the corresponding listing is produced and the method
        returns None without selecting formats.

        Otherwise the formats matching the 'format' param (or a default
        selector) are computed; if download is true, process_info is called
        once per selected format.  Finally info_dict is updated with the last
        selected format and returned.

        Raises ExtractorError when 'id' or 'title' is missing, when there are
        no formats, when a format lacks a 'url', or when no format matches
        the requested selector.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        if not isinstance(info_dict['id'], compat_str):
            self.report_warning('"id" field is not a string - forcing string conversion')
            info_dict['id'] = compat_str(info_dict['id'])

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Build a thumbnails list from the single 'thumbnail' field if the
        # extractor did not provide a list
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Sort ascending by preference, width, height, id and url so that
            # the last element is the one used as the default thumbnail below
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '', t.get('url')))
            for i, t in enumerate(thumbnails):
                t['url'] = sanitize_url(t['url'])
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    # Fall back to the position in the sorted list
                    t['id'] = '%d' % i

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
            # see http://bugs.python.org/issue1646728)
            try:
                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
            except (ValueError, OverflowError, OSError):
                pass

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle URLs and derive a missing 'ext' from the URL
        subtitles = info_dict.get('subtitles')
        if subtitles:
            for _, subtitle in subtitles.items():
                for subtitle_format in subtitle:
                    if subtitle_format.get('url'):
                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                    if subtitle_format.get('ext') is None:
                        # NOTE(review): this indexes 'url' unconditionally, so an
                        # entry with neither 'ext' nor 'url' raises KeyError here
                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles,
            info_dict.get('automatic_captions'))

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        # Maps each (sanitized) format_id to the list of formats sharing it
        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            if 'url' not in format:
                raise ExtractorError('Missing "url" key in result (index %d)' % i)

            format['url'] = sanitize_url(format['url'])

            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            # No explicit format requested: prefer 'bestvideo+bestaudio' when
            # the output is not stdout, the video is not live and a merger is
            # usable, and always fall back to 'best'
            req_format_list = []
            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
                    not info_dict.get('is_live')):
                merger = FFmpegMergerPP(self)
                if merger.available and merger.can_merge():
                    req_format_list.append('bestvideo+bestaudio')
            req_format_list.append('best')
            req_format = '/'.join(req_format_list)
        format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/rg3/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/rg3/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
            # all formats are audio-only
            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                # Each selected format is processed on its own copy of the
                # info dict overlaid with the format's fields
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1544
1545     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1546         """Select the requested subtitles and their format"""
1547         available_subs = {}
1548         if normal_subtitles and self.params.get('writesubtitles'):
1549             available_subs.update(normal_subtitles)
1550         if automatic_captions and self.params.get('writeautomaticsub'):
1551             for lang, cap_info in automatic_captions.items():
1552                 if lang not in available_subs:
1553                     available_subs[lang] = cap_info
1554
1555         if (not self.params.get('writesubtitles') and not
1556                 self.params.get('writeautomaticsub') or not
1557                 available_subs):
1558             return None
1559
1560         if self.params.get('allsubtitles', False):
1561             requested_langs = available_subs.keys()
1562         else:
1563             if self.params.get('subtitleslangs', False):
1564                 requested_langs = self.params.get('subtitleslangs')
1565             elif 'en' in available_subs:
1566                 requested_langs = ['en']
1567             else:
1568                 requested_langs = [list(available_subs.keys())[0]]
1569
1570         formats_query = self.params.get('subtitlesformat', 'best')
1571         formats_preference = formats_query.split('/') if formats_query else []
1572         subs = {}
1573         for lang in requested_langs:
1574             formats = available_subs.get(lang)
1575             if formats is None:
1576                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1577                 continue
1578             for ext in formats_preference:
1579                 if ext == 'best':
1580                     f = formats[-1]
1581                     break
1582                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1583                 if matches:
1584                     f = matches[-1]
1585                     break
1586             else:
1587                 f = formats[-1]
1588                 self.report_warning(
1589                     'No subtitle format found matching "%s" for language %s, '
1590                     'using %s' % (formats_query, lang, f['ext']))
1591             subs[lang] = f
1592         return subs
1593
1594     def process_info(self, info_dict):
1595         """Process a single resolved IE result."""
1596
1597         assert info_dict.get('_type', 'video') == 'video'
1598
1599         max_downloads = self.params.get('max_downloads')
1600         if max_downloads is not None:
1601             if self._num_downloads >= int(max_downloads):
1602                 raise MaxDownloadsReached()
1603
1604         info_dict['fulltitle'] = info_dict['title']
1605         if len(info_dict['title']) > 200:
1606             info_dict['title'] = info_dict['title'][:197] + '...'
1607
1608         if 'format' not in info_dict:
1609             info_dict['format'] = info_dict['ext']
1610
1611         reason = self._match_entry(info_dict, incomplete=False)
1612         if reason is not None:
1613             self.to_screen('[download] ' + reason)
1614             return
1615
1616         self._num_downloads += 1
1617
1618         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1619
1620         # Forced printings
1621         if self.params.get('forcetitle', False):
1622             self.to_stdout(info_dict['fulltitle'])
1623         if self.params.get('forceid', False):
1624             self.to_stdout(info_dict['id'])
1625         if self.params.get('forceurl', False):
1626             if info_dict.get('requested_formats') is not None:
1627                 for f in info_dict['requested_formats']:
1628                     self.to_stdout(f['url'] + f.get('play_path', ''))
1629             else:
1630                 # For RTMP URLs, also include the playpath
1631                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1632         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1633             self.to_stdout(info_dict['thumbnail'])
1634         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1635             self.to_stdout(info_dict['description'])
1636         if self.params.get('forcefilename', False) and filename is not None:
1637             self.to_stdout(filename)
1638         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1639             self.to_stdout(formatSeconds(info_dict['duration']))
1640         if self.params.get('forceformat', False):
1641             self.to_stdout(info_dict['format'])
1642         if self.params.get('forcejson', False):
1643             self.to_stdout(json.dumps(info_dict))
1644
1645         # Do nothing else if in simulate mode
1646         if self.params.get('simulate', False):
1647             return
1648
1649         if filename is None:
1650             return
1651
1652         try:
1653             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1654             if dn and not os.path.exists(dn):
1655                 os.makedirs(dn)
1656         except (OSError, IOError) as err:
1657             self.report_error('unable to create directory ' + error_to_compat_str(err))
1658             return
1659
1660         if self.params.get('writedescription', False):
1661             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1662             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1663                 self.to_screen('[info] Video description is already present')
1664             elif info_dict.get('description') is None:
1665                 self.report_warning('There\'s no description to write.')
1666             else:
1667                 try:
1668                     self.to_screen('[info] Writing video description to: ' + descfn)
1669                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1670                         descfile.write(info_dict['description'])
1671                 except (OSError, IOError):
1672                     self.report_error('Cannot write description file ' + descfn)
1673                     return
1674
1675         if self.params.get('writeannotations', False):
1676             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1677             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1678                 self.to_screen('[info] Video annotations are already present')
1679             else:
1680                 try:
1681                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1682                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1683                         annofile.write(info_dict['annotations'])
1684                 except (KeyError, TypeError):
1685                     self.report_warning('There are no annotations to write.')
1686                 except (OSError, IOError):
1687                     self.report_error('Cannot write annotations file: ' + annofn)
1688                     return
1689
1690         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1691                                        self.params.get('writeautomaticsub')])
1692
1693         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1694             # subtitles download errors are already managed as troubles in relevant IE
1695             # that way it will silently go on when used with unsupporting IE
1696             subtitles = info_dict['requested_subtitles']
1697             ie = self.get_info_extractor(info_dict['extractor_key'])
1698             for sub_lang, sub_info in subtitles.items():
1699                 sub_format = sub_info['ext']
1700                 if sub_info.get('data') is not None:
1701                     sub_data = sub_info['data']
1702                 else:
1703                     try:
1704                         sub_data = ie._download_webpage(
1705                             sub_info['url'], info_dict['id'], note=False)
1706                     except ExtractorError as err:
1707                         self.report_warning('Unable to download subtitle for "%s": %s' %
1708                                             (sub_lang, error_to_compat_str(err.cause)))
1709                         continue
1710                 try:
1711                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1712                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1713                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1714                     else:
1715                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1716                         # Use newline='' to prevent conversion of newline characters
1717                         # See https://github.com/rg3/youtube-dl/issues/10268
1718                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1719                             subfile.write(sub_data)
1720                 except (OSError, IOError):
1721                     self.report_error('Cannot write subtitles file ' + sub_filename)
1722                     return
1723
1724         if self.params.get('writeinfojson', False):
1725             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1726             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1727                 self.to_screen('[info] Video description metadata is already present')
1728             else:
1729                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1730                 try:
1731                     write_json_file(self.filter_requested_info(info_dict), infofn)
1732                 except (OSError, IOError):
1733                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1734                     return
1735
1736         self._write_thumbnails(info_dict, filename)
1737
1738         if not self.params.get('skip_download', False):
1739             try:
1740                 def dl(name, info):
1741                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1742                     for ph in self._progress_hooks:
1743                         fd.add_progress_hook(ph)
1744                     if self.params.get('verbose'):
1745                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1746                     return fd.download(name, info)
1747
1748                 if info_dict.get('requested_formats') is not None:
1749                     downloaded = []
1750                     success = True
1751                     merger = FFmpegMergerPP(self)
1752                     if not merger.available:
1753                         postprocessors = []
1754                         self.report_warning('You have requested multiple '
1755                                             'formats but ffmpeg or avconv are not installed.'
1756                                             ' The formats won\'t be merged.')
1757                     else:
1758                         postprocessors = [merger]
1759
1760                     def compatible_formats(formats):
1761                         video, audio = formats
1762                         # Check extension
1763                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1764                         if video_ext and audio_ext:
1765                             COMPATIBLE_EXTS = (
1766                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1767                                 ('webm')
1768                             )
1769                             for exts in COMPATIBLE_EXTS:
1770                                 if video_ext in exts and audio_ext in exts:
1771                                     return True
1772                         # TODO: Check acodec/vcodec
1773                         return False
1774
1775                     filename_real_ext = os.path.splitext(filename)[1][1:]
1776                     filename_wo_ext = (
1777                         os.path.splitext(filename)[0]
1778                         if filename_real_ext == info_dict['ext']
1779                         else filename)
1780                     requested_formats = info_dict['requested_formats']
1781                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1782                         info_dict['ext'] = 'mkv'
1783                         self.report_warning(
1784                             'Requested formats are incompatible for merge and will be merged into mkv.')
1785                     # Ensure filename always has a correct extension for successful merge
1786                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1787                     if os.path.exists(encodeFilename(filename)):
1788                         self.to_screen(
1789                             '[download] %s has already been downloaded and '
1790                             'merged' % filename)
1791                     else:
1792                         for f in requested_formats:
1793                             new_info = dict(info_dict)
1794                             new_info.update(f)
1795                             fname = self.prepare_filename(new_info)
1796                             fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1797                             downloaded.append(fname)
1798                             partial_success = dl(fname, new_info)
1799                             success = success and partial_success
1800                         info_dict['__postprocessors'] = postprocessors
1801                         info_dict['__files_to_merge'] = downloaded
1802                 else:
1803                     # Just a single file
1804                     success = dl(filename, info_dict)
1805             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1806                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1807                 return
1808             except (OSError, IOError) as err:
1809                 raise UnavailableVideoError(err)
1810             except (ContentTooShortError, ) as err:
1811                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1812                 return
1813
1814             if success and filename != '-':
1815                 # Fixup content
1816                 fixup_policy = self.params.get('fixup')
1817                 if fixup_policy is None:
1818                     fixup_policy = 'detect_or_warn'
1819
1820                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1821
1822                 stretched_ratio = info_dict.get('stretched_ratio')
1823                 if stretched_ratio is not None and stretched_ratio != 1:
1824                     if fixup_policy == 'warn':
1825                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1826                             info_dict['id'], stretched_ratio))
1827                     elif fixup_policy == 'detect_or_warn':
1828                         stretched_pp = FFmpegFixupStretchedPP(self)
1829                         if stretched_pp.available:
1830                             info_dict.setdefault('__postprocessors', [])
1831                             info_dict['__postprocessors'].append(stretched_pp)
1832                         else:
1833                             self.report_warning(
1834                                 '%s: Non-uniform pixel ratio (%s). %s'
1835                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1836                     else:
1837                         assert fixup_policy in ('ignore', 'never')
1838
1839                 if (info_dict.get('requested_formats') is None and
1840                         info_dict.get('container') == 'm4a_dash'):
1841                     if fixup_policy == 'warn':
1842                         self.report_warning(
1843                             '%s: writing DASH m4a. '
1844                             'Only some players support this container.'
1845                             % info_dict['id'])
1846                     elif fixup_policy == 'detect_or_warn':
1847                         fixup_pp = FFmpegFixupM4aPP(self)
1848                         if fixup_pp.available:
1849                             info_dict.setdefault('__postprocessors', [])
1850                             info_dict['__postprocessors'].append(fixup_pp)
1851                         else:
1852                             self.report_warning(
1853                                 '%s: writing DASH m4a. '
1854                                 'Only some players support this container. %s'
1855                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1856                     else:
1857                         assert fixup_policy in ('ignore', 'never')
1858
1859                 if (info_dict.get('protocol') == 'm3u8_native' or
1860                         info_dict.get('protocol') == 'm3u8' and
1861                         self.params.get('hls_prefer_native')):
1862                     if fixup_policy == 'warn':
1863                         self.report_warning('%s: malformated aac bitstream.' % (
1864                             info_dict['id']))
1865                     elif fixup_policy == 'detect_or_warn':
1866                         fixup_pp = FFmpegFixupM3u8PP(self)
1867                         if fixup_pp.available:
1868                             info_dict.setdefault('__postprocessors', [])
1869                             info_dict['__postprocessors'].append(fixup_pp)
1870                         else:
1871                             self.report_warning(
1872                                 '%s: malformated aac bitstream. %s'
1873                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1874                     else:
1875                         assert fixup_policy in ('ignore', 'never')
1876
1877                 try:
1878                     self.post_process(filename, info_dict)
1879                 except (PostProcessingError) as err:
1880                     self.report_error('postprocessing: %s' % str(err))
1881                     return
1882                 self.record_download_archive(info_dict)
1883
1884     def download(self, url_list):
1885         """Download a given list of URLs."""
1886         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1887         if (len(url_list) > 1 and
1888                 outtmpl != '-' and
1889                 '%' not in outtmpl and
1890                 self.params.get('max_downloads') != 1):
1891             raise SameFileError(outtmpl)
1892
1893         for url in url_list:
1894             try:
1895                 # It also downloads the videos
1896                 res = self.extract_info(
1897                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1898             except UnavailableVideoError:
1899                 self.report_error('unable to download video')
1900             except MaxDownloadsReached:
1901                 self.to_screen('[info] Maximum number of downloaded files reached.')
1902                 raise
1903             else:
1904                 if self.params.get('dump_single_json', False):
1905                     self.to_stdout(json.dumps(res))
1906
1907         return self._download_retcode
1908
1909     def download_with_info_file(self, info_filename):
1910         with contextlib.closing(fileinput.FileInput(
1911                 [info_filename], mode='r',
1912                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1913             # FileInput doesn't have a read method, we can't call json.load
1914             info = self.filter_requested_info(json.loads('\n'.join(f)))
1915         try:
1916             self.process_ie_result(info, download=True)
1917         except DownloadError:
1918             webpage_url = info.get('webpage_url')
1919             if webpage_url is not None:
1920                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1921                 return self.download([webpage_url])
1922             else:
1923                 raise
1924         return self._download_retcode
1925
1926     @staticmethod
1927     def filter_requested_info(info_dict):
1928         return dict(
1929             (k, v) for k, v in info_dict.items()
1930             if k not in ['requested_formats', 'requested_subtitles'])
1931
1932     def post_process(self, filename, ie_info):
1933         """Run all the postprocessors on the given file."""
1934         info = dict(ie_info)
1935         info['filepath'] = filename
1936         pps_chain = []
1937         if ie_info.get('__postprocessors') is not None:
1938             pps_chain.extend(ie_info['__postprocessors'])
1939         pps_chain.extend(self._pps)
1940         for pp in pps_chain:
1941             files_to_delete = []
1942             try:
1943                 files_to_delete, info = pp.run(info)
1944             except PostProcessingError as e:
1945                 self.report_error(e.msg)
1946             if files_to_delete and not self.params.get('keepvideo', False):
1947                 for old_filename in files_to_delete:
1948                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1949                     try:
1950                         os.remove(encodeFilename(old_filename))
1951                     except (IOError, OSError):
1952                         self.report_warning('Unable to remove downloaded original file')
1953
1954     def _make_archive_id(self, info_dict):
1955         # Future-proof against any change in case
1956         # and backwards compatibility with prior versions
1957         extractor = info_dict.get('extractor_key')
1958         if extractor is None:
1959             if 'id' in info_dict:
1960                 extractor = info_dict.get('ie_key')  # key in a playlist
1961         if extractor is None:
1962             return None  # Incomplete video information
1963         return extractor.lower() + ' ' + info_dict['id']
1964
1965     def in_download_archive(self, info_dict):
1966         fn = self.params.get('download_archive')
1967         if fn is None:
1968             return False
1969
1970         vid_id = self._make_archive_id(info_dict)
1971         if vid_id is None:
1972             return False  # Incomplete video information
1973
1974         try:
1975             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1976                 for line in archive_file:
1977                     if line.strip() == vid_id:
1978                         return True
1979         except IOError as ioe:
1980             if ioe.errno != errno.ENOENT:
1981                 raise
1982         return False
1983
1984     def record_download_archive(self, info_dict):
1985         fn = self.params.get('download_archive')
1986         if fn is None:
1987             return
1988         vid_id = self._make_archive_id(info_dict)
1989         assert vid_id
1990         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1991             archive_file.write(vid_id + '\n')
1992
1993     @staticmethod
1994     def format_resolution(format, default='unknown'):
1995         if format.get('vcodec') == 'none':
1996             return 'audio only'
1997         if format.get('resolution') is not None:
1998             return format['resolution']
1999         if format.get('height') is not None:
2000             if format.get('width') is not None:
2001                 res = '%sx%s' % (format['width'], format['height'])
2002             else:
2003                 res = '%sp' % format['height']
2004         elif format.get('width') is not None:
2005             res = '%dx?' % format['width']
2006         else:
2007             res = default
2008         return res
2009
2010     def _format_note(self, fdict):
2011         res = ''
2012         if fdict.get('ext') in ['f4f', 'f4m']:
2013             res += '(unsupported) '
2014         if fdict.get('language'):
2015             if res:
2016                 res += ' '
2017             res += '[%s] ' % fdict['language']
2018         if fdict.get('format_note') is not None:
2019             res += fdict['format_note'] + ' '
2020         if fdict.get('tbr') is not None:
2021             res += '%4dk ' % fdict['tbr']
2022         if fdict.get('container') is not None:
2023             if res:
2024                 res += ', '
2025             res += '%s container' % fdict['container']
2026         if (fdict.get('vcodec') is not None and
2027                 fdict.get('vcodec') != 'none'):
2028             if res:
2029                 res += ', '
2030             res += fdict['vcodec']
2031             if fdict.get('vbr') is not None:
2032                 res += '@'
2033         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
2034             res += 'video@'
2035         if fdict.get('vbr') is not None:
2036             res += '%4dk' % fdict['vbr']
2037         if fdict.get('fps') is not None:
2038             if res:
2039                 res += ', '
2040             res += '%sfps' % fdict['fps']
2041         if fdict.get('acodec') is not None:
2042             if res:
2043                 res += ', '
2044             if fdict['acodec'] == 'none':
2045                 res += 'video only'
2046             else:
2047                 res += '%-5s' % fdict['acodec']
2048         elif fdict.get('abr') is not None:
2049             if res:
2050                 res += ', '
2051             res += 'audio'
2052         if fdict.get('abr') is not None:
2053             res += '@%3dk' % fdict['abr']
2054         if fdict.get('asr') is not None:
2055             res += ' (%5dHz)' % fdict['asr']
2056         if fdict.get('filesize') is not None:
2057             if res:
2058                 res += ', '
2059             res += format_bytes(fdict['filesize'])
2060         elif fdict.get('filesize_approx') is not None:
2061             if res:
2062                 res += ', '
2063             res += '~' + format_bytes(fdict['filesize_approx'])
2064         return res
2065
2066     def list_formats(self, info_dict):
2067         formats = info_dict.get('formats', [info_dict])
2068         table = [
2069             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2070             for f in formats
2071             if f.get('preference') is None or f['preference'] >= -1000]
2072         if len(formats) > 1:
2073             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2074
2075         header_line = ['format code', 'extension', 'resolution', 'note']
2076         self.to_screen(
2077             '[info] Available formats for %s:\n%s' %
2078             (info_dict['id'], render_table(header_line, table)))
2079
2080     def list_thumbnails(self, info_dict):
2081         thumbnails = info_dict.get('thumbnails')
2082         if not thumbnails:
2083             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2084             return
2085
2086         self.to_screen(
2087             '[info] Thumbnails for %s:' % info_dict['id'])
2088         self.to_screen(render_table(
2089             ['ID', 'width', 'height', 'URL'],
2090             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2091
2092     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2093         if not subtitles:
2094             self.to_screen('%s has no %s' % (video_id, name))
2095             return
2096         self.to_screen(
2097             'Available %s for %s:' % (name, video_id))
2098         self.to_screen(render_table(
2099             ['Language', 'formats'],
2100             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2101                 for lang, formats in subtitles.items()]))
2102
2103     def urlopen(self, req):
2104         """ Start an HTTP download """
2105         if isinstance(req, compat_basestring):
2106             req = sanitized_Request(req)
2107         return self._opener.open(req, timeout=self._socket_timeout)
2108
2109     def print_debug_header(self):
2110         if not self.params.get('verbose'):
2111             return
2112
2113         if type('') is not compat_str:
2114             # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
2115             self.report_warning(
2116                 'Your Python is broken! Update to a newer and supported version')
2117
2118         stdout_encoding = getattr(
2119             sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
2120         encoding_str = (
2121             '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
2122                 locale.getpreferredencoding(),
2123                 sys.getfilesystemencoding(),
2124                 stdout_encoding,
2125                 self.get_encoding()))
2126         write_string(encoding_str, encoding=None)
2127
2128         self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
2129         if _LAZY_LOADER:
2130             self._write_string('[debug] Lazy loading extractors enabled' + '\n')
2131         try:
2132             sp = subprocess.Popen(
2133                 ['git', 'rev-parse', '--short', 'HEAD'],
2134                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
2135                 cwd=os.path.dirname(os.path.abspath(__file__)))
2136             out, err = sp.communicate()
2137             out = out.decode().strip()
2138             if re.match('[0-9a-f]+', out):
2139                 self._write_string('[debug] Git HEAD: ' + out + '\n')
2140         except Exception:
2141             try:
2142                 sys.exc_clear()
2143             except Exception:
2144                 pass
2145         self._write_string('[debug] Python version %s - %s\n' % (
2146             platform.python_version(), platform_name()))
2147
2148         exe_versions = FFmpegPostProcessor.get_versions(self)
2149         exe_versions['rtmpdump'] = rtmpdump_version()
2150         exe_versions['phantomjs'] = PhantomJSwrapper._version()
2151         exe_str = ', '.join(
2152             '%s %s' % (exe, v)
2153             for exe, v in sorted(exe_versions.items())
2154             if v
2155         )
2156         if not exe_str:
2157             exe_str = 'none'
2158         self._write_string('[debug] exe versions: %s\n' % exe_str)
2159
2160         proxy_map = {}
2161         for handler in self._opener.handlers:
2162             if hasattr(handler, 'proxies'):
2163                 proxy_map.update(handler.proxies)
2164         self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
2165
2166         if self.params.get('call_home', False):
2167             ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
2168             self._write_string('[debug] Public IP address: %s\n' % ipaddr)
2169             latest_version = self.urlopen(
2170                 'https://yt-dl.org/latest/version').read().decode('utf-8')
2171             if version_tuple(latest_version) > version_tuple(__version__):
2172                 self.report_warning(
2173                     'You are using an outdated version (newest version: %s)! '
2174                     'See https://yt-dl.org/update if you need help updating.' %
2175                     latest_version)
2176
2177     def _setup_opener(self):
2178         timeout_val = self.params.get('socket_timeout')
2179         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2180
2181         opts_cookiefile = self.params.get('cookiefile')
2182         opts_proxy = self.params.get('proxy')
2183
2184         if opts_cookiefile is None:
2185             self.cookiejar = compat_cookiejar.CookieJar()
2186         else:
2187             opts_cookiefile = expand_path(opts_cookiefile)
2188             self.cookiejar = compat_cookiejar.MozillaCookieJar(
2189                 opts_cookiefile)
2190             if os.access(opts_cookiefile, os.R_OK):
2191                 self.cookiejar.load()
2192
2193         cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2194         if opts_proxy is not None:
2195             if opts_proxy == '':
2196                 proxies = {}
2197             else:
2198                 proxies = {'http': opts_proxy, 'https': opts_proxy}
2199         else:
2200             proxies = compat_urllib_request.getproxies()
2201             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
2202             if 'http' in proxies and 'https' not in proxies:
2203                 proxies['https'] = proxies['http']
2204         proxy_handler = PerRequestProxyHandler(proxies)
2205
2206         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2207         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2208         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2209         data_handler = compat_urllib_request_DataHandler()
2210
2211         # When passing our own FileHandler instance, build_opener won't add the
2212         # default FileHandler and allows us to disable the file protocol, which
2213         # can be used for malicious purposes (see
2214         # https://github.com/rg3/youtube-dl/issues/8227)
2215         file_handler = compat_urllib_request.FileHandler()
2216
2217         def file_open(*args, **kwargs):
2218             raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
2219         file_handler.file_open = file_open
2220
2221         opener = compat_urllib_request.build_opener(
2222             proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
2223
2224         # Delete the default user-agent header, which would otherwise apply in
2225         # cases where our custom HTTP handler doesn't come into play
2226         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
2227         opener.addheaders = []
2228         self._opener = opener
2229
2230     def encode(self, s):
2231         if isinstance(s, bytes):
2232             return s  # Already encoded
2233
2234         try:
2235             return s.encode(self.get_encoding())
2236         except UnicodeEncodeError as err:
2237             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2238             raise
2239
2240     def get_encoding(self):
2241         encoding = self.params.get('encoding')
2242         if encoding is None:
2243             encoding = preferredencoding()
2244         return encoding
2245
2246     def _write_thumbnails(self, info_dict, filename):
2247         if self.params.get('writethumbnail', False):
2248             thumbnails = info_dict.get('thumbnails')
2249             if thumbnails:
2250                 thumbnails = [thumbnails[-1]]
2251         elif self.params.get('write_all_thumbnails', False):
2252             thumbnails = info_dict.get('thumbnails')
2253         else:
2254             return
2255
2256         if not thumbnails:
2257             # No thumbnails present, so return immediately
2258             return
2259
2260         for t in thumbnails:
2261             thumb_ext = determine_ext(t['url'], 'jpg')
2262             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2263             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2264             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2265
2266             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2267                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2268                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2269             else:
2270                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2271                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2272                 try:
2273                     uf = self.urlopen(t['url'])
2274                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2275                         shutil.copyfileobj(uf, thumbf)
2276                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2277                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2278                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2279                     self.report_warning('Unable to download thumbnail "%s": %s' %
2280                                         (t['url'], error_to_compat_str(err)))