[YoutubeDL] Don't expand env variables in meta fields (closes #13637)
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import string
24 import sys
25 import time
26 import tokenize
27 import traceback
28 import random
29
30 from .compat import (
31     compat_basestring,
32     compat_cookiejar,
33     compat_get_terminal_size,
34     compat_http_client,
35     compat_kwargs,
36     compat_numeric_types,
37     compat_os_name,
38     compat_str,
39     compat_tokenize_tokenize,
40     compat_urllib_error,
41     compat_urllib_request,
42     compat_urllib_request_DataHandler,
43 )
44 from .utils import (
45     age_restricted,
46     args_to_str,
47     ContentTooShortError,
48     date_from_str,
49     DateRange,
50     DEFAULT_OUTTMPL,
51     determine_ext,
52     determine_protocol,
53     DownloadError,
54     encode_compat_str,
55     encodeFilename,
56     error_to_compat_str,
57     expand_path,
58     ExtractorError,
59     format_bytes,
60     formatSeconds,
61     GeoRestrictedError,
62     int_or_none,
63     ISO3166Utils,
64     locked_file,
65     make_HTTPS_handler,
66     MaxDownloadsReached,
67     PagedList,
68     parse_filesize,
69     PerRequestProxyHandler,
70     platform_name,
71     PostProcessingError,
72     preferredencoding,
73     prepend_extension,
74     register_socks_protocols,
75     render_table,
76     replace_extension,
77     SameFileError,
78     sanitize_filename,
79     sanitize_path,
80     sanitize_url,
81     sanitized_Request,
82     std_headers,
83     subtitles_filename,
84     UnavailableVideoError,
85     url_basename,
86     version_tuple,
87     write_json_file,
88     write_string,
89     YoutubeDLCookieProcessor,
90     YoutubeDLHandler,
91 )
92 from .cache import Cache
93 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
94 from .downloader import get_suitable_downloader
95 from .downloader.rtmp import rtmpdump_version
96 from .postprocessor import (
97     FFmpegFixupM3u8PP,
98     FFmpegFixupM4aPP,
99     FFmpegFixupStretchedPP,
100     FFmpegMergerPP,
101     FFmpegPostProcessor,
102     get_postprocessor,
103 )
104 from .version import __version__
105
106 if compat_os_name == 'nt':
107     import ctypes
108
109
110 class YoutubeDL(object):
111     """YoutubeDL class.
112
    YoutubeDL objects are the ones responsible for downloading the
114     actual video file and writing it to disk if the user has requested
115     it, among some other tasks. In most cases there should be one per
116     program. As, given a video URL, the downloader doesn't know how to
117     extract all the needed information, task that InfoExtractors do, it
118     has to pass the URL to one of them.
119
120     For this, YoutubeDL objects have a method that allows
121     InfoExtractors to be registered in a given order. When it is passed
122     a URL, the YoutubeDL object handles it to the first InfoExtractor it
123     finds that reports being able to handle it. The InfoExtractor extracts
124     all the information about the video or videos the URL refers to, and
125     YoutubeDL process the extracted information, possibly using a File
126     Downloader to download the video.
127
128     YoutubeDL objects accept a lot of parameters. In order not to saturate
129     the object constructor with arguments, it receives a dictionary of
130     options instead. These options are available through the params
131     attribute for the InfoExtractors to use. The YoutubeDL also
132     registers itself as the downloader in charge for the InfoExtractors
133     that are added to it, so this is a "mutual registration".
134
135     Available options:
136
137     username:          Username for authentication purposes.
138     password:          Password for authentication purposes.
139     videopassword:     Password for accessing a video.
140     ap_mso:            Adobe Pass multiple-system operator identifier.
141     ap_username:       Multiple-system operator account username.
142     ap_password:       Multiple-system operator account password.
143     usenetrc:          Use netrc for authentication instead.
144     verbose:           Print additional info to stdout.
145     quiet:             Do not print messages to stdout.
146     no_warnings:       Do not print out anything for warnings.
147     forceurl:          Force printing final URL.
148     forcetitle:        Force printing title.
149     forceid:           Force printing ID.
150     forcethumbnail:    Force printing thumbnail URL.
151     forcedescription:  Force printing description.
152     forcefilename:     Force printing final filename.
153     forceduration:     Force printing duration.
154     forcejson:         Force printing info_dict as JSON.
155     dump_single_json:  Force printing the info_dict of the whole playlist
156                        (or video) as a single JSON line.
157     simulate:          Do not download the video files.
158     format:            Video format code. See options.py for more information.
159     outtmpl:           Template for output names.
160     restrictfilenames: Do not allow "&" and spaces in file names
161     ignoreerrors:      Do not stop on download errors.
162     force_generic_extractor: Force downloader to use the generic extractor
163     nooverwrites:      Prevent overwriting files.
164     playliststart:     Playlist item to start at.
165     playlistend:       Playlist item to end at.
166     playlist_items:    Specific indices of playlist to download.
167     playlistreverse:   Download playlist items in reverse order.
168     playlistrandom:    Download playlist items in random order.
169     matchtitle:        Download only matching titles.
170     rejecttitle:       Reject downloads for matching titles.
171     logger:            Log messages to a logging.Logger instance.
172     logtostderr:       Log messages to stderr instead of stdout.
173     writedescription:  Write the video description to a .description file
174     writeinfojson:     Write the video description to a .info.json file
175     writeannotations:  Write the video annotations to a .annotations.xml file
176     writethumbnail:    Write the thumbnail image to a file
177     write_all_thumbnails:  Write all thumbnail formats to files
178     writesubtitles:    Write the video subtitles to a file
179     writeautomaticsub: Write the automatically generated subtitles to a file
180     allsubtitles:      Downloads all the subtitles of the video
181                        (requires writesubtitles or writeautomaticsub)
182     listsubtitles:     Lists all available subtitles for the video
183     subtitlesformat:   The format code for subtitles
184     subtitleslangs:    List of languages of the subtitles to download
185     keepvideo:         Keep the video file after post-processing
186     daterange:         A DateRange object, download only if the upload_date is in the range.
187     skip_download:     Skip the actual download of the video file
188     cachedir:          Location of the cache files in the filesystem.
189                        False to disable filesystem cache.
190     noplaylist:        Download single video instead of a playlist if in doubt.
191     age_limit:         An integer representing the user's age in years.
192                        Unsuitable videos for the given age are skipped.
193     min_views:         An integer representing the minimum view count the video
194                        must have in order to not be skipped.
195                        Videos without view count information are always
196                        downloaded. None for no limit.
197     max_views:         An integer representing the maximum view count.
198                        Videos that are more popular than that are not
199                        downloaded.
200                        Videos without view count information are always
201                        downloaded. None for no limit.
202     download_archive:  File name of a file where all downloads are recorded.
203                        Videos already present in the file are not downloaded
204                        again.
205     cookiefile:        File name where cookies should be read from and dumped to.
206     nocheckcertificate:Do not verify SSL certificates
207     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
208                        At the moment, this is only supported by YouTube.
209     proxy:             URL of the proxy server to use
210     geo_verification_proxy:  URL of the proxy to use for IP address verification
211                        on geo-restricted sites. (Experimental)
212     socket_timeout:    Time to wait for unresponsive hosts, in seconds
213     bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
215     debug_printtraffic:Print out sent and received HTTP traffic
216     include_ads:       Download ads as well
217     default_search:    Prepend this string if an input url is not valid.
218                        'auto' for elaborate guessing
219     encoding:          Use this encoding instead of the system-specified.
220     extract_flat:      Do not resolve URLs, return the immediate result.
221                        Pass in 'in_playlist' to only show this behavior for
222                        playlist items.
223     postprocessors:    A list of dictionaries, each with an entry
224                        * key:  The name of the postprocessor. See
225                                youtube_dl/postprocessor/__init__.py for a list.
226                        as well as any further keyword arguments for the
227                        postprocessor.
228     progress_hooks:    A list of functions that get called on download
229                        progress, with a dictionary with the entries
230                        * status: One of "downloading", "error", or "finished".
231                                  Check this first and ignore unknown values.
232
233                        If status is one of "downloading", or "finished", the
234                        following properties may also be present:
235                        * filename: The final filename (always present)
236                        * tmpfilename: The filename we're currently writing to
237                        * downloaded_bytes: Bytes on disk
238                        * total_bytes: Size of the whole file, None if unknown
239                        * total_bytes_estimate: Guess of the eventual file size,
240                                                None if unavailable.
241                        * elapsed: The number of seconds since download started.
242                        * eta: The estimated time in seconds, None if unknown
243                        * speed: The download speed in bytes/second, None if
244                                 unknown
245                        * fragment_index: The counter of the currently
246                                          downloaded video fragment.
247                        * fragment_count: The number of fragments (= individual
248                                          files that will be merged)
249
250                        Progress hooks are guaranteed to be called at least once
251                        (with status "finished") if the download is successful.
252     merge_output_format: Extension to use when merging formats.
253     fixup:             Automatically correct known faults of the file.
254                        One of:
255                        - "never": do nothing
256                        - "warn": only emit a warning
257                        - "detect_or_warn": check whether we can do anything
258                                            about it, warn otherwise (default)
259     source_address:    (Experimental) Client-side IP address to bind to.
260     call_home:         Boolean, true iff we are allowed to contact the
261                        youtube-dl servers for debugging.
262     sleep_interval:    Number of seconds to sleep before each download when
263                        used alone or a lower bound of a range for randomized
264                        sleep before each download (minimum possible number
265                        of seconds to sleep) when used along with
266                        max_sleep_interval.
267     max_sleep_interval:Upper bound of a range for randomized sleep before each
268                        download (maximum possible number of seconds to sleep).
269                        Must only be used along with sleep_interval.
270                        Actual sleep time will be a random float from range
271                        [sleep_interval; max_sleep_interval].
272     listformats:       Print an overview of available video formats and exit.
273     list_thumbnails:   Print a table of all thumbnails and exit.
274     match_filter:      A function that gets called with the info_dict of
275                        every video.
276                        If it returns a message, the video is ignored.
277                        If it returns None, the video is downloaded.
278                        match_filter_func in utils.py is one example for this.
279     no_color:          Do not emit color codes in output.
280     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
281                        HTTP header (experimental)
282     geo_bypass_country:
283                        Two-letter ISO 3166-2 country code that will be used for
284                        explicit geographic restriction bypassing via faking
285                        X-Forwarded-For HTTP header (experimental)
286
287     The following options determine which downloader is picked:
288     external_downloader: Executable of the external downloader to call.
289                        None or unset for standard (built-in) downloader.
290     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
291                        if True, otherwise use ffmpeg/avconv if False, otherwise
292                        use downloader suggested by extractor if None.
293
294     The following parameters are not used by YoutubeDL itself, they are used by
295     the downloader (see youtube_dl/downloader/common.py):
296     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
297     noresizebuffer, retries, continuedl, noprogress, consoletitle,
298     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
299
300     The following options are used by the post processors:
301     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
302                        otherwise prefer avconv.
303     postprocessor_args: A list of additional command-line arguments for the
304                         postprocessor.
305     """
306
    # Metadata fields treated as numeric when building output filenames:
    # prepare_filename leaves these unsanitized and patches the output
    # template when they are missing (integer format codes would otherwise
    # break on the 'NA' placeholder string).
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level placeholders; the real values are assigned per instance
    # in __init__.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options."""
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Screen output goes to stderr instead of stdout when logtostderr is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        def check_deprecated(param, option, suggestion):
            # Warn about a deprecated option; returns True if it was used.
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        # cn_verification_proxy is the old name for geo_verification_proxy;
        # carry the value over unless the new option was given explicitly.
        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            # Pipe screen output through an external bidi reordering tool
            # (bidiv, falling back to fribidi) via a pty.
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    # bidiv not found: try fribidi instead.
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate configured postprocessors: 'key' names the PP class,
        # all remaining entries are passed through as keyword arguments.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
418
419     def warn_if_short_id(self, argv):
420         # short YouTube ID starting with dash?
421         idxs = [
422             i for i, a in enumerate(argv)
423             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
424         if idxs:
425             correct_argv = (
426                 ['youtube-dl'] +
427                 [a for i, a in enumerate(argv) if i not in idxs] +
428                 ['--'] + [argv[i] for i in idxs]
429             )
430             self.report_warning(
431                 'Long argument string detected. '
432                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
433                 args_to_str(correct_argv))
434
435     def add_info_extractor(self, ie):
436         """Add an InfoExtractor object to the end of the list."""
437         self._ies.append(ie)
438         if not isinstance(ie, type):
439             self._ies_instances[ie.ie_key()] = ie
440             ie.set_downloader(self)
441
442     def get_info_extractor(self, ie_key):
443         """
444         Get an instance of an IE with name ie_key, it will try to get one from
445         the _ies list, if there's no instance it will create a new one and add
446         it to the extractor list.
447         """
448         ie = self._ies_instances.get(ie_key)
449         if ie is None:
450             ie = get_info_extractor(ie_key)()
451             self.add_info_extractor(ie)
452         return ie
453
454     def add_default_info_extractors(self):
455         """
456         Add the InfoExtractors returned by gen_extractors to the end of the list
457         """
458         for ie in gen_extractor_classes():
459             self.add_info_extractor(ie)
460
461     def add_post_processor(self, pp):
462         """Add a PostProcessor object to the end of the chain."""
463         self._pps.append(pp)
464         pp.set_downloader(self)
465
466     def add_progress_hook(self, ph):
467         """Add the progress hook (currently only for the file downloader)"""
468         self._progress_hooks.append(ph)
469
470     def _bidi_workaround(self, message):
471         if not hasattr(self, '_output_channel'):
472             return message
473
474         assert hasattr(self, '_output_process')
475         assert isinstance(message, compat_str)
476         line_count = message.count('\n') + 1
477         self._output_process.stdin.write((message + '\n').encode('utf-8'))
478         self._output_process.stdin.flush()
479         res = ''.join(self._output_channel.readline().decode('utf-8')
480                       for _ in range(line_count))
481         return res[:-len('\n')]
482
483     def to_screen(self, message, skip_eol=False):
484         """Print message to stdout if not in quiet mode."""
485         return self.to_stdout(message, skip_eol, check_quiet=True)
486
487     def _write_string(self, s, out=None):
488         write_string(s, out=out, encoding=self.params.get('encoding'))
489
490     def to_stdout(self, message, skip_eol=False, check_quiet=False):
491         """Print message to stdout if not in quiet mode."""
492         if self.params.get('logger'):
493             self.params['logger'].debug(message)
494         elif not check_quiet or not self.params.get('quiet', False):
495             message = self._bidi_workaround(message)
496             terminator = ['\n', ''][skip_eol]
497             output = message + terminator
498
499             self._write_string(output, self._screen_file)
500
501     def to_stderr(self, message):
502         """Print message to stderr."""
503         assert isinstance(message, compat_str)
504         if self.params.get('logger'):
505             self.params['logger'].error(message)
506         else:
507             message = self._bidi_workaround(message)
508             output = message + '\n'
509             self._write_string(output, self._err_file)
510
511     def to_console_title(self, message):
512         if not self.params.get('consoletitle', False):
513             return
514         if compat_os_name == 'nt':
515             if ctypes.windll.kernel32.GetConsoleWindow():
516                 # c_wchar_p() might not be necessary if `message` is
517                 # already of type unicode()
518                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
519         elif 'TERM' in os.environ:
520             self._write_string('\033]0;%s\007' % message, self._screen_file)
521
522     def save_console_title(self):
523         if not self.params.get('consoletitle', False):
524             return
525         if compat_os_name != 'nt' and 'TERM' in os.environ:
526             # Save the title on stack
527             self._write_string('\033[22;0t', self._screen_file)
528
529     def restore_console_title(self):
530         if not self.params.get('consoletitle', False):
531             return
532         if compat_os_name != 'nt' and 'TERM' in os.environ:
533             # Restore the title from stack
534             self._write_string('\033[23;0t', self._screen_file)
535
536     def __enter__(self):
537         self.save_console_title()
538         return self
539
540     def __exit__(self, *args):
541         self.restore_console_title()
542
543         if self.params.get('cookiefile') is not None:
544             self.cookiejar.save()
545
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Wrapped extractor exceptions carry the original failure
                    # in an exc_info attribute; print that traceback first.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: dump the current call stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Prefer the wrapped exception's exc_info (when present) so the
            # raised DownloadError points at the original failure.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1
575
576     def report_warning(self, message):
577         '''
578         Print the message to stderr, it will be prefixed with 'WARNING:'
579         If stderr is a tty file the 'WARNING:' will be colored
580         '''
581         if self.params.get('logger') is not None:
582             self.params['logger'].warning(message)
583         else:
584             if self.params.get('no_warnings'):
585                 return
586             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
587                 _msg_header = '\033[0;33mWARNING:\033[0m'
588             else:
589                 _msg_header = 'WARNING:'
590             warning_message = '%s %s' % (_msg_header, message)
591             self.to_stderr(warning_message)
592
593     def report_error(self, message, tb=None):
594         '''
595         Do the same as trouble, but prefixes the message with 'ERROR:', colored
596         in red if stderr is a tty file.
597         '''
598         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
599             _msg_header = '\033[0;31mERROR:\033[0m'
600         else:
601             _msg_header = 'ERROR:'
602         error_message = '%s %s' % (_msg_header, message)
603         self.trouble(error_message, tb)
604
605     def report_file_already_downloaded(self, file_name):
606         """Report file has already been fully downloaded."""
607         try:
608             self.to_screen('[download] %s has already been downloaded' % file_name)
609         except UnicodeEncodeError:
610             self.to_screen('[download] The file has already been downloaded')
611
612     def prepare_filename(self, info_dict):
613         """Generate the output filename."""
614         try:
615             template_dict = dict(info_dict)
616
617             template_dict['epoch'] = int(time.time())
618             autonumber_size = self.params.get('autonumber_size')
619             if autonumber_size is None:
620                 autonumber_size = 5
621             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
622             if template_dict.get('resolution') is None:
623                 if template_dict.get('width') and template_dict.get('height'):
624                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
625                 elif template_dict.get('height'):
626                     template_dict['resolution'] = '%sp' % template_dict['height']
627                 elif template_dict.get('width'):
628                     template_dict['resolution'] = '%dx?' % template_dict['width']
629
630             sanitize = lambda k, v: sanitize_filename(
631                 compat_str(v),
632                 restricted=self.params.get('restrictfilenames'),
633                 is_id=(k == 'id' or k.endswith('_id')))
634             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
635                                  for k, v in template_dict.items()
636                                  if v is not None and not isinstance(v, (list, tuple, dict)))
637             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
638
639             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
640
641             # For fields playlist_index and autonumber convert all occurrences
642             # of %(field)s to %(field)0Nd for backward compatibility
643             field_size_compat_map = {
644                 'playlist_index': len(str(template_dict['n_entries'])),
645                 'autonumber': autonumber_size,
646             }
647             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
648             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
649             if mobj:
650                 outtmpl = re.sub(
651                     FIELD_SIZE_COMPAT_RE,
652                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
653                     outtmpl)
654
655             # Missing numeric fields used together with integer presentation types
656             # in format specification will break the argument substitution since
657             # string 'NA' is returned for missing fields. We will patch output
658             # template for missing fields to meet string presentation type.
659             for numeric_field in self._NUMERIC_FIELDS:
660                 if numeric_field not in template_dict:
661                     # As of [1] format syntax is:
662                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
663                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
664                     FORMAT_RE = r'''(?x)
665                         (?<!%)
666                         %
667                         \({0}\)  # mapping key
668                         (?:[#0\-+ ]+)?  # conversion flags (optional)
669                         (?:\d+)?  # minimum field width (optional)
670                         (?:\.\d+)?  # precision (optional)
671                         [hlL]?  # length modifier (optional)
672                         [diouxXeEfFgGcrs%]  # conversion type
673                     '''
674                     outtmpl = re.sub(
675                         FORMAT_RE.format(numeric_field),
676                         r'%({0})s'.format(numeric_field), outtmpl)
677
678             # expand_path translates '%%' into '%' and '$$' into '$'
679             # correspondingly that is not what we want since we need to keep
680             # '%%' intact for template dict substitution step. Working around
681             # with boundary-alike separator hack.
682             sep = ''.join([random.choice(string.ascii_letters) for _ in range(32)])
683             outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
684
685             # outtmpl should be expand_path'ed before template dict substitution
686             # because meta fields may contain env variables we don't want to
687             # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
688             # title "Hello $PATH", we don't want `$PATH` to be expanded.
689             filename = expand_path(outtmpl).replace(sep, '') % template_dict
690
691             # Temporary fix for #4787
692             # 'Treat' all problem characters by passing filename through preferredencoding
693             # to workaround encoding issues with subprocess on python2 @ Windows
694             if sys.version_info < (3, 0) and sys.platform == 'win32':
695                 filename = encodeFilename(filename, True).decode(preferredencoding())
696             return sanitize_path(filename)
697         except ValueError as err:
698             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
699             return None
700
701     def _match_entry(self, info_dict, incomplete):
702         """ Returns None iff the file should be downloaded """
703
704         video_title = info_dict.get('title', info_dict.get('id', 'video'))
705         if 'title' in info_dict:
706             # This can happen when we're just evaluating the playlist
707             title = info_dict['title']
708             matchtitle = self.params.get('matchtitle', False)
709             if matchtitle:
710                 if not re.search(matchtitle, title, re.IGNORECASE):
711                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
712             rejecttitle = self.params.get('rejecttitle', False)
713             if rejecttitle:
714                 if re.search(rejecttitle, title, re.IGNORECASE):
715                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
716         date = info_dict.get('upload_date')
717         if date is not None:
718             dateRange = self.params.get('daterange', DateRange())
719             if date not in dateRange:
720                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
721         view_count = info_dict.get('view_count')
722         if view_count is not None:
723             min_views = self.params.get('min_views')
724             if min_views is not None and view_count < min_views:
725                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
726             max_views = self.params.get('max_views')
727             if max_views is not None and view_count > max_views:
728                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
729         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
730             return 'Skipping "%s" because it is age restricted' % video_title
731         if self.in_download_archive(info_dict):
732             return '%s has already been recorded in archive' % video_title
733
734         if not incomplete:
735             match_filter = self.params.get('match_filter')
736             if match_filter is not None:
737                 ret = match_filter(info_dict)
738                 if ret is not None:
739                     return ret
740
741         return None
742
743     @staticmethod
744     def add_extra_info(info_dict, extra_info):
745         '''Set the keys from extra_info in info dict if they are missing'''
746         for key, value in extra_info.items():
747             info_dict.setdefault(key, value)
748
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True, force_generic_extractor=False):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result

        ie_key optionally restricts extraction to a single extractor;
        process=False returns the raw extractor result without resolving
        playlists/URL references via process_ie_result.
        NOTE: the mutable default for extra_info is only read, never mutated,
        by the visible call chain (add_extra_info copies out of it).
        '''

        # --force-generic-extractor maps to pinning the 'Generic' extractor.
        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        if ie_key:
            # An explicit extractor was requested: consider only that one.
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        # First extractor that declares the URL suitable wins.
        for ie in ies:
            if not ie.suitable(url):
                continue

            # Re-fetch the extractor by key — presumably to get the instance
            # registered with this downloader; TODO confirm get_info_extractor
            # semantics.
            ie = self.get_info_extractor(ie.ie_key())
            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                self.add_default_extra_info(ie_result, ie, url)
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except GeoRestrictedError as e:
                # Report availability countries (if known) and a VPN hint,
                # then stop trying further extractors.
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
                self.report_error(msg)
                break
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
                break
            except MaxDownloadsReached:
                # Must propagate so the top-level download loop terminates.
                raise
            except Exception as e:
                # Unexpected failure: swallow only under --ignore-errors.
                if self.params.get('ignoreerrors', False):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # for/else: reached only when no extractor found the URL suitable.
            self.report_error('no suitable InfoExtractor for URL %s' % url)
810
811     def add_default_extra_info(self, ie_result, ie, url):
812         self.add_extra_info(ie_result, {
813             'extractor': ie.IE_NAME,
814             'webpage_url': url,
815             'webpage_url_basename': url_basename(url),
816             'extractor_key': ie.ie_key(),
817         })
818
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        # Dispatch on the extractor-reported result type; plain video dicts
        # carry no '_type' key, hence the 'video' default.
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            extract_flat = self.params.get('extract_flat', False)
            # With extract_flat == 'in_playlist' only entries coming from a
            # playlist stop here unresolved; extract_flat is True stops always.
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # extract_info may return None when ignoreerrors is enabled and
            # extraction failed with an error, don't crash and return early
            # in this case
            if not info:
                return info

            # Non-None metadata from the embedding page overrides what the
            # inner extraction produced — except the routing fields below,
            # which must come from the inner result.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Extracted info may not be a video result (i.e.
            # info.get('_type', 'video') != video) but rather an url or
            # url_transparent. In such cases outer metadata (from ie_result)
            # should be propagated to inner one (info). For this to happen
            # _type of info should be overridden with url_transparent. This
            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
            if new_result.get('_type') == 'url':
                new_result['_type'] = 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type in ('playlist', 'multi_video'):
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # --playlist-start is 1-based on the command line; convert to a
            # 0-based slice index.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                def iter_playlistitems(format):
                    # Expand a "1-3,7" style spec into individual 1-based
                    # indices, lazily.
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            ie_entries = ie_result['entries']
            if isinstance(ie_entries, list):
                # Fully materialized playlist: pick or slice directly.
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = [
                        ie_entries[i - 1] for i in playlistitems
                        if -n_all_entries <= i - 1 < n_all_entries]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                # Paged playlist: fetch only the requested one-item slices.
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                # Generic iterable/generator: materialize only what's needed.
                if playlistitems:
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            if self.params.get('playlistrandom', False):
                random.shuffle(entries)

            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # This __x_forwarded_for_ip thing is a bit ugly but requires
                # minimal changes
                if x_forwarded_for:
                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                # Per-entry metadata inherited by each resolved video.
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: the user-supplied match_filter (which
                # needs full metadata) is not applied at this stage.
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                # Backfill extractor/webpage metadata on each legacy entry.
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
1005
1006     def _build_format_filter(self, filter_spec):
1007         " Returns a function to filter the formats according to the filter_spec "
1008
1009         OPERATORS = {
1010             '<': operator.lt,
1011             '<=': operator.le,
1012             '>': operator.gt,
1013             '>=': operator.ge,
1014             '=': operator.eq,
1015             '!=': operator.ne,
1016         }
1017         operator_rex = re.compile(r'''(?x)\s*
1018             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
1019             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1020             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1021             $
1022             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1023         m = operator_rex.search(filter_spec)
1024         if m:
1025             try:
1026                 comparison_value = int(m.group('value'))
1027             except ValueError:
1028                 comparison_value = parse_filesize(m.group('value'))
1029                 if comparison_value is None:
1030                     comparison_value = parse_filesize(m.group('value') + 'B')
1031                 if comparison_value is None:
1032                     raise ValueError(
1033                         'Invalid value %r in format specification %r' % (
1034                             m.group('value'), filter_spec))
1035             op = OPERATORS[m.group('op')]
1036
1037         if not m:
1038             STR_OPERATORS = {
1039                 '=': operator.eq,
1040                 '!=': operator.ne,
1041                 '^=': lambda attr, value: attr.startswith(value),
1042                 '$=': lambda attr, value: attr.endswith(value),
1043                 '*=': lambda attr, value: value in attr,
1044             }
1045             str_operator_rex = re.compile(r'''(?x)
1046                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1047                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1048                 \s*(?P<value>[a-zA-Z0-9._-]+)
1049                 \s*$
1050                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1051             m = str_operator_rex.search(filter_spec)
1052             if m:
1053                 comparison_value = m.group('value')
1054                 op = STR_OPERATORS[m.group('op')]
1055
1056         if not m:
1057             raise ValueError('Invalid filter specification %r' % filter_spec)
1058
1059         def _filter(f):
1060             actual_value = f.get(m.group('key'))
1061             if actual_value is None:
1062                 return m.group('none_inclusive')
1063             return op(actual_value, comparison_value)
1064         return _filter
1065
1066     def build_format_selector(self, format_spec):
1067         def syntax_error(note, start):
1068             message = (
1069                 'Invalid format specification: '
1070                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1071             return SyntaxError(message)
1072
1073         PICKFIRST = 'PICKFIRST'
1074         MERGE = 'MERGE'
1075         SINGLE = 'SINGLE'
1076         GROUP = 'GROUP'
1077         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1078
1079         def _parse_filter(tokens):
1080             filter_parts = []
1081             for type, string, start, _, _ in tokens:
1082                 if type == tokenize.OP and string == ']':
1083                     return ''.join(filter_parts)
1084                 else:
1085                     filter_parts.append(string)
1086
1087         def _remove_unused_ops(tokens):
1088             # Remove operators that we don't use and join them with the surrounding strings
1089             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1090             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1091             last_string, last_start, last_end, last_line = None, None, None, None
1092             for type, string, start, end, line in tokens:
1093                 if type == tokenize.OP and string == '[':
1094                     if last_string:
1095                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1096                         last_string = None
1097                     yield type, string, start, end, line
1098                     # everything inside brackets will be handled by _parse_filter
1099                     for type, string, start, end, line in tokens:
1100                         yield type, string, start, end, line
1101                         if type == tokenize.OP and string == ']':
1102                             break
1103                 elif type == tokenize.OP and string in ALLOWED_OPS:
1104                     if last_string:
1105                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1106                         last_string = None
1107                     yield type, string, start, end, line
1108                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1109                     if not last_string:
1110                         last_string = string
1111                         last_start = start
1112                         last_end = end
1113                     else:
1114                         last_string += string
1115             if last_string:
1116                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1117
1118         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1119             selectors = []
1120             current_selector = None
1121             for type, string, start, _, _ in tokens:
1122                 # ENCODING is only defined in python 3.x
1123                 if type == getattr(tokenize, 'ENCODING', None):
1124                     continue
1125                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1126                     current_selector = FormatSelector(SINGLE, string, [])
1127                 elif type == tokenize.OP:
1128                     if string == ')':
1129                         if not inside_group:
1130                             # ')' will be handled by the parentheses group
1131                             tokens.restore_last_token()
1132                         break
1133                     elif inside_merge and string in ['/', ',']:
1134                         tokens.restore_last_token()
1135                         break
1136                     elif inside_choice and string == ',':
1137                         tokens.restore_last_token()
1138                         break
1139                     elif string == ',':
1140                         if not current_selector:
1141                             raise syntax_error('"," must follow a format selector', start)
1142                         selectors.append(current_selector)
1143                         current_selector = None
1144                     elif string == '/':
1145                         if not current_selector:
1146                             raise syntax_error('"/" must follow a format selector', start)
1147                         first_choice = current_selector
1148                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1149                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1150                     elif string == '[':
1151                         if not current_selector:
1152                             current_selector = FormatSelector(SINGLE, 'best', [])
1153                         format_filter = _parse_filter(tokens)
1154                         current_selector.filters.append(format_filter)
1155                     elif string == '(':
1156                         if current_selector:
1157                             raise syntax_error('Unexpected "("', start)
1158                         group = _parse_format_selection(tokens, inside_group=True)
1159                         current_selector = FormatSelector(GROUP, group, [])
1160                     elif string == '+':
1161                         video_selector = current_selector
1162                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1163                         if not video_selector or not audio_selector:
1164                             raise syntax_error('"+" must be between two format selectors', start)
1165                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1166                     else:
1167                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1168                 elif type == tokenize.ENDMARKER:
1169                     break
1170             if current_selector:
1171                 selectors.append(current_selector)
1172             return selectors
1173
1174         def _build_selector_function(selector):
1175             if isinstance(selector, list):
1176                 fs = [_build_selector_function(s) for s in selector]
1177
1178                 def selector_function(ctx):
1179                     for f in fs:
1180                         for format in f(ctx):
1181                             yield format
1182                 return selector_function
1183             elif selector.type == GROUP:
1184                 selector_function = _build_selector_function(selector.selector)
1185             elif selector.type == PICKFIRST:
1186                 fs = [_build_selector_function(s) for s in selector.selector]
1187
1188                 def selector_function(ctx):
1189                     for f in fs:
1190                         picked_formats = list(f(ctx))
1191                         if picked_formats:
1192                             return picked_formats
1193                     return []
1194             elif selector.type == SINGLE:
1195                 format_spec = selector.selector
1196
1197                 def selector_function(ctx):
1198                     formats = list(ctx['formats'])
1199                     if not formats:
1200                         return
1201                     if format_spec == 'all':
1202                         for f in formats:
1203                             yield f
1204                     elif format_spec in ['best', 'worst', None]:
1205                         format_idx = 0 if format_spec == 'worst' else -1
1206                         audiovideo_formats = [
1207                             f for f in formats
1208                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1209                         if audiovideo_formats:
1210                             yield audiovideo_formats[format_idx]
1211                         # for extractors with incomplete formats (audio only (soundcloud)
1212                         # or video only (imgur)) we will fallback to best/worst
1213                         # {video,audio}-only format
1214                         elif ctx['incomplete_formats']:
1215                             yield formats[format_idx]
1216                     elif format_spec == 'bestaudio':
1217                         audio_formats = [
1218                             f for f in formats
1219                             if f.get('vcodec') == 'none']
1220                         if audio_formats:
1221                             yield audio_formats[-1]
1222                     elif format_spec == 'worstaudio':
1223                         audio_formats = [
1224                             f for f in formats
1225                             if f.get('vcodec') == 'none']
1226                         if audio_formats:
1227                             yield audio_formats[0]
1228                     elif format_spec == 'bestvideo':
1229                         video_formats = [
1230                             f for f in formats
1231                             if f.get('acodec') == 'none']
1232                         if video_formats:
1233                             yield video_formats[-1]
1234                     elif format_spec == 'worstvideo':
1235                         video_formats = [
1236                             f for f in formats
1237                             if f.get('acodec') == 'none']
1238                         if video_formats:
1239                             yield video_formats[0]
1240                     else:
1241                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1242                         if format_spec in extensions:
1243                             filter_f = lambda f: f['ext'] == format_spec
1244                         else:
1245                             filter_f = lambda f: f['format_id'] == format_spec
1246                         matches = list(filter(filter_f, formats))
1247                         if matches:
1248                             yield matches[-1]
1249             elif selector.type == MERGE:
1250                 def _merge(formats_info):
1251                     format_1, format_2 = [f['format_id'] for f in formats_info]
1252                     # The first format must contain the video and the
1253                     # second the audio
1254                     if formats_info[0].get('vcodec') == 'none':
1255                         self.report_error('The first format must '
1256                                           'contain the video, try using '
1257                                           '"-f %s+%s"' % (format_2, format_1))
1258                         return
1259                     # Formats must be opposite (video+audio)
1260                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1261                         self.report_error(
1262                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1263                             % (format_1, format_2))
1264                         return
1265                     output_ext = (
1266                         formats_info[0]['ext']
1267                         if self.params.get('merge_output_format') is None
1268                         else self.params['merge_output_format'])
1269                     return {
1270                         'requested_formats': formats_info,
1271                         'format': '%s+%s' % (formats_info[0].get('format'),
1272                                              formats_info[1].get('format')),
1273                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1274                                                 formats_info[1].get('format_id')),
1275                         'width': formats_info[0].get('width'),
1276                         'height': formats_info[0].get('height'),
1277                         'resolution': formats_info[0].get('resolution'),
1278                         'fps': formats_info[0].get('fps'),
1279                         'vcodec': formats_info[0].get('vcodec'),
1280                         'vbr': formats_info[0].get('vbr'),
1281                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1282                         'acodec': formats_info[1].get('acodec'),
1283                         'abr': formats_info[1].get('abr'),
1284                         'ext': output_ext,
1285                     }
1286                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1287
1288                 def selector_function(ctx):
1289                     for pair in itertools.product(
1290                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1291                         yield _merge(pair)
1292
1293             filters = [self._build_format_filter(f) for f in selector.filters]
1294
1295             def final_selector(ctx):
1296                 ctx_copy = copy.deepcopy(ctx)
1297                 for _filter in filters:
1298                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1299                 return selector_function(ctx_copy)
1300             return final_selector
1301
1302         stream = io.BytesIO(format_spec.encode('utf-8'))
1303         try:
1304             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1305         except tokenize.TokenError:
1306             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1307
1308         class TokenIterator(object):
1309             def __init__(self, tokens):
1310                 self.tokens = tokens
1311                 self.counter = 0
1312
1313             def __iter__(self):
1314                 return self
1315
1316             def __next__(self):
1317                 if self.counter >= len(self.tokens):
1318                     raise StopIteration()
1319                 value = self.tokens[self.counter]
1320                 self.counter += 1
1321                 return value
1322
1323             next = __next__
1324
1325             def restore_last_token(self):
1326                 self.counter -= 1
1327
1328         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1329         return _build_selector_function(parsed_selector)
1330
1331     def _calc_headers(self, info_dict):
1332         res = std_headers.copy()
1333
1334         add_headers = info_dict.get('http_headers')
1335         if add_headers:
1336             res.update(add_headers)
1337
1338         cookies = self._calc_cookies(info_dict)
1339         if cookies:
1340             res['Cookie'] = cookies
1341
1342         if 'X-Forwarded-For' not in res:
1343             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1344             if x_forwarded_for_ip:
1345                 res['X-Forwarded-For'] = x_forwarded_for_ip
1346
1347         return res
1348
1349     def _calc_cookies(self, info_dict):
1350         pr = sanitized_Request(info_dict['url'])
1351         self.cookiejar.add_cookie_header(pr)
1352         return pr.get_header('Cookie')
1353
    def process_video_result(self, info_dict, download=True):
        """Sanitize an extracted video info_dict and select formats to download.

        Validates mandatory fields ('id', 'title'), coerces malformed
        string/numeric fields, normalizes thumbnails, subtitles and formats,
        then applies the format selector built from the 'format' option.
        When download is True every selected format is handed to
        process_info(). Returns info_dict updated with the last selected
        format (backwards compatibility), or None early when only listing
        thumbnails, subtitles or formats was requested.
        """
        assert info_dict.get('_type', 'video') == 'video'

        # 'id' and 'title' are mandatory in every extractor result
        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        def report_force_conversion(field, field_not, conversion):
            # Warn that an extractor returned a field with the wrong type;
            # the value is converted anyway for robustness
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a non-string field (e.g. a numeric id) to compat_str in place
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce every known numeric field (self._NUMERIC_FIELDS) to int in place
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Normalize thumbnails: promote a single 'thumbnail' into a one-entry
        # 'thumbnails' list, then sort worst-to-best so the last entry is the
        # preferred one, sanitize URLs and fill in resolution/id
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '', t.get('url')))
            for i, t in enumerate(thumbnails):
                t['url'] = sanitize_url(t['url'])
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # Thumbnails are sorted worst-to-best above, so take the last one
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
            # see http://bugs.python.org/issue1646728)
            try:
                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
            except (ValueError, OverflowError, OSError):
                pass

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Normalize subtitle entries: sanitize URLs and derive a missing 'ext'
        # from the subtitle URL
        subtitles = info_dict.get('subtitles')
        if subtitles:
            for _, subtitle in subtitles.items():
                for subtitle_format in subtitle:
                    if subtitle_format.get('url'):
                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                    if subtitle_format.get('ext') is None:
                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles,
            info_dict.get('automatic_captions'))

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        def is_wellformed(f):
            # A format is usable only if it carries a non-empty string 'url'
            url = f.get('url')
            valid_url = url and isinstance(url, compat_str)
            if not valid_url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
            return valid_url

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats))

        # Maps each format_id to the list of formats claiming it, so that
        # duplicates can be disambiguated below
        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if format.get('format_id') is None:
                # Fall back to the positional index as the format_id
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            # Build a human-readable 'format' description when missing
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            # Default format selection: prefer merged bestvideo+bestaudio when
            # output is a real file (not stdout/live) and ffmpeg/avconv can
            # merge, falling back to single 'best'
            req_format_list = []
            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
                    not info_dict.get('is_live')):
                merger = FFmpegMergerPP(self)
                if merger.available and merger.can_merge():
                    req_format_list.append('bestvideo+bestaudio')
            req_format_list.append('best')
            req_format = '/'.join(req_format_list)
        format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/rg3/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/rg3/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
            # all formats are audio-only
            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                # Each selected format is processed with its own merged copy of
                # the info dict
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1587
1588     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1589         """Select the requested subtitles and their format"""
1590         available_subs = {}
1591         if normal_subtitles and self.params.get('writesubtitles'):
1592             available_subs.update(normal_subtitles)
1593         if automatic_captions and self.params.get('writeautomaticsub'):
1594             for lang, cap_info in automatic_captions.items():
1595                 if lang not in available_subs:
1596                     available_subs[lang] = cap_info
1597
1598         if (not self.params.get('writesubtitles') and not
1599                 self.params.get('writeautomaticsub') or not
1600                 available_subs):
1601             return None
1602
1603         if self.params.get('allsubtitles', False):
1604             requested_langs = available_subs.keys()
1605         else:
1606             if self.params.get('subtitleslangs', False):
1607                 requested_langs = self.params.get('subtitleslangs')
1608             elif 'en' in available_subs:
1609                 requested_langs = ['en']
1610             else:
1611                 requested_langs = [list(available_subs.keys())[0]]
1612
1613         formats_query = self.params.get('subtitlesformat', 'best')
1614         formats_preference = formats_query.split('/') if formats_query else []
1615         subs = {}
1616         for lang in requested_langs:
1617             formats = available_subs.get(lang)
1618             if formats is None:
1619                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1620                 continue
1621             for ext in formats_preference:
1622                 if ext == 'best':
1623                     f = formats[-1]
1624                     break
1625                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1626                 if matches:
1627                     f = matches[-1]
1628                     break
1629             else:
1630                 f = formats[-1]
1631                 self.report_warning(
1632                     'No subtitle format found matching "%s" for language %s, '
1633                     'using %s' % (formats_query, lang, f['ext']))
1634             subs[lang] = f
1635         return subs
1636
1637     def process_info(self, info_dict):
1638         """Process a single resolved IE result."""
1639
1640         assert info_dict.get('_type', 'video') == 'video'
1641
1642         max_downloads = self.params.get('max_downloads')
1643         if max_downloads is not None:
1644             if self._num_downloads >= int(max_downloads):
1645                 raise MaxDownloadsReached()
1646
1647         info_dict['fulltitle'] = info_dict['title']
1648         if len(info_dict['title']) > 200:
1649             info_dict['title'] = info_dict['title'][:197] + '...'
1650
1651         if 'format' not in info_dict:
1652             info_dict['format'] = info_dict['ext']
1653
1654         reason = self._match_entry(info_dict, incomplete=False)
1655         if reason is not None:
1656             self.to_screen('[download] ' + reason)
1657             return
1658
1659         self._num_downloads += 1
1660
1661         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1662
1663         # Forced printings
1664         if self.params.get('forcetitle', False):
1665             self.to_stdout(info_dict['fulltitle'])
1666         if self.params.get('forceid', False):
1667             self.to_stdout(info_dict['id'])
1668         if self.params.get('forceurl', False):
1669             if info_dict.get('requested_formats') is not None:
1670                 for f in info_dict['requested_formats']:
1671                     self.to_stdout(f['url'] + f.get('play_path', ''))
1672             else:
1673                 # For RTMP URLs, also include the playpath
1674                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1675         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1676             self.to_stdout(info_dict['thumbnail'])
1677         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1678             self.to_stdout(info_dict['description'])
1679         if self.params.get('forcefilename', False) and filename is not None:
1680             self.to_stdout(filename)
1681         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1682             self.to_stdout(formatSeconds(info_dict['duration']))
1683         if self.params.get('forceformat', False):
1684             self.to_stdout(info_dict['format'])
1685         if self.params.get('forcejson', False):
1686             self.to_stdout(json.dumps(info_dict))
1687
1688         # Do nothing else if in simulate mode
1689         if self.params.get('simulate', False):
1690             return
1691
1692         if filename is None:
1693             return
1694
1695         try:
1696             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1697             if dn and not os.path.exists(dn):
1698                 os.makedirs(dn)
1699         except (OSError, IOError) as err:
1700             self.report_error('unable to create directory ' + error_to_compat_str(err))
1701             return
1702
1703         if self.params.get('writedescription', False):
1704             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1705             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1706                 self.to_screen('[info] Video description is already present')
1707             elif info_dict.get('description') is None:
1708                 self.report_warning('There\'s no description to write.')
1709             else:
1710                 try:
1711                     self.to_screen('[info] Writing video description to: ' + descfn)
1712                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1713                         descfile.write(info_dict['description'])
1714                 except (OSError, IOError):
1715                     self.report_error('Cannot write description file ' + descfn)
1716                     return
1717
1718         if self.params.get('writeannotations', False):
1719             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1720             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1721                 self.to_screen('[info] Video annotations are already present')
1722             else:
1723                 try:
1724                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1725                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1726                         annofile.write(info_dict['annotations'])
1727                 except (KeyError, TypeError):
1728                     self.report_warning('There are no annotations to write.')
1729                 except (OSError, IOError):
1730                     self.report_error('Cannot write annotations file: ' + annofn)
1731                     return
1732
1733         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1734                                        self.params.get('writeautomaticsub')])
1735
1736         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1737             # subtitles download errors are already managed as troubles in relevant IE
1738             # that way it will silently go on when used with unsupporting IE
1739             subtitles = info_dict['requested_subtitles']
1740             ie = self.get_info_extractor(info_dict['extractor_key'])
1741             for sub_lang, sub_info in subtitles.items():
1742                 sub_format = sub_info['ext']
1743                 if sub_info.get('data') is not None:
1744                     sub_data = sub_info['data']
1745                 else:
1746                     try:
1747                         sub_data = ie._download_webpage(
1748                             sub_info['url'], info_dict['id'], note=False)
1749                     except ExtractorError as err:
1750                         self.report_warning('Unable to download subtitle for "%s": %s' %
1751                                             (sub_lang, error_to_compat_str(err.cause)))
1752                         continue
1753                 try:
1754                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1755                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1756                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1757                     else:
1758                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1759                         # Use newline='' to prevent conversion of newline characters
1760                         # See https://github.com/rg3/youtube-dl/issues/10268
1761                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1762                             subfile.write(sub_data)
1763                 except (OSError, IOError):
1764                     self.report_error('Cannot write subtitles file ' + sub_filename)
1765                     return
1766
1767         if self.params.get('writeinfojson', False):
1768             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1769             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1770                 self.to_screen('[info] Video description metadata is already present')
1771             else:
1772                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1773                 try:
1774                     write_json_file(self.filter_requested_info(info_dict), infofn)
1775                 except (OSError, IOError):
1776                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1777                     return
1778
1779         self._write_thumbnails(info_dict, filename)
1780
1781         if not self.params.get('skip_download', False):
1782             try:
1783                 def dl(name, info):
1784                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1785                     for ph in self._progress_hooks:
1786                         fd.add_progress_hook(ph)
1787                     if self.params.get('verbose'):
1788                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1789                     return fd.download(name, info)
1790
1791                 if info_dict.get('requested_formats') is not None:
1792                     downloaded = []
1793                     success = True
1794                     merger = FFmpegMergerPP(self)
1795                     if not merger.available:
1796                         postprocessors = []
1797                         self.report_warning('You have requested multiple '
1798                                             'formats but ffmpeg or avconv are not installed.'
1799                                             ' The formats won\'t be merged.')
1800                     else:
1801                         postprocessors = [merger]
1802
1803                     def compatible_formats(formats):
1804                         video, audio = formats
1805                         # Check extension
1806                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1807                         if video_ext and audio_ext:
1808                             COMPATIBLE_EXTS = (
1809                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1810                                 ('webm')
1811                             )
1812                             for exts in COMPATIBLE_EXTS:
1813                                 if video_ext in exts and audio_ext in exts:
1814                                     return True
1815                         # TODO: Check acodec/vcodec
1816                         return False
1817
1818                     filename_real_ext = os.path.splitext(filename)[1][1:]
1819                     filename_wo_ext = (
1820                         os.path.splitext(filename)[0]
1821                         if filename_real_ext == info_dict['ext']
1822                         else filename)
1823                     requested_formats = info_dict['requested_formats']
1824                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1825                         info_dict['ext'] = 'mkv'
1826                         self.report_warning(
1827                             'Requested formats are incompatible for merge and will be merged into mkv.')
1828                     # Ensure filename always has a correct extension for successful merge
1829                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1830                     if os.path.exists(encodeFilename(filename)):
1831                         self.to_screen(
1832                             '[download] %s has already been downloaded and '
1833                             'merged' % filename)
1834                     else:
1835                         for f in requested_formats:
1836                             new_info = dict(info_dict)
1837                             new_info.update(f)
1838                             fname = self.prepare_filename(new_info)
1839                             fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1840                             downloaded.append(fname)
1841                             partial_success = dl(fname, new_info)
1842                             success = success and partial_success
1843                         info_dict['__postprocessors'] = postprocessors
1844                         info_dict['__files_to_merge'] = downloaded
1845                 else:
1846                     # Just a single file
1847                     success = dl(filename, info_dict)
1848             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1849                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1850                 return
1851             except (OSError, IOError) as err:
1852                 raise UnavailableVideoError(err)
1853             except (ContentTooShortError, ) as err:
1854                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1855                 return
1856
1857             if success and filename != '-':
1858                 # Fixup content
1859                 fixup_policy = self.params.get('fixup')
1860                 if fixup_policy is None:
1861                     fixup_policy = 'detect_or_warn'
1862
1863                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1864
1865                 stretched_ratio = info_dict.get('stretched_ratio')
1866                 if stretched_ratio is not None and stretched_ratio != 1:
1867                     if fixup_policy == 'warn':
1868                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1869                             info_dict['id'], stretched_ratio))
1870                     elif fixup_policy == 'detect_or_warn':
1871                         stretched_pp = FFmpegFixupStretchedPP(self)
1872                         if stretched_pp.available:
1873                             info_dict.setdefault('__postprocessors', [])
1874                             info_dict['__postprocessors'].append(stretched_pp)
1875                         else:
1876                             self.report_warning(
1877                                 '%s: Non-uniform pixel ratio (%s). %s'
1878                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1879                     else:
1880                         assert fixup_policy in ('ignore', 'never')
1881
1882                 if (info_dict.get('requested_formats') is None and
1883                         info_dict.get('container') == 'm4a_dash'):
1884                     if fixup_policy == 'warn':
1885                         self.report_warning(
1886                             '%s: writing DASH m4a. '
1887                             'Only some players support this container.'
1888                             % info_dict['id'])
1889                     elif fixup_policy == 'detect_or_warn':
1890                         fixup_pp = FFmpegFixupM4aPP(self)
1891                         if fixup_pp.available:
1892                             info_dict.setdefault('__postprocessors', [])
1893                             info_dict['__postprocessors'].append(fixup_pp)
1894                         else:
1895                             self.report_warning(
1896                                 '%s: writing DASH m4a. '
1897                                 'Only some players support this container. %s'
1898                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1899                     else:
1900                         assert fixup_policy in ('ignore', 'never')
1901
1902                 if (info_dict.get('protocol') == 'm3u8_native' or
1903                         info_dict.get('protocol') == 'm3u8' and
1904                         self.params.get('hls_prefer_native')):
1905                     if fixup_policy == 'warn':
1906                         self.report_warning('%s: malformed AAC bitstream detected.' % (
1907                             info_dict['id']))
1908                     elif fixup_policy == 'detect_or_warn':
1909                         fixup_pp = FFmpegFixupM3u8PP(self)
1910                         if fixup_pp.available:
1911                             info_dict.setdefault('__postprocessors', [])
1912                             info_dict['__postprocessors'].append(fixup_pp)
1913                         else:
1914                             self.report_warning(
1915                                 '%s: malformed AAC bitstream detected. %s'
1916                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1917                     else:
1918                         assert fixup_policy in ('ignore', 'never')
1919
1920                 try:
1921                     self.post_process(filename, info_dict)
1922                 except (PostProcessingError) as err:
1923                     self.report_error('postprocessing: %s' % str(err))
1924                     return
1925                 self.record_download_archive(info_dict)
1926
1927     def download(self, url_list):
1928         """Download a given list of URLs."""
1929         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1930         if (len(url_list) > 1 and
1931                 outtmpl != '-' and
1932                 '%' not in outtmpl and
1933                 self.params.get('max_downloads') != 1):
1934             raise SameFileError(outtmpl)
1935
1936         for url in url_list:
1937             try:
1938                 # It also downloads the videos
1939                 res = self.extract_info(
1940                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1941             except UnavailableVideoError:
1942                 self.report_error('unable to download video')
1943             except MaxDownloadsReached:
1944                 self.to_screen('[info] Maximum number of downloaded files reached.')
1945                 raise
1946             else:
1947                 if self.params.get('dump_single_json', False):
1948                     self.to_stdout(json.dumps(res))
1949
1950         return self._download_retcode
1951
1952     def download_with_info_file(self, info_filename):
1953         with contextlib.closing(fileinput.FileInput(
1954                 [info_filename], mode='r',
1955                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1956             # FileInput doesn't have a read method, we can't call json.load
1957             info = self.filter_requested_info(json.loads('\n'.join(f)))
1958         try:
1959             self.process_ie_result(info, download=True)
1960         except DownloadError:
1961             webpage_url = info.get('webpage_url')
1962             if webpage_url is not None:
1963                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1964                 return self.download([webpage_url])
1965             else:
1966                 raise
1967         return self._download_retcode
1968
1969     @staticmethod
1970     def filter_requested_info(info_dict):
1971         return dict(
1972             (k, v) for k, v in info_dict.items()
1973             if k not in ['requested_formats', 'requested_subtitles'])
1974
1975     def post_process(self, filename, ie_info):
1976         """Run all the postprocessors on the given file."""
1977         info = dict(ie_info)
1978         info['filepath'] = filename
1979         pps_chain = []
1980         if ie_info.get('__postprocessors') is not None:
1981             pps_chain.extend(ie_info['__postprocessors'])
1982         pps_chain.extend(self._pps)
1983         for pp in pps_chain:
1984             files_to_delete = []
1985             try:
1986                 files_to_delete, info = pp.run(info)
1987             except PostProcessingError as e:
1988                 self.report_error(e.msg)
1989             if files_to_delete and not self.params.get('keepvideo', False):
1990                 for old_filename in files_to_delete:
1991                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1992                     try:
1993                         os.remove(encodeFilename(old_filename))
1994                     except (IOError, OSError):
1995                         self.report_warning('Unable to remove downloaded original file')
1996
1997     def _make_archive_id(self, info_dict):
1998         # Future-proof against any change in case
1999         # and backwards compatibility with prior versions
2000         extractor = info_dict.get('extractor_key')
2001         if extractor is None:
2002             if 'id' in info_dict:
2003                 extractor = info_dict.get('ie_key')  # key in a playlist
2004         if extractor is None:
2005             return None  # Incomplete video information
2006         return extractor.lower() + ' ' + info_dict['id']
2007
2008     def in_download_archive(self, info_dict):
2009         fn = self.params.get('download_archive')
2010         if fn is None:
2011             return False
2012
2013         vid_id = self._make_archive_id(info_dict)
2014         if vid_id is None:
2015             return False  # Incomplete video information
2016
2017         try:
2018             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
2019                 for line in archive_file:
2020                     if line.strip() == vid_id:
2021                         return True
2022         except IOError as ioe:
2023             if ioe.errno != errno.ENOENT:
2024                 raise
2025         return False
2026
2027     def record_download_archive(self, info_dict):
2028         fn = self.params.get('download_archive')
2029         if fn is None:
2030             return
2031         vid_id = self._make_archive_id(info_dict)
2032         assert vid_id
2033         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2034             archive_file.write(vid_id + '\n')
2035
2036     @staticmethod
2037     def format_resolution(format, default='unknown'):
2038         if format.get('vcodec') == 'none':
2039             return 'audio only'
2040         if format.get('resolution') is not None:
2041             return format['resolution']
2042         if format.get('height') is not None:
2043             if format.get('width') is not None:
2044                 res = '%sx%s' % (format['width'], format['height'])
2045             else:
2046                 res = '%sp' % format['height']
2047         elif format.get('width') is not None:
2048             res = '%dx?' % format['width']
2049         else:
2050             res = default
2051         return res
2052
    def _format_note(self, fdict):
        """Build the free-form 'note' column shown by list_formats.

        Pieces (language, bitrates, container, codecs, fps, sample rate,
        filesize) are concatenated in a fixed order; the separator used
        (', ', ' ', '@' or none) depends on what was already emitted, so
        the branch order below is significant.
        """
        res = ''
        # f4f/f4m (Adobe HDS) downloads are not supported
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        # Video codec, optionally followed by '@<vbr>k' when a video
        # bitrate is known
        if (fdict.get('vcodec') is not None and
                fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            if fdict.get('vbr') is not None:
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # No codec name known, but separate video/audio bitrates exist
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        # Audio codec; 'none' means a video-only format
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        # Exact filesize preferred; approximate one is prefixed with '~'
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
2108
2109     def list_formats(self, info_dict):
2110         formats = info_dict.get('formats', [info_dict])
2111         table = [
2112             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2113             for f in formats
2114             if f.get('preference') is None or f['preference'] >= -1000]
2115         if len(formats) > 1:
2116             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2117
2118         header_line = ['format code', 'extension', 'resolution', 'note']
2119         self.to_screen(
2120             '[info] Available formats for %s:\n%s' %
2121             (info_dict['id'], render_table(header_line, table)))
2122
2123     def list_thumbnails(self, info_dict):
2124         thumbnails = info_dict.get('thumbnails')
2125         if not thumbnails:
2126             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2127             return
2128
2129         self.to_screen(
2130             '[info] Thumbnails for %s:' % info_dict['id'])
2131         self.to_screen(render_table(
2132             ['ID', 'width', 'height', 'URL'],
2133             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2134
2135     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2136         if not subtitles:
2137             self.to_screen('%s has no %s' % (video_id, name))
2138             return
2139         self.to_screen(
2140             'Available %s for %s:' % (name, video_id))
2141         self.to_screen(render_table(
2142             ['Language', 'formats'],
2143             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2144                 for lang, formats in subtitles.items()]))
2145
2146     def urlopen(self, req):
2147         """ Start an HTTP download """
2148         if isinstance(req, compat_basestring):
2149             req = sanitized_Request(req)
2150         return self._opener.open(req, timeout=self._socket_timeout)
2151
    def print_debug_header(self):
        """Write diagnostic info (encodings, versions, proxies) to the debug output.

        No-op unless the 'verbose' param is set. Also performs the optional
        'call_home' public-IP and latest-version check.
        """
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # Some sys.stdout replacements lack an .encoding attribute
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        # Best-effort: report the git commit when running from a checkout
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            try:
                # Python 2 only: clear the exception state left by the failure
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        # Only list external programs that were actually detected
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Collect the effective proxy mapping from all installed handlers
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2218
    def _setup_opener(self):
        """Build the urllib opener (cookies, proxies, HTTPS, data/file handlers).

        The result is stored in self._opener and used by self.urlopen().
        """
        timeout_val = self.params.get('socket_timeout')
        # Default socket timeout is 600 seconds when not configured
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # No cookie file configured: keep cookies in memory only
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            opts_cookiefile = expand_path(opts_cookiefile)
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Load existing cookies only when the file is readable; an absent
            # file will be created on save
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An explicit empty proxy disables proxying entirely
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            # Fall back to the environment's proxy settings
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2271
2272     def encode(self, s):
2273         if isinstance(s, bytes):
2274             return s  # Already encoded
2275
2276         try:
2277             return s.encode(self.get_encoding())
2278         except UnicodeEncodeError as err:
2279             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2280             raise
2281
2282     def get_encoding(self):
2283         encoding = self.params.get('encoding')
2284         if encoding is None:
2285             encoding = preferredencoding()
2286         return encoding
2287
    def _write_thumbnails(self, info_dict, filename):
        """Download thumbnail(s) next to the video file.

        With 'writethumbnail' only the last listed thumbnail is fetched
        (presumably the best one — TODO confirm ordering contract); with
        'write_all_thumbnails' every one is. Each saved path is recorded
        in the thumbnail dict under 'filename'.
        """
        if self.params.get('writethumbnail', False):
            thumbnails = info_dict.get('thumbnails')
            if thumbnails:
                thumbnails = [thumbnails[-1]]
        elif self.params.get('write_all_thumbnails', False):
            thumbnails = info_dict.get('thumbnails')
        else:
            return

        if not thumbnails:
            # No thumbnails present, so return immediately
            return

        for t in thumbnails:
            # Fall back to .jpg when the URL carries no usable extension
            thumb_ext = determine_ext(t['url'], 'jpg')
            # Disambiguate file names only when saving several thumbnails
            suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
            thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
            t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext

            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                self.to_screen('[%s] %s: Thumbnail %sis already present' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
            else:
                self.to_screen('[%s] %s: Downloading thumbnail %s...' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
                try:
                    uf = self.urlopen(t['url'])
                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                    self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
                                   (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    # Thumbnail failures are non-fatal: warn and continue
                    self.report_warning('Unable to download thumbnail "%s": %s' %
                                        (t['url'], error_to_compat_str(err)))