[YoutubeDL] check annotations availability (closes #18582)
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from string import ascii_letters
30
31 from .compat import (
32     compat_basestring,
33     compat_cookiejar,
34     compat_get_terminal_size,
35     compat_http_client,
36     compat_kwargs,
37     compat_numeric_types,
38     compat_os_name,
39     compat_str,
40     compat_tokenize_tokenize,
41     compat_urllib_error,
42     compat_urllib_request,
43     compat_urllib_request_DataHandler,
44 )
45 from .utils import (
46     age_restricted,
47     args_to_str,
48     ContentTooShortError,
49     date_from_str,
50     DateRange,
51     DEFAULT_OUTTMPL,
52     determine_ext,
53     determine_protocol,
54     DownloadError,
55     encode_compat_str,
56     encodeFilename,
57     error_to_compat_str,
58     expand_path,
59     ExtractorError,
60     format_bytes,
61     formatSeconds,
62     GeoRestrictedError,
63     int_or_none,
64     ISO3166Utils,
65     locked_file,
66     make_HTTPS_handler,
67     MaxDownloadsReached,
68     orderedSet,
69     PagedList,
70     parse_filesize,
71     PerRequestProxyHandler,
72     platform_name,
73     PostProcessingError,
74     preferredencoding,
75     prepend_extension,
76     register_socks_protocols,
77     render_table,
78     replace_extension,
79     SameFileError,
80     sanitize_filename,
81     sanitize_path,
82     sanitize_url,
83     sanitized_Request,
84     std_headers,
85     str_or_none,
86     subtitles_filename,
87     UnavailableVideoError,
88     url_basename,
89     version_tuple,
90     write_json_file,
91     write_string,
92     YoutubeDLCookieJar,
93     YoutubeDLCookieProcessor,
94     YoutubeDLHandler,
95 )
96 from .cache import Cache
97 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
98 from .extractor.openload import PhantomJSwrapper
99 from .downloader import get_suitable_downloader
100 from .downloader.rtmp import rtmpdump_version
101 from .postprocessor import (
102     FFmpegFixupM3u8PP,
103     FFmpegFixupM4aPP,
104     FFmpegFixupStretchedPP,
105     FFmpegMergerPP,
106     FFmpegPostProcessor,
107     get_postprocessor,
108 )
109 from .version import __version__
110
111 if compat_os_name == 'nt':
112     import ctypes
113
114
115 class YoutubeDL(object):
116     """YoutubeDL class.
117
118     YoutubeDL objects are the ones responsible of downloading the
119     actual video file and writing it to disk if the user has requested
120     it, among some other tasks. In most cases there should be one per
121     program. As, given a video URL, the downloader doesn't know how to
122     extract all the needed information, task that InfoExtractors do, it
123     has to pass the URL to one of them.
124
125     For this, YoutubeDL objects have a method that allows
126     InfoExtractors to be registered in a given order. When it is passed
127     a URL, the YoutubeDL object hands it to the first InfoExtractor it
128     finds that reports being able to handle it. The InfoExtractor extracts
129     all the information about the video or videos the URL refers to, and
130     YoutubeDL processes the extracted information, possibly using a File
131     Downloader to download the video.
132
133     YoutubeDL objects accept a lot of parameters. In order not to saturate
134     the object constructor with arguments, it receives a dictionary of
135     options instead. These options are available through the params
136     attribute for the InfoExtractors to use. The YoutubeDL also
137     registers itself as the downloader in charge for the InfoExtractors
138     that are added to it, so this is a "mutual registration".
139
140     Available options:
141
142     username:          Username for authentication purposes.
143     password:          Password for authentication purposes.
144     videopassword:     Password for accessing a video.
145     ap_mso:            Adobe Pass multiple-system operator identifier.
146     ap_username:       Multiple-system operator account username.
147     ap_password:       Multiple-system operator account password.
148     usenetrc:          Use netrc for authentication instead.
149     verbose:           Print additional info to stdout.
150     quiet:             Do not print messages to stdout.
151     no_warnings:       Do not print out anything for warnings.
152     forceurl:          Force printing final URL.
153     forcetitle:        Force printing title.
154     forceid:           Force printing ID.
155     forcethumbnail:    Force printing thumbnail URL.
156     forcedescription:  Force printing description.
157     forcefilename:     Force printing final filename.
158     forceduration:     Force printing duration.
159     forcejson:         Force printing info_dict as JSON.
160     dump_single_json:  Force printing the info_dict of the whole playlist
161                        (or video) as a single JSON line.
162     simulate:          Do not download the video files.
163     format:            Video format code. See options.py for more information.
164     outtmpl:           Template for output names.
165     restrictfilenames: Do not allow "&" and spaces in file names
166     ignoreerrors:      Do not stop on download errors.
167     force_generic_extractor: Force downloader to use the generic extractor
168     nooverwrites:      Prevent overwriting files.
169     playliststart:     Playlist item to start at.
170     playlistend:       Playlist item to end at.
171     playlist_items:    Specific indices of playlist to download.
172     playlistreverse:   Download playlist items in reverse order.
173     playlistrandom:    Download playlist items in random order.
174     matchtitle:        Download only matching titles.
175     rejecttitle:       Reject downloads for matching titles.
176     logger:            Log messages to a logging.Logger instance.
177     logtostderr:       Log messages to stderr instead of stdout.
178     writedescription:  Write the video description to a .description file
179     writeinfojson:     Write the video description to a .info.json file
180     writeannotations:  Write the video annotations to a .annotations.xml file
181     writethumbnail:    Write the thumbnail image to a file
182     write_all_thumbnails:  Write all thumbnail formats to files
183     writesubtitles:    Write the video subtitles to a file
184     writeautomaticsub: Write the automatically generated subtitles to a file
185     allsubtitles:      Downloads all the subtitles of the video
186                        (requires writesubtitles or writeautomaticsub)
187     listsubtitles:     Lists all available subtitles for the video
188     subtitlesformat:   The format code for subtitles
189     subtitleslangs:    List of languages of the subtitles to download
190     keepvideo:         Keep the video file after post-processing
191     daterange:         A DateRange object, download only if the upload_date is in the range.
192     skip_download:     Skip the actual download of the video file
193     cachedir:          Location of the cache files in the filesystem.
194                        False to disable filesystem cache.
195     noplaylist:        Download single video instead of a playlist if in doubt.
196     age_limit:         An integer representing the user's age in years.
197                        Unsuitable videos for the given age are skipped.
198     min_views:         An integer representing the minimum view count the video
199                        must have in order to not be skipped.
200                        Videos without view count information are always
201                        downloaded. None for no limit.
202     max_views:         An integer representing the maximum view count.
203                        Videos that are more popular than that are not
204                        downloaded.
205                        Videos without view count information are always
206                        downloaded. None for no limit.
207     download_archive:  File name of a file where all downloads are recorded.
208                        Videos already present in the file are not downloaded
209                        again.
210     cookiefile:        File name where cookies should be read from and dumped to.
211     nocheckcertificate:Do not verify SSL certificates
212     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
213                        At the moment, this is only supported by YouTube.
214     proxy:             URL of the proxy server to use
215     geo_verification_proxy:  URL of the proxy to use for IP address verification
216                        on geo-restricted sites.
217     socket_timeout:    Time to wait for unresponsive hosts, in seconds
218     bidi_workaround:   Work around buggy terminals without bidirectional text
219                        support, using fribidi
220     debug_printtraffic:Print out sent and received HTTP traffic
221     include_ads:       Download ads as well
222     default_search:    Prepend this string if an input url is not valid.
223                        'auto' for elaborate guessing
224     encoding:          Use this encoding instead of the system-specified.
225     extract_flat:      Do not resolve URLs, return the immediate result.
226                        Pass in 'in_playlist' to only show this behavior for
227                        playlist items.
228     postprocessors:    A list of dictionaries, each with an entry
229                        * key:  The name of the postprocessor. See
230                                youtube_dl/postprocessor/__init__.py for a list.
231                        as well as any further keyword arguments for the
232                        postprocessor.
233     progress_hooks:    A list of functions that get called on download
234                        progress, with a dictionary with the entries
235                        * status: One of "downloading", "error", or "finished".
236                                  Check this first and ignore unknown values.
237
238                        If status is one of "downloading", or "finished", the
239                        following properties may also be present:
240                        * filename: The final filename (always present)
241                        * tmpfilename: The filename we're currently writing to
242                        * downloaded_bytes: Bytes on disk
243                        * total_bytes: Size of the whole file, None if unknown
244                        * total_bytes_estimate: Guess of the eventual file size,
245                                                None if unavailable.
246                        * elapsed: The number of seconds since download started.
247                        * eta: The estimated time in seconds, None if unknown
248                        * speed: The download speed in bytes/second, None if
249                                 unknown
250                        * fragment_index: The counter of the currently
251                                          downloaded video fragment.
252                        * fragment_count: The number of fragments (= individual
253                                          files that will be merged)
254
255                        Progress hooks are guaranteed to be called at least once
256                        (with status "finished") if the download is successful.
257     merge_output_format: Extension to use when merging formats.
258     fixup:             Automatically correct known faults of the file.
259                        One of:
260                        - "never": do nothing
261                        - "warn": only emit a warning
262                        - "detect_or_warn": check whether we can do anything
263                                            about it, warn otherwise (default)
264     source_address:    Client-side IP address to bind to.
265     call_home:         Boolean, true iff we are allowed to contact the
266                        youtube-dl servers for debugging.
267     sleep_interval:    Number of seconds to sleep before each download when
268                        used alone or a lower bound of a range for randomized
269                        sleep before each download (minimum possible number
270                        of seconds to sleep) when used along with
271                        max_sleep_interval.
272     max_sleep_interval:Upper bound of a range for randomized sleep before each
273                        download (maximum possible number of seconds to sleep).
274                        Must only be used along with sleep_interval.
275                        Actual sleep time will be a random float from range
276                        [sleep_interval; max_sleep_interval].
277     listformats:       Print an overview of available video formats and exit.
278     list_thumbnails:   Print a table of all thumbnails and exit.
279     match_filter:      A function that gets called with the info_dict of
280                        every video.
281                        If it returns a message, the video is ignored.
282                        If it returns None, the video is downloaded.
283                        match_filter_func in utils.py is one example for this.
284     no_color:          Do not emit color codes in output.
285     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
286                        HTTP header
287     geo_bypass_country:
288                        Two-letter ISO 3166-2 country code that will be used for
289                        explicit geographic restriction bypassing via faking
290                        X-Forwarded-For HTTP header
291     geo_bypass_ip_block:
292                        IP range in CIDR notation that will be used similarly to
293                        geo_bypass_country
294
295     The following options determine which downloader is picked:
296     external_downloader: Executable of the external downloader to call.
297                        None or unset for standard (built-in) downloader.
298     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
299                        if True, otherwise use ffmpeg/avconv if False, otherwise
300                        use downloader suggested by extractor if None.
301
302     The following parameters are not used by YoutubeDL itself, they are used by
303     the downloader (see youtube_dl/downloader/common.py):
304     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
305     noresizebuffer, retries, continuedl, noprogress, consoletitle,
306     xattr_set_filesize, external_downloader_args, hls_use_mpegts,
307     http_chunk_size.
308
309     The following options are used by the post processors:
310     prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are available,
311                        otherwise prefer ffmpeg.
312     ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
313                        to the binary or its containing directory.
314     postprocessor_args: A list of additional command-line arguments for the
315                         postprocessor.
316
317     The following options are used by the Youtube extractor:
318     youtube_include_dash_manifest: If True (default), DASH manifests and related
319                         data will be downloaded and processed by extractor.
320                         You can reduce network I/O by disabling it if you don't
321                         care about DASH.
322     """
323
    # Info-dict field names that prepare_filename() treats as numeric: when
    # one of them is missing from the template dict, its format specifier in
    # the output template is rewritten to a plain string conversion.
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level placeholders. __init__ assigns a per-instance value to every
    # one of these, so the mutable lists below are not shared between
    # instances that were built through __init__.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
341
    def __init__(self, params=None, auto_init=True):
        """Create a YoutubeDL object with the given options.

        params is a dictionary of options (see the class docstring); None is
        treated as an empty dictionary.  When auto_init is true, the debug
        header is printed and the default info extractors are registered.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # The boolean 'logtostderr' value is used as an index: False selects
        # stdout, True selects stderr.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        def check_deprecated(param, option, suggestion):
            # Warn and return True if the deprecated parameter is set (i.e.
            # is not None); otherwise return False without any output.
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        # The deprecated proxy option only takes effect when its replacement
        # has not been given explicitly.
        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            try:
                # Imported here so the module is only required when the
                # workaround is requested.
                import pty
                # The helper process writes to the slave end of the pty;
                # _bidi_workaround() reads the result back from the master end.
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                # Try bidiv first and fall back to fribidi if it cannot be
                # started.
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                # A missing executable only disables the workaround; any other
                # OS error is propagated.
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32'
                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
                and not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        # Only warns; the bytes template is left in params unchanged.
        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        # NOTE(review): _setup_opener() is defined outside this view;
        # presumably it builds the URL opener and cookie jar - confirm.
        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Each postprocessor definition is a dict: 'key' selects the class and
        # the remaining entries are passed to it as keyword arguments.  The
        # definition is copied so the caller's dict is not modified.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
435
436     def warn_if_short_id(self, argv):
437         # short YouTube ID starting with dash?
438         idxs = [
439             i for i, a in enumerate(argv)
440             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
441         if idxs:
442             correct_argv = (
443                 ['youtube-dl']
444                 + [a for i, a in enumerate(argv) if i not in idxs]
445                 + ['--'] + [argv[i] for i in idxs]
446             )
447             self.report_warning(
448                 'Long argument string detected. '
449                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
450                 args_to_str(correct_argv))
451
452     def add_info_extractor(self, ie):
453         """Add an InfoExtractor object to the end of the list."""
454         self._ies.append(ie)
455         if not isinstance(ie, type):
456             self._ies_instances[ie.ie_key()] = ie
457             ie.set_downloader(self)
458
459     def get_info_extractor(self, ie_key):
460         """
461         Get an instance of an IE with name ie_key, it will try to get one from
462         the _ies list, if there's no instance it will create a new one and add
463         it to the extractor list.
464         """
465         ie = self._ies_instances.get(ie_key)
466         if ie is None:
467             ie = get_info_extractor(ie_key)()
468             self.add_info_extractor(ie)
469         return ie
470
471     def add_default_info_extractors(self):
472         """
473         Add the InfoExtractors returned by gen_extractors to the end of the list
474         """
475         for ie in gen_extractor_classes():
476             self.add_info_extractor(ie)
477
478     def add_post_processor(self, pp):
479         """Add a PostProcessor object to the end of the chain."""
480         self._pps.append(pp)
481         pp.set_downloader(self)
482
483     def add_progress_hook(self, ph):
484         """Add the progress hook (currently only for the file downloader)"""
485         self._progress_hooks.append(ph)
486
487     def _bidi_workaround(self, message):
488         if not hasattr(self, '_output_channel'):
489             return message
490
491         assert hasattr(self, '_output_process')
492         assert isinstance(message, compat_str)
493         line_count = message.count('\n') + 1
494         self._output_process.stdin.write((message + '\n').encode('utf-8'))
495         self._output_process.stdin.flush()
496         res = ''.join(self._output_channel.readline().decode('utf-8')
497                       for _ in range(line_count))
498         return res[:-len('\n')]
499
500     def to_screen(self, message, skip_eol=False):
501         """Print message to stdout if not in quiet mode."""
502         return self.to_stdout(message, skip_eol, check_quiet=True)
503
504     def _write_string(self, s, out=None):
505         write_string(s, out=out, encoding=self.params.get('encoding'))
506
507     def to_stdout(self, message, skip_eol=False, check_quiet=False):
508         """Print message to stdout if not in quiet mode."""
509         if self.params.get('logger'):
510             self.params['logger'].debug(message)
511         elif not check_quiet or not self.params.get('quiet', False):
512             message = self._bidi_workaround(message)
513             terminator = ['\n', ''][skip_eol]
514             output = message + terminator
515
516             self._write_string(output, self._screen_file)
517
518     def to_stderr(self, message):
519         """Print message to stderr."""
520         assert isinstance(message, compat_str)
521         if self.params.get('logger'):
522             self.params['logger'].error(message)
523         else:
524             message = self._bidi_workaround(message)
525             output = message + '\n'
526             self._write_string(output, self._err_file)
527
528     def to_console_title(self, message):
529         if not self.params.get('consoletitle', False):
530             return
531         if compat_os_name == 'nt':
532             if ctypes.windll.kernel32.GetConsoleWindow():
533                 # c_wchar_p() might not be necessary if `message` is
534                 # already of type unicode()
535                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
536         elif 'TERM' in os.environ:
537             self._write_string('\033]0;%s\007' % message, self._screen_file)
538
539     def save_console_title(self):
540         if not self.params.get('consoletitle', False):
541             return
542         if self.params.get('simulate', False):
543             return
544         if compat_os_name != 'nt' and 'TERM' in os.environ:
545             # Save the title on stack
546             self._write_string('\033[22;0t', self._screen_file)
547
548     def restore_console_title(self):
549         if not self.params.get('consoletitle', False):
550             return
551         if self.params.get('simulate', False):
552             return
553         if compat_os_name != 'nt' and 'TERM' in os.environ:
554             # Restore the title from stack
555             self._write_string('\033[23;0t', self._screen_file)
556
557     def __enter__(self):
558         self.save_console_title()
559         return self
560
561     def __exit__(self, *args):
562         self.restore_console_title()
563
564         if self.params.get('cookiefile') is not None:
565             self.cookiejar.save(ignore_discard=True, ignore_expires=True)
566
567     def trouble(self, message=None, tb=None):
568         """Determine action to take when a download problem appears.
569
570         Depending on if the downloader has been configured to ignore
571         download errors or not, this method may throw an exception or
572         not when errors are found, after printing the message.
573
574         tb, if given, is additional traceback information.
575         """
576         if message is not None:
577             self.to_stderr(message)
578         if self.params.get('verbose'):
579             if tb is None:
580                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
581                     tb = ''
582                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
583                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
584                     tb += encode_compat_str(traceback.format_exc())
585                 else:
586                     tb_data = traceback.format_list(traceback.extract_stack())
587                     tb = ''.join(tb_data)
588             self.to_stderr(tb)
589         if not self.params.get('ignoreerrors', False):
590             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
591                 exc_info = sys.exc_info()[1].exc_info
592             else:
593                 exc_info = sys.exc_info()
594             raise DownloadError(message, exc_info)
595         self._download_retcode = 1
596
597     def report_warning(self, message):
598         '''
599         Print the message to stderr, it will be prefixed with 'WARNING:'
600         If stderr is a tty file the 'WARNING:' will be colored
601         '''
602         if self.params.get('logger') is not None:
603             self.params['logger'].warning(message)
604         else:
605             if self.params.get('no_warnings'):
606                 return
607             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
608                 _msg_header = '\033[0;33mWARNING:\033[0m'
609             else:
610                 _msg_header = 'WARNING:'
611             warning_message = '%s %s' % (_msg_header, message)
612             self.to_stderr(warning_message)
613
614     def report_error(self, message, tb=None):
615         '''
616         Do the same as trouble, but prefixes the message with 'ERROR:', colored
617         in red if stderr is a tty file.
618         '''
619         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
620             _msg_header = '\033[0;31mERROR:\033[0m'
621         else:
622             _msg_header = 'ERROR:'
623         error_message = '%s %s' % (_msg_header, message)
624         self.trouble(error_message, tb)
625
626     def report_file_already_downloaded(self, file_name):
627         """Report file has already been fully downloaded."""
628         try:
629             self.to_screen('[download] %s has already been downloaded' % file_name)
630         except UnicodeEncodeError:
631             self.to_screen('[download] The file has already been downloaded')
632
633     def prepare_filename(self, info_dict):
634         """Generate the output filename."""
635         try:
636             template_dict = dict(info_dict)
637
638             template_dict['epoch'] = int(time.time())
639             autonumber_size = self.params.get('autonumber_size')
640             if autonumber_size is None:
641                 autonumber_size = 5
642             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
643             if template_dict.get('resolution') is None:
644                 if template_dict.get('width') and template_dict.get('height'):
645                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
646                 elif template_dict.get('height'):
647                     template_dict['resolution'] = '%sp' % template_dict['height']
648                 elif template_dict.get('width'):
649                     template_dict['resolution'] = '%dx?' % template_dict['width']
650
651             sanitize = lambda k, v: sanitize_filename(
652                 compat_str(v),
653                 restricted=self.params.get('restrictfilenames'),
654                 is_id=(k == 'id' or k.endswith('_id')))
655             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
656                                  for k, v in template_dict.items()
657                                  if v is not None and not isinstance(v, (list, tuple, dict)))
658             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
659
660             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
661
662             # For fields playlist_index and autonumber convert all occurrences
663             # of %(field)s to %(field)0Nd for backward compatibility
664             field_size_compat_map = {
665                 'playlist_index': len(str(template_dict['n_entries'])),
666                 'autonumber': autonumber_size,
667             }
668             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
669             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
670             if mobj:
671                 outtmpl = re.sub(
672                     FIELD_SIZE_COMPAT_RE,
673                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
674                     outtmpl)
675
676             # Missing numeric fields used together with integer presentation types
677             # in format specification will break the argument substitution since
678             # string 'NA' is returned for missing fields. We will patch output
679             # template for missing fields to meet string presentation type.
680             for numeric_field in self._NUMERIC_FIELDS:
681                 if numeric_field not in template_dict:
682                     # As of [1] format syntax is:
683                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
684                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
685                     FORMAT_RE = r'''(?x)
686                         (?<!%)
687                         %
688                         \({0}\)  # mapping key
689                         (?:[#0\-+ ]+)?  # conversion flags (optional)
690                         (?:\d+)?  # minimum field width (optional)
691                         (?:\.\d+)?  # precision (optional)
692                         [hlL]?  # length modifier (optional)
693                         [diouxXeEfFgGcrs%]  # conversion type
694                     '''
695                     outtmpl = re.sub(
696                         FORMAT_RE.format(numeric_field),
697                         r'%({0})s'.format(numeric_field), outtmpl)
698
699             # expand_path translates '%%' into '%' and '$$' into '$'
700             # correspondingly that is not what we want since we need to keep
701             # '%%' intact for template dict substitution step. Working around
702             # with boundary-alike separator hack.
703             sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
704             outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
705
706             # outtmpl should be expand_path'ed before template dict substitution
707             # because meta fields may contain env variables we don't want to
708             # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
709             # title "Hello $PATH", we don't want `$PATH` to be expanded.
710             filename = expand_path(outtmpl).replace(sep, '') % template_dict
711
712             # Temporary fix for #4787
713             # 'Treat' all problem characters by passing filename through preferredencoding
714             # to workaround encoding issues with subprocess on python2 @ Windows
715             if sys.version_info < (3, 0) and sys.platform == 'win32':
716                 filename = encodeFilename(filename, True).decode(preferredencoding())
717             return sanitize_path(filename)
718         except ValueError as err:
719             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
720             return None
721
722     def _match_entry(self, info_dict, incomplete):
723         """ Returns None iff the file should be downloaded """
724
725         video_title = info_dict.get('title', info_dict.get('id', 'video'))
726         if 'title' in info_dict:
727             # This can happen when we're just evaluating the playlist
728             title = info_dict['title']
729             matchtitle = self.params.get('matchtitle', False)
730             if matchtitle:
731                 if not re.search(matchtitle, title, re.IGNORECASE):
732                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
733             rejecttitle = self.params.get('rejecttitle', False)
734             if rejecttitle:
735                 if re.search(rejecttitle, title, re.IGNORECASE):
736                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
737         date = info_dict.get('upload_date')
738         if date is not None:
739             dateRange = self.params.get('daterange', DateRange())
740             if date not in dateRange:
741                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
742         view_count = info_dict.get('view_count')
743         if view_count is not None:
744             min_views = self.params.get('min_views')
745             if min_views is not None and view_count < min_views:
746                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
747             max_views = self.params.get('max_views')
748             if max_views is not None and view_count > max_views:
749                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
750         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
751             return 'Skipping "%s" because it is age restricted' % video_title
752         if self.in_download_archive(info_dict):
753             return '%s has already been recorded in archive' % video_title
754
755         if not incomplete:
756             match_filter = self.params.get('match_filter')
757             if match_filter is not None:
758                 ret = match_filter(info_dict)
759                 if ret is not None:
760                     return ret
761
762         return None
763
764     @staticmethod
765     def add_extra_info(info_dict, extra_info):
766         '''Set the keys from extra_info in info dict if they are missing'''
767         for key, value in extra_info.items():
768             info_dict.setdefault(key, value)
769
770     def extract_info(self, url, download=True, ie_key=None, extra_info={},
771                      process=True, force_generic_extractor=False):
772         '''
773         Returns a list with a dictionary for each video we find.
774         If 'download', also downloads the videos.
775         extra_info is a dict containing the extra values to add to each result
776         '''
777
778         if not ie_key and force_generic_extractor:
779             ie_key = 'Generic'
780
781         if ie_key:
782             ies = [self.get_info_extractor(ie_key)]
783         else:
784             ies = self._ies
785
786         for ie in ies:
787             if not ie.suitable(url):
788                 continue
789
790             ie = self.get_info_extractor(ie.ie_key())
791             if not ie.working():
792                 self.report_warning('The program functionality for this site has been marked as broken, '
793                                     'and will probably not work.')
794
795             try:
796                 ie_result = ie.extract(url)
797                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
798                     break
799                 if isinstance(ie_result, list):
800                     # Backwards compatibility: old IE result format
801                     ie_result = {
802                         '_type': 'compat_list',
803                         'entries': ie_result,
804                     }
805                 self.add_default_extra_info(ie_result, ie, url)
806                 if process:
807                     return self.process_ie_result(ie_result, download, extra_info)
808                 else:
809                     return ie_result
810             except GeoRestrictedError as e:
811                 msg = e.msg
812                 if e.countries:
813                     msg += '\nThis video is available in %s.' % ', '.join(
814                         map(ISO3166Utils.short2full, e.countries))
815                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
816                 self.report_error(msg)
817                 break
818             except ExtractorError as e:  # An error we somewhat expected
819                 self.report_error(compat_str(e), e.format_traceback())
820                 break
821             except MaxDownloadsReached:
822                 raise
823             except Exception as e:
824                 if self.params.get('ignoreerrors', False):
825                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
826                     break
827                 else:
828                     raise
829         else:
830             self.report_error('no suitable InfoExtractor for URL %s' % url)
831
832     def add_default_extra_info(self, ie_result, ie, url):
833         self.add_extra_info(ie_result, {
834             'extractor': ie.IE_NAME,
835             'webpage_url': url,
836             'webpage_url_basename': url_basename(url),
837             'extractor_key': ie.ie_key(),
838         })
839
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        Dispatches on ie_result['_type'] (defaulting to 'video'):
        'video', 'url', 'url_transparent', 'playlist', 'multi_video' and
        'compat_list' are handled; any other type raises Exception.

        extra_info is a dict of values that are added to video results
        (without overwriting existing keys) and passed along when
        references are resolved.

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            # With flat extraction the reference is returned unresolved:
            # either always (extract_flat is True) or only for entries that
            # come from a playlist (extract_flat == 'in_playlist')
            extract_flat = self.params.get('extract_flat', False)
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
                    or extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # extract_info may return None when ignoreerrors is enabled and
            # extraction failed with an error, don't crash and return early
            # in this case
            if not info:
                return info

            # Every non-None field of the outer result overrides the inner
            # one, except for the fields that identify the inner result
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Extracted info may not be a video result (i.e.
            # info.get('_type', 'video') != video) but rather an url or
            # url_transparent. In such cases outer metadata (from ie_result)
            # should be propagated to inner one (info). For this to happen
            # _type of info should be overridden with url_transparent. This
            # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
            if new_result.get('_type') == 'url':
                new_result['_type'] = 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type in ('playlist', 'multi_video'):
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # playliststart is 1-based in the params, 0-based from here on
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                # Expand a comma separated specification of 1-based indices
                # and inclusive 'start-end' ranges into single indices
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = orderedSet(iter_playlistitems(playlistitems_str))

            ie_entries = ie_result['entries']

            # Pick the requested 1-based items out of a list of entries,
            # silently skipping the ones that fall outside of the list
            def make_playlistitems_entries(list_ie_entries):
                num_entries = len(list_ie_entries)
                return [
                    list_ie_entries[i - 1] for i in playlistitems
                    if -num_entries <= i - 1 < num_entries]

            def report_download(num_entries):
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, num_entries))

            # Entries may be a plain list, a PagedList or any other iterable;
            # when given, playlist_items takes precedence over the
            # playliststart/playlistend range
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = make_playlistitems_entries(ie_entries)
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    # Request every item as a separate one element slice
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                report_download(n_entries)
            else:  # iterable
                if playlistitems:
                    # Consume the iterable only up to the highest requested item
                    entries = make_playlistitems_entries(list(itertools.islice(
                        ie_entries, 0, max(playlistitems))))
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                report_download(n_entries)

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            if self.params.get('playlistrandom', False):
                random.shuffle(entries)

            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # This __x_forwarded_for_ip thing is a bit ugly but requires
                # minimal changes
                if x_forwarded_for:
                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                # NOTE(review): playlist_index is the position within the
                # selected entries plus playliststart; with playlist_items,
                # playlistreverse or playlistrandom this presumably differs
                # from the position in the original playlist - confirm
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_uploader': ie_result.get('uploader'),
                    'playlist_uploader_id': ie_result.get('uploader_id'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # Entries rejected by the filters are reported and left out
                # of the resulting entries
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            # Give every entry the extractor fields of the enclosing result
            # unless the entry already defines them
            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
1034
1035     def _build_format_filter(self, filter_spec):
1036         " Returns a function to filter the formats according to the filter_spec "
1037
1038         OPERATORS = {
1039             '<': operator.lt,
1040             '<=': operator.le,
1041             '>': operator.gt,
1042             '>=': operator.ge,
1043             '=': operator.eq,
1044             '!=': operator.ne,
1045         }
1046         operator_rex = re.compile(r'''(?x)\s*
1047             (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)
1048             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1049             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1050             $
1051             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1052         m = operator_rex.search(filter_spec)
1053         if m:
1054             try:
1055                 comparison_value = int(m.group('value'))
1056             except ValueError:
1057                 comparison_value = parse_filesize(m.group('value'))
1058                 if comparison_value is None:
1059                     comparison_value = parse_filesize(m.group('value') + 'B')
1060                 if comparison_value is None:
1061                     raise ValueError(
1062                         'Invalid value %r in format specification %r' % (
1063                             m.group('value'), filter_spec))
1064             op = OPERATORS[m.group('op')]
1065
1066         if not m:
1067             STR_OPERATORS = {
1068                 '=': operator.eq,
1069                 '^=': lambda attr, value: attr.startswith(value),
1070                 '$=': lambda attr, value: attr.endswith(value),
1071                 '*=': lambda attr, value: value in attr,
1072             }
1073             str_operator_rex = re.compile(r'''(?x)
1074                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1075                 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?
1076                 \s*(?P<value>[a-zA-Z0-9._-]+)
1077                 \s*$
1078                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1079             m = str_operator_rex.search(filter_spec)
1080             if m:
1081                 comparison_value = m.group('value')
1082                 str_op = STR_OPERATORS[m.group('op')]
1083                 if m.group('negation'):
1084                     op = lambda attr, value: not str_op(attr, value)
1085                 else:
1086                     op = str_op
1087
1088         if not m:
1089             raise ValueError('Invalid filter specification %r' % filter_spec)
1090
1091         def _filter(f):
1092             actual_value = f.get(m.group('key'))
1093             if actual_value is None:
1094                 return m.group('none_inclusive')
1095             return op(actual_value, comparison_value)
1096         return _filter
1097
1098     def _default_format_spec(self, info_dict, download=True):
1099
1100         def can_merge():
1101             merger = FFmpegMergerPP(self)
1102             return merger.available and merger.can_merge()
1103
1104         def prefer_best():
1105             if self.params.get('simulate', False):
1106                 return False
1107             if not download:
1108                 return False
1109             if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
1110                 return True
1111             if info_dict.get('is_live'):
1112                 return True
1113             if not can_merge():
1114                 return True
1115             return False
1116
1117         req_format_list = ['bestvideo+bestaudio', 'best']
1118         if prefer_best():
1119             req_format_list.reverse()
1120         return '/'.join(req_format_list)
1121
1122     def build_format_selector(self, format_spec):
1123         def syntax_error(note, start):
1124             message = (
1125                 'Invalid format specification: '
1126                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1127             return SyntaxError(message)
1128
1129         PICKFIRST = 'PICKFIRST'
1130         MERGE = 'MERGE'
1131         SINGLE = 'SINGLE'
1132         GROUP = 'GROUP'
1133         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1134
1135         def _parse_filter(tokens):
1136             filter_parts = []
1137             for type, string, start, _, _ in tokens:
1138                 if type == tokenize.OP and string == ']':
1139                     return ''.join(filter_parts)
1140                 else:
1141                     filter_parts.append(string)
1142
1143         def _remove_unused_ops(tokens):
1144             # Remove operators that we don't use and join them with the surrounding strings
1145             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1146             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1147             last_string, last_start, last_end, last_line = None, None, None, None
1148             for type, string, start, end, line in tokens:
1149                 if type == tokenize.OP and string == '[':
1150                     if last_string:
1151                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1152                         last_string = None
1153                     yield type, string, start, end, line
1154                     # everything inside brackets will be handled by _parse_filter
1155                     for type, string, start, end, line in tokens:
1156                         yield type, string, start, end, line
1157                         if type == tokenize.OP and string == ']':
1158                             break
1159                 elif type == tokenize.OP and string in ALLOWED_OPS:
1160                     if last_string:
1161                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1162                         last_string = None
1163                     yield type, string, start, end, line
1164                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1165                     if not last_string:
1166                         last_string = string
1167                         last_start = start
1168                         last_end = end
1169                     else:
1170                         last_string += string
1171             if last_string:
1172                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1173
1174         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1175             selectors = []
1176             current_selector = None
1177             for type, string, start, _, _ in tokens:
1178                 # ENCODING is only defined in python 3.x
1179                 if type == getattr(tokenize, 'ENCODING', None):
1180                     continue
1181                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1182                     current_selector = FormatSelector(SINGLE, string, [])
1183                 elif type == tokenize.OP:
1184                     if string == ')':
1185                         if not inside_group:
1186                             # ')' will be handled by the parentheses group
1187                             tokens.restore_last_token()
1188                         break
1189                     elif inside_merge and string in ['/', ',']:
1190                         tokens.restore_last_token()
1191                         break
1192                     elif inside_choice and string == ',':
1193                         tokens.restore_last_token()
1194                         break
1195                     elif string == ',':
1196                         if not current_selector:
1197                             raise syntax_error('"," must follow a format selector', start)
1198                         selectors.append(current_selector)
1199                         current_selector = None
1200                     elif string == '/':
1201                         if not current_selector:
1202                             raise syntax_error('"/" must follow a format selector', start)
1203                         first_choice = current_selector
1204                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1205                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1206                     elif string == '[':
1207                         if not current_selector:
1208                             current_selector = FormatSelector(SINGLE, 'best', [])
1209                         format_filter = _parse_filter(tokens)
1210                         current_selector.filters.append(format_filter)
1211                     elif string == '(':
1212                         if current_selector:
1213                             raise syntax_error('Unexpected "("', start)
1214                         group = _parse_format_selection(tokens, inside_group=True)
1215                         current_selector = FormatSelector(GROUP, group, [])
1216                     elif string == '+':
1217                         video_selector = current_selector
1218                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1219                         if not video_selector or not audio_selector:
1220                             raise syntax_error('"+" must be between two format selectors', start)
1221                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1222                     else:
1223                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1224                 elif type == tokenize.ENDMARKER:
1225                     break
1226             if current_selector:
1227                 selectors.append(current_selector)
1228             return selectors
1229
1230         def _build_selector_function(selector):
1231             if isinstance(selector, list):
1232                 fs = [_build_selector_function(s) for s in selector]
1233
1234                 def selector_function(ctx):
1235                     for f in fs:
1236                         for format in f(ctx):
1237                             yield format
1238                 return selector_function
1239             elif selector.type == GROUP:
1240                 selector_function = _build_selector_function(selector.selector)
1241             elif selector.type == PICKFIRST:
1242                 fs = [_build_selector_function(s) for s in selector.selector]
1243
1244                 def selector_function(ctx):
1245                     for f in fs:
1246                         picked_formats = list(f(ctx))
1247                         if picked_formats:
1248                             return picked_formats
1249                     return []
1250             elif selector.type == SINGLE:
1251                 format_spec = selector.selector
1252
1253                 def selector_function(ctx):
1254                     formats = list(ctx['formats'])
1255                     if not formats:
1256                         return
1257                     if format_spec == 'all':
1258                         for f in formats:
1259                             yield f
1260                     elif format_spec in ['best', 'worst', None]:
1261                         format_idx = 0 if format_spec == 'worst' else -1
1262                         audiovideo_formats = [
1263                             f for f in formats
1264                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1265                         if audiovideo_formats:
1266                             yield audiovideo_formats[format_idx]
1267                         # for extractors with incomplete formats (audio only (soundcloud)
1268                         # or video only (imgur)) we will fallback to best/worst
1269                         # {video,audio}-only format
1270                         elif ctx['incomplete_formats']:
1271                             yield formats[format_idx]
1272                     elif format_spec == 'bestaudio':
1273                         audio_formats = [
1274                             f for f in formats
1275                             if f.get('vcodec') == 'none']
1276                         if audio_formats:
1277                             yield audio_formats[-1]
1278                     elif format_spec == 'worstaudio':
1279                         audio_formats = [
1280                             f for f in formats
1281                             if f.get('vcodec') == 'none']
1282                         if audio_formats:
1283                             yield audio_formats[0]
1284                     elif format_spec == 'bestvideo':
1285                         video_formats = [
1286                             f for f in formats
1287                             if f.get('acodec') == 'none']
1288                         if video_formats:
1289                             yield video_formats[-1]
1290                     elif format_spec == 'worstvideo':
1291                         video_formats = [
1292                             f for f in formats
1293                             if f.get('acodec') == 'none']
1294                         if video_formats:
1295                             yield video_formats[0]
1296                     else:
1297                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1298                         if format_spec in extensions:
1299                             filter_f = lambda f: f['ext'] == format_spec
1300                         else:
1301                             filter_f = lambda f: f['format_id'] == format_spec
1302                         matches = list(filter(filter_f, formats))
1303                         if matches:
1304                             yield matches[-1]
1305             elif selector.type == MERGE:
1306                 def _merge(formats_info):
1307                     format_1, format_2 = [f['format_id'] for f in formats_info]
1308                     # The first format must contain the video and the
1309                     # second the audio
1310                     if formats_info[0].get('vcodec') == 'none':
1311                         self.report_error('The first format must '
1312                                           'contain the video, try using '
1313                                           '"-f %s+%s"' % (format_2, format_1))
1314                         return
1315                     # Formats must be opposite (video+audio)
1316                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1317                         self.report_error(
1318                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1319                             % (format_1, format_2))
1320                         return
1321                     output_ext = (
1322                         formats_info[0]['ext']
1323                         if self.params.get('merge_output_format') is None
1324                         else self.params['merge_output_format'])
1325                     return {
1326                         'requested_formats': formats_info,
1327                         'format': '%s+%s' % (formats_info[0].get('format'),
1328                                              formats_info[1].get('format')),
1329                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1330                                                 formats_info[1].get('format_id')),
1331                         'width': formats_info[0].get('width'),
1332                         'height': formats_info[0].get('height'),
1333                         'resolution': formats_info[0].get('resolution'),
1334                         'fps': formats_info[0].get('fps'),
1335                         'vcodec': formats_info[0].get('vcodec'),
1336                         'vbr': formats_info[0].get('vbr'),
1337                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1338                         'acodec': formats_info[1].get('acodec'),
1339                         'abr': formats_info[1].get('abr'),
1340                         'ext': output_ext,
1341                     }
1342                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1343
1344                 def selector_function(ctx):
1345                     for pair in itertools.product(
1346                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1347                         yield _merge(pair)
1348
1349             filters = [self._build_format_filter(f) for f in selector.filters]
1350
1351             def final_selector(ctx):
1352                 ctx_copy = copy.deepcopy(ctx)
1353                 for _filter in filters:
1354                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1355                 return selector_function(ctx_copy)
1356             return final_selector
1357
1358         stream = io.BytesIO(format_spec.encode('utf-8'))
1359         try:
1360             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1361         except tokenize.TokenError:
1362             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1363
1364         class TokenIterator(object):
1365             def __init__(self, tokens):
1366                 self.tokens = tokens
1367                 self.counter = 0
1368
1369             def __iter__(self):
1370                 return self
1371
1372             def __next__(self):
1373                 if self.counter >= len(self.tokens):
1374                     raise StopIteration()
1375                 value = self.tokens[self.counter]
1376                 self.counter += 1
1377                 return value
1378
1379             next = __next__
1380
1381             def restore_last_token(self):
1382                 self.counter -= 1
1383
1384         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1385         return _build_selector_function(parsed_selector)
1386
1387     def _calc_headers(self, info_dict):
1388         res = std_headers.copy()
1389
1390         add_headers = info_dict.get('http_headers')
1391         if add_headers:
1392             res.update(add_headers)
1393
1394         cookies = self._calc_cookies(info_dict)
1395         if cookies:
1396             res['Cookie'] = cookies
1397
1398         if 'X-Forwarded-For' not in res:
1399             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1400             if x_forwarded_for_ip:
1401                 res['X-Forwarded-For'] = x_forwarded_for_ip
1402
1403         return res
1404
1405     def _calc_cookies(self, info_dict):
1406         pr = sanitized_Request(info_dict['url'])
1407         self.cookiejar.add_cookie_header(pr)
1408         return pr.get_header('Cookie')
1409
1410     def process_video_result(self, info_dict, download=True):
1411         assert info_dict.get('_type', 'video') == 'video'
1412
1413         if 'id' not in info_dict:
1414             raise ExtractorError('Missing "id" field in extractor result')
1415         if 'title' not in info_dict:
1416             raise ExtractorError('Missing "title" field in extractor result')
1417
1418         def report_force_conversion(field, field_not, conversion):
1419             self.report_warning(
1420                 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
1421                 % (field, field_not, conversion))
1422
1423         def sanitize_string_field(info, string_field):
1424             field = info.get(string_field)
1425             if field is None or isinstance(field, compat_str):
1426                 return
1427             report_force_conversion(string_field, 'a string', 'string')
1428             info[string_field] = compat_str(field)
1429
1430         def sanitize_numeric_fields(info):
1431             for numeric_field in self._NUMERIC_FIELDS:
1432                 field = info.get(numeric_field)
1433                 if field is None or isinstance(field, compat_numeric_types):
1434                     continue
1435                 report_force_conversion(numeric_field, 'numeric', 'int')
1436                 info[numeric_field] = int_or_none(field)
1437
1438         sanitize_string_field(info_dict, 'id')
1439         sanitize_numeric_fields(info_dict)
1440
1441         if 'playlist' not in info_dict:
1442             # It isn't part of a playlist
1443             info_dict['playlist'] = None
1444             info_dict['playlist_index'] = None
1445
1446         thumbnails = info_dict.get('thumbnails')
1447         if thumbnails is None:
1448             thumbnail = info_dict.get('thumbnail')
1449             if thumbnail:
1450                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1451         if thumbnails:
1452             thumbnails.sort(key=lambda t: (
1453                 t.get('preference') if t.get('preference') is not None else -1,
1454                 t.get('width') if t.get('width') is not None else -1,
1455                 t.get('height') if t.get('height') is not None else -1,
1456                 t.get('id') if t.get('id') is not None else '', t.get('url')))
1457             for i, t in enumerate(thumbnails):
1458                 t['url'] = sanitize_url(t['url'])
1459                 if t.get('width') and t.get('height'):
1460                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
1461                 if t.get('id') is None:
1462                     t['id'] = '%d' % i
1463
1464         if self.params.get('list_thumbnails'):
1465             self.list_thumbnails(info_dict)
1466             return
1467
1468         thumbnail = info_dict.get('thumbnail')
1469         if thumbnail:
1470             info_dict['thumbnail'] = sanitize_url(thumbnail)
1471         elif thumbnails:
1472             info_dict['thumbnail'] = thumbnails[-1]['url']
1473
1474         if 'display_id' not in info_dict and 'id' in info_dict:
1475             info_dict['display_id'] = info_dict['id']
1476
1477         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1478             # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1479             # see http://bugs.python.org/issue1646728)
1480             try:
1481                 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1482                 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1483             except (ValueError, OverflowError, OSError):
1484                 pass
1485
1486         # Auto generate title fields corresponding to the *_number fields when missing
1487         # in order to always have clean titles. This is very common for TV series.
1488         for field in ('chapter', 'season', 'episode'):
1489             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1490                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1491
1492         for cc_kind in ('subtitles', 'automatic_captions'):
1493             cc = info_dict.get(cc_kind)
1494             if cc:
1495                 for _, subtitle in cc.items():
1496                     for subtitle_format in subtitle:
1497                         if subtitle_format.get('url'):
1498                             subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1499                         if subtitle_format.get('ext') is None:
1500                             subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1501
1502         automatic_captions = info_dict.get('automatic_captions')
1503         subtitles = info_dict.get('subtitles')
1504
1505         if self.params.get('listsubtitles', False):
1506             if 'automatic_captions' in info_dict:
1507                 self.list_subtitles(
1508                     info_dict['id'], automatic_captions, 'automatic captions')
1509             self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1510             return
1511
1512         info_dict['requested_subtitles'] = self.process_subtitles(
1513             info_dict['id'], subtitles, automatic_captions)
1514
1515         # We now pick which formats have to be downloaded
1516         if info_dict.get('formats') is None:
1517             # There's only one format available
1518             formats = [info_dict]
1519         else:
1520             formats = info_dict['formats']
1521
1522         if not formats:
1523             raise ExtractorError('No video formats found!')
1524
1525         def is_wellformed(f):
1526             url = f.get('url')
1527             if not url:
1528                 self.report_warning(
1529                     '"url" field is missing or empty - skipping format, '
1530                     'there is an error in extractor')
1531                 return False
1532             if isinstance(url, bytes):
1533                 sanitize_string_field(f, 'url')
1534             return True
1535
1536         # Filter out malformed formats for better extraction robustness
1537         formats = list(filter(is_wellformed, formats))
1538
1539         formats_dict = {}
1540
1541         # We check that all the formats have the format and format_id fields
1542         for i, format in enumerate(formats):
1543             sanitize_string_field(format, 'format_id')
1544             sanitize_numeric_fields(format)
1545             format['url'] = sanitize_url(format['url'])
1546             if not format.get('format_id'):
1547                 format['format_id'] = compat_str(i)
1548             else:
1549                 # Sanitize format_id from characters used in format selector expression
1550                 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
1551             format_id = format['format_id']
1552             if format_id not in formats_dict:
1553                 formats_dict[format_id] = []
1554             formats_dict[format_id].append(format)
1555
1556         # Make sure all formats have unique format_id
1557         for format_id, ambiguous_formats in formats_dict.items():
1558             if len(ambiguous_formats) > 1:
1559                 for i, format in enumerate(ambiguous_formats):
1560                     format['format_id'] = '%s-%d' % (format_id, i)
1561
1562         for i, format in enumerate(formats):
1563             if format.get('format') is None:
1564                 format['format'] = '{id} - {res}{note}'.format(
1565                     id=format['format_id'],
1566                     res=self.format_resolution(format),
1567                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1568                 )
1569             # Automatically determine file extension if missing
1570             if format.get('ext') is None:
1571                 format['ext'] = determine_ext(format['url']).lower()
1572             # Automatically determine protocol if missing (useful for format
1573             # selection purposes)
1574             if format.get('protocol') is None:
1575                 format['protocol'] = determine_protocol(format)
1576             # Add HTTP headers, so that external programs can use them from the
1577             # json output
1578             full_format_info = info_dict.copy()
1579             full_format_info.update(format)
1580             format['http_headers'] = self._calc_headers(full_format_info)
1581         # Remove private housekeeping stuff
1582         if '__x_forwarded_for_ip' in info_dict:
1583             del info_dict['__x_forwarded_for_ip']
1584
1585         # TODO Central sorting goes here
1586
1587         if formats[0] is not info_dict:
1588             # only set the 'formats' fields if the original info_dict list them
1589             # otherwise we end up with a circular reference, the first (and unique)
1590             # element in the 'formats' field in info_dict is info_dict itself,
1591             # which can't be exported to json
1592             info_dict['formats'] = formats
1593         if self.params.get('listformats'):
1594             self.list_formats(info_dict)
1595             return
1596
1597         req_format = self.params.get('format')
1598         if req_format is None:
1599             req_format = self._default_format_spec(info_dict, download=download)
1600             if self.params.get('verbose'):
1601                 self.to_stdout('[debug] Default format spec: %s' % req_format)
1602
1603         format_selector = self.build_format_selector(req_format)
1604
1605         # While in format selection we may need to have an access to the original
1606         # format set in order to calculate some metrics or do some processing.
1607         # For now we need to be able to guess whether original formats provided
1608         # by extractor are incomplete or not (i.e. whether extractor provides only
1609         # video-only or audio-only formats) for proper formats selection for
1610         # extractors with such incomplete formats (see
1611         # https://github.com/ytdl-org/youtube-dl/pull/5556).
1612         # Since formats may be filtered during format selection and may not match
1613         # the original formats the results may be incorrect. Thus original formats
1614         # or pre-calculated metrics should be passed to format selection routines
1615         # as well.
1616         # We will pass a context object containing all necessary additional data
1617         # instead of just formats.
1618         # This fixes incorrect format selection issue (see
1619         # https://github.com/ytdl-org/youtube-dl/issues/10083).
1620         incomplete_formats = (
1621             # All formats are video-only or
1622             all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
1623             # all formats are audio-only
1624             or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
1625
1626         ctx = {
1627             'formats': formats,
1628             'incomplete_formats': incomplete_formats,
1629         }
1630
1631         formats_to_download = list(format_selector(ctx))
1632         if not formats_to_download:
1633             raise ExtractorError('requested format not available',
1634                                  expected=True)
1635
1636         if download:
1637             if len(formats_to_download) > 1:
1638                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1639             for format in formats_to_download:
1640                 new_info = dict(info_dict)
1641                 new_info.update(format)
1642                 self.process_info(new_info)
1643         # We update the info dict with the best quality format (backwards compatibility)
1644         info_dict.update(formats_to_download[-1])
1645         return info_dict
1646
1647     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1648         """Select the requested subtitles and their format"""
1649         available_subs = {}
1650         if normal_subtitles and self.params.get('writesubtitles'):
1651             available_subs.update(normal_subtitles)
1652         if automatic_captions and self.params.get('writeautomaticsub'):
1653             for lang, cap_info in automatic_captions.items():
1654                 if lang not in available_subs:
1655                     available_subs[lang] = cap_info
1656
1657         if (not self.params.get('writesubtitles') and not
1658                 self.params.get('writeautomaticsub') or not
1659                 available_subs):
1660             return None
1661
1662         if self.params.get('allsubtitles', False):
1663             requested_langs = available_subs.keys()
1664         else:
1665             if self.params.get('subtitleslangs', False):
1666                 requested_langs = self.params.get('subtitleslangs')
1667             elif 'en' in available_subs:
1668                 requested_langs = ['en']
1669             else:
1670                 requested_langs = [list(available_subs.keys())[0]]
1671
1672         formats_query = self.params.get('subtitlesformat', 'best')
1673         formats_preference = formats_query.split('/') if formats_query else []
1674         subs = {}
1675         for lang in requested_langs:
1676             formats = available_subs.get(lang)
1677             if formats is None:
1678                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1679                 continue
1680             for ext in formats_preference:
1681                 if ext == 'best':
1682                     f = formats[-1]
1683                     break
1684                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1685                 if matches:
1686                     f = matches[-1]
1687                     break
1688             else:
1689                 f = formats[-1]
1690                 self.report_warning(
1691                     'No subtitle format found matching "%s" for language %s, '
1692                     'using %s' % (formats_query, lang, f['ext']))
1693             subs[lang] = f
1694         return subs
1695
1696     def process_info(self, info_dict):
1697         """Process a single resolved IE result."""
1698
1699         assert info_dict.get('_type', 'video') == 'video'
1700
1701         max_downloads = self.params.get('max_downloads')
1702         if max_downloads is not None:
1703             if self._num_downloads >= int(max_downloads):
1704                 raise MaxDownloadsReached()
1705
1706         info_dict['fulltitle'] = info_dict['title']
1707         if len(info_dict['title']) > 200:
1708             info_dict['title'] = info_dict['title'][:197] + '...'
1709
1710         if 'format' not in info_dict:
1711             info_dict['format'] = info_dict['ext']
1712
1713         reason = self._match_entry(info_dict, incomplete=False)
1714         if reason is not None:
1715             self.to_screen('[download] ' + reason)
1716             return
1717
1718         self._num_downloads += 1
1719
1720         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1721
1722         # Forced printings
1723         if self.params.get('forcetitle', False):
1724             self.to_stdout(info_dict['fulltitle'])
1725         if self.params.get('forceid', False):
1726             self.to_stdout(info_dict['id'])
1727         if self.params.get('forceurl', False):
1728             if info_dict.get('requested_formats') is not None:
1729                 for f in info_dict['requested_formats']:
1730                     self.to_stdout(f['url'] + f.get('play_path', ''))
1731             else:
1732                 # For RTMP URLs, also include the playpath
1733                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1734         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1735             self.to_stdout(info_dict['thumbnail'])
1736         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1737             self.to_stdout(info_dict['description'])
1738         if self.params.get('forcefilename', False) and filename is not None:
1739             self.to_stdout(filename)
1740         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1741             self.to_stdout(formatSeconds(info_dict['duration']))
1742         if self.params.get('forceformat', False):
1743             self.to_stdout(info_dict['format'])
1744         if self.params.get('forcejson', False):
1745             self.to_stdout(json.dumps(info_dict))
1746
1747         # Do nothing else if in simulate mode
1748         if self.params.get('simulate', False):
1749             return
1750
1751         if filename is None:
1752             return
1753
1754         def ensure_dir_exists(path):
1755             try:
1756                 dn = os.path.dirname(path)
1757                 if dn and not os.path.exists(dn):
1758                     os.makedirs(dn)
1759                 return True
1760             except (OSError, IOError) as err:
1761                 self.report_error('unable to create directory ' + error_to_compat_str(err))
1762                 return False
1763
1764         if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
1765             return
1766
1767         if self.params.get('writedescription', False):
1768             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1769             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1770                 self.to_screen('[info] Video description is already present')
1771             elif info_dict.get('description') is None:
1772                 self.report_warning('There\'s no description to write.')
1773             else:
1774                 try:
1775                     self.to_screen('[info] Writing video description to: ' + descfn)
1776                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1777                         descfile.write(info_dict['description'])
1778                 except (OSError, IOError):
1779                     self.report_error('Cannot write description file ' + descfn)
1780                     return
1781
1782         if self.params.get('writeannotations', False):
1783             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1784             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1785                 self.to_screen('[info] Video annotations are already present')
1786             elif not info_dict.get('annotations'):
1787                 self.report_warning('There are no annotations to write.')
1788             else:
1789                 try:
1790                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1791                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1792                         annofile.write(info_dict['annotations'])
1793                 except (KeyError, TypeError):
1794                     self.report_warning('There are no annotations to write.')
1795                 except (OSError, IOError):
1796                     self.report_error('Cannot write annotations file: ' + annofn)
1797                     return
1798
1799         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1800                                        self.params.get('writeautomaticsub')])
1801
1802         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1803             # subtitles download errors are already managed as troubles in relevant IE
1804             # that way it will silently go on when used with unsupporting IE
1805             subtitles = info_dict['requested_subtitles']
1806             ie = self.get_info_extractor(info_dict['extractor_key'])
1807             for sub_lang, sub_info in subtitles.items():
1808                 sub_format = sub_info['ext']
1809                 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1810                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1811                     self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
1812                 else:
1813                     self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1814                     if sub_info.get('data') is not None:
1815                         try:
1816                             # Use newline='' to prevent conversion of newline characters
1817                             # See https://github.com/ytdl-org/youtube-dl/issues/10268
1818                             with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1819                                 subfile.write(sub_info['data'])
1820                         except (OSError, IOError):
1821                             self.report_error('Cannot write subtitles file ' + sub_filename)
1822                             return
1823                     else:
1824                         try:
1825                             sub_data = ie._request_webpage(
1826                                 sub_info['url'], info_dict['id'], note=False).read()
1827                             with io.open(encodeFilename(sub_filename), 'wb') as subfile:
1828                                 subfile.write(sub_data)
1829                         except (ExtractorError, IOError, OSError, ValueError) as err:
1830                             self.report_warning('Unable to download subtitle for "%s": %s' %
1831                                                 (sub_lang, error_to_compat_str(err)))
1832                             continue
1833
1834         if self.params.get('writeinfojson', False):
1835             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1836             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1837                 self.to_screen('[info] Video description metadata is already present')
1838             else:
1839                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1840                 try:
1841                     write_json_file(self.filter_requested_info(info_dict), infofn)
1842                 except (OSError, IOError):
1843                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1844                     return
1845
1846         self._write_thumbnails(info_dict, filename)
1847
1848         if not self.params.get('skip_download', False):
1849             try:
1850                 def dl(name, info):
1851                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1852                     for ph in self._progress_hooks:
1853                         fd.add_progress_hook(ph)
1854                     if self.params.get('verbose'):
1855                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1856                     return fd.download(name, info)
1857
1858                 if info_dict.get('requested_formats') is not None:
1859                     downloaded = []
1860                     success = True
1861                     merger = FFmpegMergerPP(self)
1862                     if not merger.available:
1863                         postprocessors = []
1864                         self.report_warning('You have requested multiple '
1865                                             'formats but ffmpeg or avconv are not installed.'
1866                                             ' The formats won\'t be merged.')
1867                     else:
1868                         postprocessors = [merger]
1869
1870                     def compatible_formats(formats):
1871                         video, audio = formats
1872                         # Check extension
1873                         video_ext, audio_ext = video.get('ext'), audio.get('ext')
1874                         if video_ext and audio_ext:
1875                             COMPATIBLE_EXTS = (
1876                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1877                                 ('webm')
1878                             )
1879                             for exts in COMPATIBLE_EXTS:
1880                                 if video_ext in exts and audio_ext in exts:
1881                                     return True
1882                         # TODO: Check acodec/vcodec
1883                         return False
1884
1885                     filename_real_ext = os.path.splitext(filename)[1][1:]
1886                     filename_wo_ext = (
1887                         os.path.splitext(filename)[0]
1888                         if filename_real_ext == info_dict['ext']
1889                         else filename)
1890                     requested_formats = info_dict['requested_formats']
1891                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1892                         info_dict['ext'] = 'mkv'
1893                         self.report_warning(
1894                             'Requested formats are incompatible for merge and will be merged into mkv.')
1895                     # Ensure filename always has a correct extension for successful merge
1896                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1897                     if os.path.exists(encodeFilename(filename)):
1898                         self.to_screen(
1899                             '[download] %s has already been downloaded and '
1900                             'merged' % filename)
1901                     else:
1902                         for f in requested_formats:
1903                             new_info = dict(info_dict)
1904                             new_info.update(f)
1905                             fname = prepend_extension(
1906                                 self.prepare_filename(new_info),
1907                                 'f%s' % f['format_id'], new_info['ext'])
1908                             if not ensure_dir_exists(fname):
1909                                 return
1910                             downloaded.append(fname)
1911                             partial_success = dl(fname, new_info)
1912                             success = success and partial_success
1913                         info_dict['__postprocessors'] = postprocessors
1914                         info_dict['__files_to_merge'] = downloaded
1915                 else:
1916                     # Just a single file
1917                     success = dl(filename, info_dict)
1918             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1919                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1920                 return
1921             except (OSError, IOError) as err:
1922                 raise UnavailableVideoError(err)
1923             except (ContentTooShortError, ) as err:
1924                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1925                 return
1926
1927             if success and filename != '-':
1928                 # Fixup content
1929                 fixup_policy = self.params.get('fixup')
1930                 if fixup_policy is None:
1931                     fixup_policy = 'detect_or_warn'
1932
1933                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1934
1935                 stretched_ratio = info_dict.get('stretched_ratio')
1936                 if stretched_ratio is not None and stretched_ratio != 1:
1937                     if fixup_policy == 'warn':
1938                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1939                             info_dict['id'], stretched_ratio))
1940                     elif fixup_policy == 'detect_or_warn':
1941                         stretched_pp = FFmpegFixupStretchedPP(self)
1942                         if stretched_pp.available:
1943                             info_dict.setdefault('__postprocessors', [])
1944                             info_dict['__postprocessors'].append(stretched_pp)
1945                         else:
1946                             self.report_warning(
1947                                 '%s: Non-uniform pixel ratio (%s). %s'
1948                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1949                     else:
1950                         assert fixup_policy in ('ignore', 'never')
1951
1952                 if (info_dict.get('requested_formats') is None
1953                         and info_dict.get('container') == 'm4a_dash'):
1954                     if fixup_policy == 'warn':
1955                         self.report_warning(
1956                             '%s: writing DASH m4a. '
1957                             'Only some players support this container.'
1958                             % info_dict['id'])
1959                     elif fixup_policy == 'detect_or_warn':
1960                         fixup_pp = FFmpegFixupM4aPP(self)
1961                         if fixup_pp.available:
1962                             info_dict.setdefault('__postprocessors', [])
1963                             info_dict['__postprocessors'].append(fixup_pp)
1964                         else:
1965                             self.report_warning(
1966                                 '%s: writing DASH m4a. '
1967                                 'Only some players support this container. %s'
1968                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1969                     else:
1970                         assert fixup_policy in ('ignore', 'never')
1971
1972                 if (info_dict.get('protocol') == 'm3u8_native'
1973                         or info_dict.get('protocol') == 'm3u8'
1974                         and self.params.get('hls_prefer_native')):
1975                     if fixup_policy == 'warn':
1976                         self.report_warning('%s: malformed AAC bitstream detected.' % (
1977                             info_dict['id']))
1978                     elif fixup_policy == 'detect_or_warn':
1979                         fixup_pp = FFmpegFixupM3u8PP(self)
1980                         if fixup_pp.available:
1981                             info_dict.setdefault('__postprocessors', [])
1982                             info_dict['__postprocessors'].append(fixup_pp)
1983                         else:
1984                             self.report_warning(
1985                                 '%s: malformed AAC bitstream detected. %s'
1986                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1987                     else:
1988                         assert fixup_policy in ('ignore', 'never')
1989
1990                 try:
1991                     self.post_process(filename, info_dict)
1992                 except (PostProcessingError) as err:
1993                     self.report_error('postprocessing: %s' % str(err))
1994                     return
1995                 self.record_download_archive(info_dict)
1996
1997     def download(self, url_list):
1998         """Download a given list of URLs."""
1999         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
2000         if (len(url_list) > 1
2001                 and outtmpl != '-'
2002                 and '%' not in outtmpl
2003                 and self.params.get('max_downloads') != 1):
2004             raise SameFileError(outtmpl)
2005
2006         for url in url_list:
2007             try:
2008                 # It also downloads the videos
2009                 res = self.extract_info(
2010                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
2011             except UnavailableVideoError:
2012                 self.report_error('unable to download video')
2013             except MaxDownloadsReached:
2014                 self.to_screen('[info] Maximum number of downloaded files reached.')
2015                 raise
2016             else:
2017                 if self.params.get('dump_single_json', False):
2018                     self.to_stdout(json.dumps(res))
2019
2020         return self._download_retcode
2021
2022     def download_with_info_file(self, info_filename):
2023         with contextlib.closing(fileinput.FileInput(
2024                 [info_filename], mode='r',
2025                 openhook=fileinput.hook_encoded('utf-8'))) as f:
2026             # FileInput doesn't have a read method, we can't call json.load
2027             info = self.filter_requested_info(json.loads('\n'.join(f)))
2028         try:
2029             self.process_ie_result(info, download=True)
2030         except DownloadError:
2031             webpage_url = info.get('webpage_url')
2032             if webpage_url is not None:
2033                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2034                 return self.download([webpage_url])
2035             else:
2036                 raise
2037         return self._download_retcode
2038
2039     @staticmethod
2040     def filter_requested_info(info_dict):
2041         return dict(
2042             (k, v) for k, v in info_dict.items()
2043             if k not in ['requested_formats', 'requested_subtitles'])
2044
2045     def post_process(self, filename, ie_info):
2046         """Run all the postprocessors on the given file."""
2047         info = dict(ie_info)
2048         info['filepath'] = filename
2049         pps_chain = []
2050         if ie_info.get('__postprocessors') is not None:
2051             pps_chain.extend(ie_info['__postprocessors'])
2052         pps_chain.extend(self._pps)
2053         for pp in pps_chain:
2054             files_to_delete = []
2055             try:
2056                 files_to_delete, info = pp.run(info)
2057             except PostProcessingError as e:
2058                 self.report_error(e.msg)
2059             if files_to_delete and not self.params.get('keepvideo', False):
2060                 for old_filename in files_to_delete:
2061                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2062                     try:
2063                         os.remove(encodeFilename(old_filename))
2064                     except (IOError, OSError):
2065                         self.report_warning('Unable to remove downloaded original file')
2066
2067     def _make_archive_id(self, info_dict):
2068         video_id = info_dict.get('id')
2069         if not video_id:
2070             return
2071         # Future-proof against any change in case
2072         # and backwards compatibility with prior versions
2073         extractor = info_dict.get('extractor_key') or info_dict.get('ie_key')  # key in a playlist
2074         if extractor is None:
2075             url = str_or_none(info_dict.get('url'))
2076             if not url:
2077                 return
2078             # Try to find matching extractor for the URL and take its ie_key
2079             for ie in self._ies:
2080                 if ie.suitable(url):
2081                     extractor = ie.ie_key()
2082                     break
2083             else:
2084                 return
2085         return extractor.lower() + ' ' + video_id
2086
2087     def in_download_archive(self, info_dict):
2088         fn = self.params.get('download_archive')
2089         if fn is None:
2090             return False
2091
2092         vid_id = self._make_archive_id(info_dict)
2093         if not vid_id:
2094             return False  # Incomplete video information
2095
2096         try:
2097             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
2098                 for line in archive_file:
2099                     if line.strip() == vid_id:
2100                         return True
2101         except IOError as ioe:
2102             if ioe.errno != errno.ENOENT:
2103                 raise
2104         return False
2105
2106     def record_download_archive(self, info_dict):
2107         fn = self.params.get('download_archive')
2108         if fn is None:
2109             return
2110         vid_id = self._make_archive_id(info_dict)
2111         assert vid_id
2112         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2113             archive_file.write(vid_id + '\n')
2114
2115     @staticmethod
2116     def format_resolution(format, default='unknown'):
2117         if format.get('vcodec') == 'none':
2118             return 'audio only'
2119         if format.get('resolution') is not None:
2120             return format['resolution']
2121         if format.get('height') is not None:
2122             if format.get('width') is not None:
2123                 res = '%sx%s' % (format['width'], format['height'])
2124             else:
2125                 res = '%sp' % format['height']
2126         elif format.get('width') is not None:
2127             res = '%dx?' % format['width']
2128         else:
2129             res = default
2130         return res
2131
2132     def _format_note(self, fdict):
2133         res = ''
2134         if fdict.get('ext') in ['f4f', 'f4m']:
2135             res += '(unsupported) '
2136         if fdict.get('language'):
2137             if res:
2138                 res += ' '
2139             res += '[%s] ' % fdict['language']
2140         if fdict.get('format_note') is not None:
2141             res += fdict['format_note'] + ' '
2142         if fdict.get('tbr') is not None:
2143             res += '%4dk ' % fdict['tbr']
2144         if fdict.get('container') is not None:
2145             if res:
2146                 res += ', '
2147             res += '%s container' % fdict['container']
2148         if (fdict.get('vcodec') is not None
2149                 and fdict.get('vcodec') != 'none'):
2150             if res:
2151                 res += ', '
2152             res += fdict['vcodec']
2153             if fdict.get('vbr') is not None:
2154                 res += '@'
2155         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
2156             res += 'video@'
2157         if fdict.get('vbr') is not None:
2158             res += '%4dk' % fdict['vbr']
2159         if fdict.get('fps') is not None:
2160             if res:
2161                 res += ', '
2162             res += '%sfps' % fdict['fps']
2163         if fdict.get('acodec') is not None:
2164             if res:
2165                 res += ', '
2166             if fdict['acodec'] == 'none':
2167                 res += 'video only'
2168             else:
2169                 res += '%-5s' % fdict['acodec']
2170         elif fdict.get('abr') is not None:
2171             if res:
2172                 res += ', '
2173             res += 'audio'
2174         if fdict.get('abr') is not None:
2175             res += '@%3dk' % fdict['abr']
2176         if fdict.get('asr') is not None:
2177             res += ' (%5dHz)' % fdict['asr']
2178         if fdict.get('filesize') is not None:
2179             if res:
2180                 res += ', '
2181             res += format_bytes(fdict['filesize'])
2182         elif fdict.get('filesize_approx') is not None:
2183             if res:
2184                 res += ', '
2185             res += '~' + format_bytes(fdict['filesize_approx'])
2186         return res
2187
2188     def list_formats(self, info_dict):
2189         formats = info_dict.get('formats', [info_dict])
2190         table = [
2191             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2192             for f in formats
2193             if f.get('preference') is None or f['preference'] >= -1000]
2194         if len(formats) > 1:
2195             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2196
2197         header_line = ['format code', 'extension', 'resolution', 'note']
2198         self.to_screen(
2199             '[info] Available formats for %s:\n%s' %
2200             (info_dict['id'], render_table(header_line, table)))
2201
2202     def list_thumbnails(self, info_dict):
2203         thumbnails = info_dict.get('thumbnails')
2204         if not thumbnails:
2205             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2206             return
2207
2208         self.to_screen(
2209             '[info] Thumbnails for %s:' % info_dict['id'])
2210         self.to_screen(render_table(
2211             ['ID', 'width', 'height', 'URL'],
2212             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2213
2214     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2215         if not subtitles:
2216             self.to_screen('%s has no %s' % (video_id, name))
2217             return
2218         self.to_screen(
2219             'Available %s for %s:' % (name, video_id))
2220         self.to_screen(render_table(
2221             ['Language', 'formats'],
2222             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2223                 for lang, formats in subtitles.items()]))
2224
2225     def urlopen(self, req):
2226         """ Start an HTTP download """
2227         if isinstance(req, compat_basestring):
2228             req = sanitized_Request(req)
2229         return self._opener.open(req, timeout=self._socket_timeout)
2230
    def print_debug_header(self):
        """Write the '[debug]' diagnostic header when running verbosely.

        Returns immediately unless the 'verbose' param is truthy. Otherwise
        it writes, in this order: the encodings in use, the youtube-dl
        version, the lazy-loader notice, the git HEAD of the source directory
        (when it can be determined), the Python version and platform, the
        versions of the external executables, and the proxy map. With the
        'call_home' param set it also fetches and reports the public IP
        address and warns when a newer version is available.
        """
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # sys.stdout may be replaced by an object without an 'encoding'
        # attribute; report the type of that object instead in that case
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        # This line goes through the module-level write_string with
        # encoding=None rather than through self._write_string like the rest
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        # Best effort: ask git for the short HEAD hash of the directory that
        # holds this file; any failure (git missing, not a checkout, ...) is
        # deliberately ignored
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            # Only print output that starts like a hexadecimal hash
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            # sys.exc_clear() only exists on Python 2; the nested guard
            # swallows the AttributeError raised on Python 3
            try:
                sys.exc_clear()
            except Exception:
                pass

        def python_implementation():
            # Name of the running implementation; for PyPy the PyPy version
            # is appended when sys exposes it
            impl_name = platform.python_implementation()
            if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
                return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
            return impl_name

        self._write_string('[debug] Python version %s (%s) - %s\n' % (
            platform.python_version(), python_implementation(),
            platform_name()))

        # Collect the versions of the external programs; entries with a falsy
        # version are left out of the printed list
        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_versions['phantomjs'] = PhantomJSwrapper._version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Merge the proxies of every opener handler that exposes a 'proxies'
        # attribute
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # These are network requests; errors raised by urlopen are not
            # handled here and propagate to the caller
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2306
2307     def _setup_opener(self):
2308         timeout_val = self.params.get('socket_timeout')
2309         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2310
2311         opts_cookiefile = self.params.get('cookiefile')
2312         opts_proxy = self.params.get('proxy')
2313
2314         if opts_cookiefile is None:
2315             self.cookiejar = compat_cookiejar.CookieJar()
2316         else:
2317             opts_cookiefile = expand_path(opts_cookiefile)
2318             self.cookiejar = YoutubeDLCookieJar(opts_cookiefile)
2319             if os.access(opts_cookiefile, os.R_OK):
2320                 self.cookiejar.load(ignore_discard=True, ignore_expires=True)
2321
2322         cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2323         if opts_proxy is not None:
2324             if opts_proxy == '':
2325                 proxies = {}
2326             else:
2327                 proxies = {'http': opts_proxy, 'https': opts_proxy}
2328         else:
2329             proxies = compat_urllib_request.getproxies()
2330             # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
2331             if 'http' in proxies and 'https' not in proxies:
2332                 proxies['https'] = proxies['http']
2333         proxy_handler = PerRequestProxyHandler(proxies)
2334
2335         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2336         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2337         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2338         data_handler = compat_urllib_request_DataHandler()
2339
2340         # When passing our own FileHandler instance, build_opener won't add the
2341         # default FileHandler and allows us to disable the file protocol, which
2342         # can be used for malicious purposes (see
2343         # https://github.com/ytdl-org/youtube-dl/issues/8227)
2344         file_handler = compat_urllib_request.FileHandler()
2345
2346         def file_open(*args, **kwargs):
2347             raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
2348         file_handler.file_open = file_open
2349
2350         opener = compat_urllib_request.build_opener(
2351             proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
2352
2353         # Delete the default user-agent header, which would otherwise apply in
2354         # cases where our custom HTTP handler doesn't come into play
2355         # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
2356         opener.addheaders = []
2357         self._opener = opener
2358
2359     def encode(self, s):
2360         if isinstance(s, bytes):
2361             return s  # Already encoded
2362
2363         try:
2364             return s.encode(self.get_encoding())
2365         except UnicodeEncodeError as err:
2366             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2367             raise
2368
2369     def get_encoding(self):
2370         encoding = self.params.get('encoding')
2371         if encoding is None:
2372             encoding = preferredencoding()
2373         return encoding
2374
2375     def _write_thumbnails(self, info_dict, filename):
2376         if self.params.get('writethumbnail', False):
2377             thumbnails = info_dict.get('thumbnails')
2378             if thumbnails:
2379                 thumbnails = [thumbnails[-1]]
2380         elif self.params.get('write_all_thumbnails', False):
2381             thumbnails = info_dict.get('thumbnails')
2382         else:
2383             return
2384
2385         if not thumbnails:
2386             # No thumbnails present, so return immediately
2387             return
2388
2389         for t in thumbnails:
2390             thumb_ext = determine_ext(t['url'], 'jpg')
2391             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2392             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2393             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2394
2395             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2396                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2397                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2398             else:
2399                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2400                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2401                 try:
2402                     uf = self.urlopen(t['url'])
2403                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2404                         shutil.copyfileobj(uf, thumbf)
2405                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2406                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2407                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2408                     self.report_warning('Unable to download thumbnail "%s": %s' %
2409                                         (t['url'], error_to_compat_str(err)))