Document http_chunk_size
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from string import ascii_letters
30
31 from .compat import (
32     compat_basestring,
33     compat_cookiejar,
34     compat_get_terminal_size,
35     compat_http_client,
36     compat_kwargs,
37     compat_numeric_types,
38     compat_os_name,
39     compat_str,
40     compat_tokenize_tokenize,
41     compat_urllib_error,
42     compat_urllib_request,
43     compat_urllib_request_DataHandler,
44 )
45 from .utils import (
46     age_restricted,
47     args_to_str,
48     ContentTooShortError,
49     date_from_str,
50     DateRange,
51     DEFAULT_OUTTMPL,
52     determine_ext,
53     determine_protocol,
54     DownloadError,
55     encode_compat_str,
56     encodeFilename,
57     error_to_compat_str,
58     expand_path,
59     ExtractorError,
60     format_bytes,
61     formatSeconds,
62     GeoRestrictedError,
63     int_or_none,
64     ISO3166Utils,
65     locked_file,
66     make_HTTPS_handler,
67     MaxDownloadsReached,
68     orderedSet,
69     PagedList,
70     parse_filesize,
71     PerRequestProxyHandler,
72     platform_name,
73     PostProcessingError,
74     preferredencoding,
75     prepend_extension,
76     register_socks_protocols,
77     render_table,
78     replace_extension,
79     SameFileError,
80     sanitize_filename,
81     sanitize_path,
82     sanitize_url,
83     sanitized_Request,
84     std_headers,
85     subtitles_filename,
86     UnavailableVideoError,
87     url_basename,
88     version_tuple,
89     write_json_file,
90     write_string,
91     YoutubeDLCookieProcessor,
92     YoutubeDLHandler,
93 )
94 from .cache import Cache
95 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
96 from .extractor.openload import PhantomJSwrapper
97 from .downloader import get_suitable_downloader
98 from .downloader.rtmp import rtmpdump_version
99 from .postprocessor import (
100     FFmpegFixupM3u8PP,
101     FFmpegFixupM4aPP,
102     FFmpegFixupStretchedPP,
103     FFmpegMergerPP,
104     FFmpegPostProcessor,
105     get_postprocessor,
106 )
107 from .version import __version__
108
109 if compat_os_name == 'nt':
110     import ctypes
111
112
113 class YoutubeDL(object):
114     """YoutubeDL class.
115
116     YoutubeDL objects are the ones responsible of downloading the
117     actual video file and writing it to disk if the user has requested
118     it, among some other tasks. In most cases there should be one per
119     program. As, given a video URL, the downloader doesn't know how to
120     extract all the needed information, task that InfoExtractors do, it
121     has to pass the URL to one of them.
122
123     For this, YoutubeDL objects have a method that allows
124     InfoExtractors to be registered in a given order. When it is passed
125     a URL, the YoutubeDL object hands it to the first InfoExtractor it
126     finds that reports being able to handle it. The InfoExtractor extracts
127     all the information about the video or videos the URL refers to, and
128     YoutubeDL process the extracted information, possibly using a File
129     Downloader to download the video.
130
131     YoutubeDL objects accept a lot of parameters. In order not to saturate
132     the object constructor with arguments, it receives a dictionary of
133     options instead. These options are available through the params
134     attribute for the InfoExtractors to use. The YoutubeDL also
135     registers itself as the downloader in charge for the InfoExtractors
136     that are added to it, so this is a "mutual registration".
137
138     Available options:
139
140     username:          Username for authentication purposes.
141     password:          Password for authentication purposes.
142     videopassword:     Password for accessing a video.
143     ap_mso:            Adobe Pass multiple-system operator identifier.
144     ap_username:       Multiple-system operator account username.
145     ap_password:       Multiple-system operator account password.
146     usenetrc:          Use netrc for authentication instead.
147     verbose:           Print additional info to stdout.
148     quiet:             Do not print messages to stdout.
149     no_warnings:       Do not print out anything for warnings.
150     forceurl:          Force printing final URL.
151     forcetitle:        Force printing title.
152     forceid:           Force printing ID.
153     forcethumbnail:    Force printing thumbnail URL.
154     forcedescription:  Force printing description.
155     forcefilename:     Force printing final filename.
156     forceduration:     Force printing duration.
157     forcejson:         Force printing info_dict as JSON.
158     dump_single_json:  Force printing the info_dict of the whole playlist
159                        (or video) as a single JSON line.
160     simulate:          Do not download the video files.
161     format:            Video format code. See options.py for more information.
162     outtmpl:           Template for output names.
163     restrictfilenames: Do not allow "&" and spaces in file names
164     ignoreerrors:      Do not stop on download errors.
165     force_generic_extractor: Force downloader to use the generic extractor
166     nooverwrites:      Prevent overwriting files.
167     playliststart:     Playlist item to start at.
168     playlistend:       Playlist item to end at.
169     playlist_items:    Specific indices of playlist to download.
170     playlistreverse:   Download playlist items in reverse order.
171     playlistrandom:    Download playlist items in random order.
172     matchtitle:        Download only matching titles.
173     rejecttitle:       Reject downloads for matching titles.
174     logger:            Log messages to a logging.Logger instance.
175     logtostderr:       Log messages to stderr instead of stdout.
176     writedescription:  Write the video description to a .description file
177     writeinfojson:     Write the video description to a .info.json file
178     writeannotations:  Write the video annotations to a .annotations.xml file
179     writethumbnail:    Write the thumbnail image to a file
180     write_all_thumbnails:  Write all thumbnail formats to files
181     writesubtitles:    Write the video subtitles to a file
182     writeautomaticsub: Write the automatically generated subtitles to a file
183     allsubtitles:      Downloads all the subtitles of the video
184                        (requires writesubtitles or writeautomaticsub)
185     listsubtitles:     Lists all available subtitles for the video
186     subtitlesformat:   The format code for subtitles
187     subtitleslangs:    List of languages of the subtitles to download
188     keepvideo:         Keep the video file after post-processing
189     daterange:         A DateRange object, download only if the upload_date is in the range.
190     skip_download:     Skip the actual download of the video file
191     cachedir:          Location of the cache files in the filesystem.
192                        False to disable filesystem cache.
193     noplaylist:        Download single video instead of a playlist if in doubt.
194     age_limit:         An integer representing the user's age in years.
195                        Unsuitable videos for the given age are skipped.
196     min_views:         An integer representing the minimum view count the video
197                        must have in order to not be skipped.
198                        Videos without view count information are always
199                        downloaded. None for no limit.
200     max_views:         An integer representing the maximum view count.
201                        Videos that are more popular than that are not
202                        downloaded.
203                        Videos without view count information are always
204                        downloaded. None for no limit.
205     download_archive:  File name of a file where all downloads are recorded.
206                        Videos already present in the file are not downloaded
207                        again.
208     cookiefile:        File name where cookies should be read from and dumped to.
209     nocheckcertificate:Do not verify SSL certificates
210     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
211                        At the moment, this is only supported by YouTube.
212     proxy:             URL of the proxy server to use
213     geo_verification_proxy:  URL of the proxy to use for IP address verification
214                        on geo-restricted sites. (Experimental)
215     socket_timeout:    Time to wait for unresponsive hosts, in seconds
216     bidi_workaround:   Work around buggy terminals without bidirectional text
217                        support, using fribidi
218     debug_printtraffic:Print out sent and received HTTP traffic
219     include_ads:       Download ads as well
220     default_search:    Prepend this string if an input url is not valid.
221                        'auto' for elaborate guessing
222     encoding:          Use this encoding instead of the system-specified.
223     extract_flat:      Do not resolve URLs, return the immediate result.
224                        Pass in 'in_playlist' to only show this behavior for
225                        playlist items.
226     postprocessors:    A list of dictionaries, each with an entry
227                        * key:  The name of the postprocessor. See
228                                youtube_dl/postprocessor/__init__.py for a list.
229                        as well as any further keyword arguments for the
230                        postprocessor.
231     progress_hooks:    A list of functions that get called on download
232                        progress, with a dictionary with the entries
233                        * status: One of "downloading", "error", or "finished".
234                                  Check this first and ignore unknown values.
235
236                        If status is one of "downloading", or "finished", the
237                        following properties may also be present:
238                        * filename: The final filename (always present)
239                        * tmpfilename: The filename we're currently writing to
240                        * downloaded_bytes: Bytes on disk
241                        * total_bytes: Size of the whole file, None if unknown
242                        * total_bytes_estimate: Guess of the eventual file size,
243                                                None if unavailable.
244                        * elapsed: The number of seconds since download started.
245                        * eta: The estimated time in seconds, None if unknown
246                        * speed: The download speed in bytes/second, None if
247                                 unknown
248                        * fragment_index: The counter of the currently
249                                          downloaded video fragment.
250                        * fragment_count: The number of fragments (= individual
251                                          files that will be merged)
252
253                        Progress hooks are guaranteed to be called at least once
254                        (with status "finished") if the download is successful.
255     merge_output_format: Extension to use when merging formats.
256     fixup:             Automatically correct known faults of the file.
257                        One of:
258                        - "never": do nothing
259                        - "warn": only emit a warning
260                        - "detect_or_warn": check whether we can do anything
261                                            about it, warn otherwise (default)
262     source_address:    (Experimental) Client-side IP address to bind to.
263     call_home:         Boolean, true iff we are allowed to contact the
264                        youtube-dl servers for debugging.
265     sleep_interval:    Number of seconds to sleep before each download when
266                        used alone or a lower bound of a range for randomized
267                        sleep before each download (minimum possible number
268                        of seconds to sleep) when used along with
269                        max_sleep_interval.
270     max_sleep_interval:Upper bound of a range for randomized sleep before each
271                        download (maximum possible number of seconds to sleep).
272                        Must only be used along with sleep_interval.
273                        Actual sleep time will be a random float from range
274                        [sleep_interval; max_sleep_interval].
275     listformats:       Print an overview of available video formats and exit.
276     list_thumbnails:   Print a table of all thumbnails and exit.
277     match_filter:      A function that gets called with the info_dict of
278                        every video.
279                        If it returns a message, the video is ignored.
280                        If it returns None, the video is downloaded.
281                        match_filter_func in utils.py is one example for this.
282     no_color:          Do not emit color codes in output.
283     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
284                        HTTP header (experimental)
285     geo_bypass_country:
286                        Two-letter ISO 3166-1 alpha-2 country code that will be used for
287                        explicit geographic restriction bypassing via faking
288                        X-Forwarded-For HTTP header (experimental)
289
290     The following options determine which downloader is picked:
291     external_downloader: Executable of the external downloader to call.
292                        None or unset for standard (built-in) downloader.
293     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
294                        if True, otherwise use ffmpeg/avconv if False, otherwise
295                        use downloader suggested by extractor if None.
296
297     The following parameters are not used by YoutubeDL itself, they are used by
298     the downloader (see youtube_dl/downloader/common.py):
299     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
300     noresizebuffer, retries, continuedl, noprogress, consoletitle,
301     xattr_set_filesize, external_downloader_args, hls_use_mpegts,
302     http_chunk_size.
303
304     The following options are used by the post processors:
305     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
306                        otherwise prefer avconv.
307     postprocessor_args: A list of additional command-line arguments for the
308                         postprocessor.
309
310     The following options are used by the Youtube extractor:
311     youtube_include_dash_manifest: If True (default), DASH manifests and related
312                         data will be downloaded and processed by extractor.
313                         You can reduce network I/O by disabling it if you don't
314                         care about DASH.
315     """
316
    # Info-dict fields known to hold numeric values. prepare_filename uses
    # this set to patch numeric presentation types (e.g. %(height)d) in the
    # output template when the field is missing, since missing fields are
    # substituted with the string 'NA'.
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level placeholders; the real values are assigned per instance
    # in __init__.
    params = None  # dict of options (see the class docstring)
    _ies = []  # registered InfoExtractor classes/instances
    _pps = []  # registered PostProcessor instances
    _download_retcode = None  # retcode accumulated across downloads
    _num_downloads = None  # number of files downloaded so far
    _screen_file = None  # stream used for normal (non-error) output
334
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params: dict of options (see the class docstring); merged over a
            small set of defaults, the dict itself is not kept.
        auto_init: if True, print the debug header and register the
            default info extractors immediately.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Screen output goes to stderr instead of stdout when logtostderr is set
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        def check_deprecated(param, option, suggestion):
            # Warn if a deprecated option was supplied; returns True if it was.
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        # Map the deprecated cn_verification_proxy onto geo_verification_proxy
        # unless the latter was given explicitly.
        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            try:
                import pty
                # Route screen output through an external bidi filter
                # (bidiv, falling back to fribidi) attached to a pty whose
                # width matches the terminal.
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    # bidiv is not installed; try fribidi instead
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate the configured postprocessors; every dict entry other
        # than 'key' is passed through as a keyword argument.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
428
429     def warn_if_short_id(self, argv):
430         # short YouTube ID starting with dash?
431         idxs = [
432             i for i, a in enumerate(argv)
433             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
434         if idxs:
435             correct_argv = (
436                 ['youtube-dl'] +
437                 [a for i, a in enumerate(argv) if i not in idxs] +
438                 ['--'] + [argv[i] for i in idxs]
439             )
440             self.report_warning(
441                 'Long argument string detected. '
442                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
443                 args_to_str(correct_argv))
444
445     def add_info_extractor(self, ie):
446         """Add an InfoExtractor object to the end of the list."""
447         self._ies.append(ie)
448         if not isinstance(ie, type):
449             self._ies_instances[ie.ie_key()] = ie
450             ie.set_downloader(self)
451
452     def get_info_extractor(self, ie_key):
453         """
454         Get an instance of an IE with name ie_key, it will try to get one from
455         the _ies list, if there's no instance it will create a new one and add
456         it to the extractor list.
457         """
458         ie = self._ies_instances.get(ie_key)
459         if ie is None:
460             ie = get_info_extractor(ie_key)()
461             self.add_info_extractor(ie)
462         return ie
463
464     def add_default_info_extractors(self):
465         """
466         Add the InfoExtractors returned by gen_extractors to the end of the list
467         """
468         for ie in gen_extractor_classes():
469             self.add_info_extractor(ie)
470
471     def add_post_processor(self, pp):
472         """Add a PostProcessor object to the end of the chain."""
473         self._pps.append(pp)
474         pp.set_downloader(self)
475
476     def add_progress_hook(self, ph):
477         """Add the progress hook (currently only for the file downloader)"""
478         self._progress_hooks.append(ph)
479
480     def _bidi_workaround(self, message):
481         if not hasattr(self, '_output_channel'):
482             return message
483
484         assert hasattr(self, '_output_process')
485         assert isinstance(message, compat_str)
486         line_count = message.count('\n') + 1
487         self._output_process.stdin.write((message + '\n').encode('utf-8'))
488         self._output_process.stdin.flush()
489         res = ''.join(self._output_channel.readline().decode('utf-8')
490                       for _ in range(line_count))
491         return res[:-len('\n')]
492
493     def to_screen(self, message, skip_eol=False):
494         """Print message to stdout if not in quiet mode."""
495         return self.to_stdout(message, skip_eol, check_quiet=True)
496
497     def _write_string(self, s, out=None):
498         write_string(s, out=out, encoding=self.params.get('encoding'))
499
500     def to_stdout(self, message, skip_eol=False, check_quiet=False):
501         """Print message to stdout if not in quiet mode."""
502         if self.params.get('logger'):
503             self.params['logger'].debug(message)
504         elif not check_quiet or not self.params.get('quiet', False):
505             message = self._bidi_workaround(message)
506             terminator = ['\n', ''][skip_eol]
507             output = message + terminator
508
509             self._write_string(output, self._screen_file)
510
511     def to_stderr(self, message):
512         """Print message to stderr."""
513         assert isinstance(message, compat_str)
514         if self.params.get('logger'):
515             self.params['logger'].error(message)
516         else:
517             message = self._bidi_workaround(message)
518             output = message + '\n'
519             self._write_string(output, self._err_file)
520
521     def to_console_title(self, message):
522         if not self.params.get('consoletitle', False):
523             return
524         if compat_os_name == 'nt':
525             if ctypes.windll.kernel32.GetConsoleWindow():
526                 # c_wchar_p() might not be necessary if `message` is
527                 # already of type unicode()
528                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
529         elif 'TERM' in os.environ:
530             self._write_string('\033]0;%s\007' % message, self._screen_file)
531
532     def save_console_title(self):
533         if not self.params.get('consoletitle', False):
534             return
535         if compat_os_name != 'nt' and 'TERM' in os.environ:
536             # Save the title on stack
537             self._write_string('\033[22;0t', self._screen_file)
538
539     def restore_console_title(self):
540         if not self.params.get('consoletitle', False):
541             return
542         if compat_os_name != 'nt' and 'TERM' in os.environ:
543             # Restore the title from stack
544             self._write_string('\033[23;0t', self._screen_file)
545
546     def __enter__(self):
547         self.save_console_title()
548         return self
549
550     def __exit__(self, *args):
551         self.restore_console_title()
552
553         if self.params.get('cookiefile') is not None:
554             self.cookiejar.save()
555
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Prefer the original exception chained onto the current
                    # one via an exc_info attribute, when present
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: dump the current call stack
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, keeping the most specific
            # underlying exc_info available
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1
585
586     def report_warning(self, message):
587         '''
588         Print the message to stderr, it will be prefixed with 'WARNING:'
589         If stderr is a tty file the 'WARNING:' will be colored
590         '''
591         if self.params.get('logger') is not None:
592             self.params['logger'].warning(message)
593         else:
594             if self.params.get('no_warnings'):
595                 return
596             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
597                 _msg_header = '\033[0;33mWARNING:\033[0m'
598             else:
599                 _msg_header = 'WARNING:'
600             warning_message = '%s %s' % (_msg_header, message)
601             self.to_stderr(warning_message)
602
603     def report_error(self, message, tb=None):
604         '''
605         Do the same as trouble, but prefixes the message with 'ERROR:', colored
606         in red if stderr is a tty file.
607         '''
608         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
609             _msg_header = '\033[0;31mERROR:\033[0m'
610         else:
611             _msg_header = 'ERROR:'
612         error_message = '%s %s' % (_msg_header, message)
613         self.trouble(error_message, tb)
614
615     def report_file_already_downloaded(self, file_name):
616         """Report file has already been fully downloaded."""
617         try:
618             self.to_screen('[download] %s has already been downloaded' % file_name)
619         except UnicodeEncodeError:
620             self.to_screen('[download] The file has already been downloaded')
621
622     def prepare_filename(self, info_dict):
623         """Generate the output filename."""
624         try:
625             template_dict = dict(info_dict)
626
627             template_dict['epoch'] = int(time.time())
628             autonumber_size = self.params.get('autonumber_size')
629             if autonumber_size is None:
630                 autonumber_size = 5
631             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
632             if template_dict.get('resolution') is None:
633                 if template_dict.get('width') and template_dict.get('height'):
634                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
635                 elif template_dict.get('height'):
636                     template_dict['resolution'] = '%sp' % template_dict['height']
637                 elif template_dict.get('width'):
638                     template_dict['resolution'] = '%dx?' % template_dict['width']
639
640             sanitize = lambda k, v: sanitize_filename(
641                 compat_str(v),
642                 restricted=self.params.get('restrictfilenames'),
643                 is_id=(k == 'id' or k.endswith('_id')))
644             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
645                                  for k, v in template_dict.items()
646                                  if v is not None and not isinstance(v, (list, tuple, dict)))
647             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
648
649             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
650
651             # For fields playlist_index and autonumber convert all occurrences
652             # of %(field)s to %(field)0Nd for backward compatibility
653             field_size_compat_map = {
654                 'playlist_index': len(str(template_dict['n_entries'])),
655                 'autonumber': autonumber_size,
656             }
657             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
658             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
659             if mobj:
660                 outtmpl = re.sub(
661                     FIELD_SIZE_COMPAT_RE,
662                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
663                     outtmpl)
664
665             # Missing numeric fields used together with integer presentation types
666             # in format specification will break the argument substitution since
667             # string 'NA' is returned for missing fields. We will patch output
668             # template for missing fields to meet string presentation type.
669             for numeric_field in self._NUMERIC_FIELDS:
670                 if numeric_field not in template_dict:
671                     # As of [1] format syntax is:
672                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
673                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
674                     FORMAT_RE = r'''(?x)
675                         (?<!%)
676                         %
677                         \({0}\)  # mapping key
678                         (?:[#0\-+ ]+)?  # conversion flags (optional)
679                         (?:\d+)?  # minimum field width (optional)
680                         (?:\.\d+)?  # precision (optional)
681                         [hlL]?  # length modifier (optional)
682                         [diouxXeEfFgGcrs%]  # conversion type
683                     '''
684                     outtmpl = re.sub(
685                         FORMAT_RE.format(numeric_field),
686                         r'%({0})s'.format(numeric_field), outtmpl)
687
688             # expand_path translates '%%' into '%' and '$$' into '$'
689             # correspondingly that is not what we want since we need to keep
690             # '%%' intact for template dict substitution step. Working around
691             # with boundary-alike separator hack.
692             sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
693             outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
694
695             # outtmpl should be expand_path'ed before template dict substitution
696             # because meta fields may contain env variables we don't want to
697             # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
698             # title "Hello $PATH", we don't want `$PATH` to be expanded.
699             filename = expand_path(outtmpl).replace(sep, '') % template_dict
700
701             # Temporary fix for #4787
702             # 'Treat' all problem characters by passing filename through preferredencoding
703             # to workaround encoding issues with subprocess on python2 @ Windows
704             if sys.version_info < (3, 0) and sys.platform == 'win32':
705                 filename = encodeFilename(filename, True).decode(preferredencoding())
706             return sanitize_path(filename)
707         except ValueError as err:
708             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
709             return None
710
711     def _match_entry(self, info_dict, incomplete):
712         """ Returns None iff the file should be downloaded """
713
714         video_title = info_dict.get('title', info_dict.get('id', 'video'))
715         if 'title' in info_dict:
716             # This can happen when we're just evaluating the playlist
717             title = info_dict['title']
718             matchtitle = self.params.get('matchtitle', False)
719             if matchtitle:
720                 if not re.search(matchtitle, title, re.IGNORECASE):
721                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
722             rejecttitle = self.params.get('rejecttitle', False)
723             if rejecttitle:
724                 if re.search(rejecttitle, title, re.IGNORECASE):
725                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
726         date = info_dict.get('upload_date')
727         if date is not None:
728             dateRange = self.params.get('daterange', DateRange())
729             if date not in dateRange:
730                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
731         view_count = info_dict.get('view_count')
732         if view_count is not None:
733             min_views = self.params.get('min_views')
734             if min_views is not None and view_count < min_views:
735                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
736             max_views = self.params.get('max_views')
737             if max_views is not None and view_count > max_views:
738                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
739         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
740             return 'Skipping "%s" because it is age restricted' % video_title
741         if self.in_download_archive(info_dict):
742             return '%s has already been recorded in archive' % video_title
743
744         if not incomplete:
745             match_filter = self.params.get('match_filter')
746             if match_filter is not None:
747                 ret = match_filter(info_dict)
748                 if ret is not None:
749                     return ret
750
751         return None
752
753     @staticmethod
754     def add_extra_info(info_dict, extra_info):
755         '''Set the keys from extra_info in info dict if they are missing'''
756         for key, value in extra_info.items():
757             info_dict.setdefault(key, value)
758
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True, force_generic_extractor=False):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result

        ie_key, if given, restricts extraction to that single extractor;
        otherwise every registered extractor is offered the URL in order.
        force_generic_extractor only takes effect when ie_key is not set.
        If process is False, the raw extractor result is returned without
        being resolved through process_ie_result().
        '''
        # NOTE(review): extra_info has a mutable default; within this method
        # it is only passed through to process_ie_result(), never mutated, so
        # the shared dict appears harmless — confirm before changing.

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        if ie_key:
            # Only consider the explicitly requested extractor.
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        for ie in ies:
            if not ie.suitable(url):
                continue

            # Re-resolve the extractor by key — presumably to get the instance
            # registered with this YoutubeDL; confirm against
            # get_info_extractor().
            ie = self.get_info_extractor(ie.ie_key())
            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                self.add_default_extra_info(ie_result, ie, url)
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except GeoRestrictedError as e:
                # Augment the message with the availability countries when the
                # extractor provides them.
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
                self.report_error(msg)
                break
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
                break
            except MaxDownloadsReached:
                # Must propagate so the overall download loop stops.
                raise
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # The loop finished without any extractor accepting the URL.
            self.report_error('no suitable InfoExtractor for URL %s' % url)
820
821     def add_default_extra_info(self, ie_result, ie, url):
822         self.add_extra_info(ie_result, {
823             'extractor': ie.IE_NAME,
824             'webpage_url': url,
825             'webpage_url_basename': url_basename(url),
826             'extractor_key': ie.ie_key(),
827         })
828
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.

        Dispatches on ie_result['_type'] (default 'video'): videos go to
        process_video_result(), 'url'/'url_transparent' results are
        re-extracted, 'playlist'/'multi_video' are expanded entry by entry,
        and the legacy 'compat_list' format is fixed up recursively.
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            extract_flat = self.params.get('extract_flat', False)
            # With extract_flat enabled (always, or 'in_playlist' while we are
            # inside a playlist) the URL result is returned unresolved.
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # extract_info may return None when ignoreerrors is enabled and
            # extraction failed with an error, don't crash and return early
            # in this case
            if not info:
                return info

            # Non-None fields from the embedding page override the extracted
            # ones, except for identity/dispatch fields which must come from
            # the actual (inner) video result.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Extracted info may not be a video result (i.e.
            # info.get('_type', 'video') != video) but rather an url or
            # url_transparent. In such cases outer metadata (from ie_result)
            # should be propagated to inner one (info). For this to happen
            # _type of info should be overridden with url_transparent. This
            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
            if new_result.get('_type') == 'url':
                new_result['_type'] = 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type in ('playlist', 'multi_video'):
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # playliststart is converted to a 0-based slice start here;
            # the user-facing option is 1-based.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            # --playlist-items: comma-separated 1-based indices and inclusive
            # ranges like '1-3,7'.
            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = orderedSet(iter_playlistitems(playlistitems_str))

            ie_entries = ie_result['entries']

            # Select the 1-based playlistitems from a fully materialized list;
            # negative indices count from the end, out-of-range items are
            # silently dropped.
            def make_playlistitems_entries(list_ie_entries):
                num_entries = len(list_ie_entries)
                return [
                    list_ie_entries[i - 1] for i in playlistitems
                    if -num_entries <= i - 1 < num_entries]

            def report_download(num_entries):
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, num_entries))

            # Three entry containers are supported: a plain list, a lazy
            # PagedList, and any other iterable (e.g. a generator).
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = make_playlistitems_entries(ie_entries)
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    # Fetch each requested item individually to avoid loading
                    # pages we do not need.
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                report_download(n_entries)
            else:  # iterable
                # A generic iterable cannot be sliced directly, so materialize
                # only the prefix that is actually needed.
                if playlistitems:
                    entries = make_playlistitems_entries(list(itertools.islice(
                        ie_entries, 0, max(playlistitems))))
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                report_download(n_entries)

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            if self.params.get('playlistrandom', False):
                random.shuffle(entries)

            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # This __x_forwarded_for_ip thing is a bit ugly but requires
                # minimal changes
                if x_forwarded_for:
                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_uploader': ie_result.get('uploader'),
                    'playlist_uploader_id': ie_result.get('uploader_id'),
                    # NOTE(review): with --playlist-items this is the position
                    # within the selection, not within the original playlist —
                    # confirm that is intended.
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            # Attach the provenance fields each entry of the legacy list is
            # missing, then process it like a normal result.
            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
1023
1024     def _build_format_filter(self, filter_spec):
1025         " Returns a function to filter the formats according to the filter_spec "
1026
1027         OPERATORS = {
1028             '<': operator.lt,
1029             '<=': operator.le,
1030             '>': operator.gt,
1031             '>=': operator.ge,
1032             '=': operator.eq,
1033             '!=': operator.ne,
1034         }
1035         operator_rex = re.compile(r'''(?x)\s*
1036             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
1037             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1038             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1039             $
1040             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1041         m = operator_rex.search(filter_spec)
1042         if m:
1043             try:
1044                 comparison_value = int(m.group('value'))
1045             except ValueError:
1046                 comparison_value = parse_filesize(m.group('value'))
1047                 if comparison_value is None:
1048                     comparison_value = parse_filesize(m.group('value') + 'B')
1049                 if comparison_value is None:
1050                     raise ValueError(
1051                         'Invalid value %r in format specification %r' % (
1052                             m.group('value'), filter_spec))
1053             op = OPERATORS[m.group('op')]
1054
1055         if not m:
1056             STR_OPERATORS = {
1057                 '=': operator.eq,
1058                 '!=': operator.ne,
1059                 '^=': lambda attr, value: attr.startswith(value),
1060                 '$=': lambda attr, value: attr.endswith(value),
1061                 '*=': lambda attr, value: value in attr,
1062             }
1063             str_operator_rex = re.compile(r'''(?x)
1064                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1065                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1066                 \s*(?P<value>[a-zA-Z0-9._-]+)
1067                 \s*$
1068                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1069             m = str_operator_rex.search(filter_spec)
1070             if m:
1071                 comparison_value = m.group('value')
1072                 op = STR_OPERATORS[m.group('op')]
1073
1074         if not m:
1075             raise ValueError('Invalid filter specification %r' % filter_spec)
1076
1077         def _filter(f):
1078             actual_value = f.get(m.group('key'))
1079             if actual_value is None:
1080                 return m.group('none_inclusive')
1081             return op(actual_value, comparison_value)
1082         return _filter
1083
1084     def _default_format_spec(self, info_dict, download=True):
1085
1086         def can_merge():
1087             merger = FFmpegMergerPP(self)
1088             return merger.available and merger.can_merge()
1089
1090         def prefer_best():
1091             if self.params.get('simulate', False):
1092                 return False
1093             if not download:
1094                 return False
1095             if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
1096                 return True
1097             if info_dict.get('is_live'):
1098                 return True
1099             if not can_merge():
1100                 return True
1101             return False
1102
1103         req_format_list = ['bestvideo+bestaudio', 'best']
1104         if prefer_best():
1105             req_format_list.reverse()
1106         return '/'.join(req_format_list)
1107
1108     def build_format_selector(self, format_spec):
1109         def syntax_error(note, start):
1110             message = (
1111                 'Invalid format specification: '
1112                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1113             return SyntaxError(message)
1114
1115         PICKFIRST = 'PICKFIRST'
1116         MERGE = 'MERGE'
1117         SINGLE = 'SINGLE'
1118         GROUP = 'GROUP'
1119         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1120
1121         def _parse_filter(tokens):
1122             filter_parts = []
1123             for type, string, start, _, _ in tokens:
1124                 if type == tokenize.OP and string == ']':
1125                     return ''.join(filter_parts)
1126                 else:
1127                     filter_parts.append(string)
1128
1129         def _remove_unused_ops(tokens):
1130             # Remove operators that we don't use and join them with the surrounding strings
1131             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1132             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1133             last_string, last_start, last_end, last_line = None, None, None, None
1134             for type, string, start, end, line in tokens:
1135                 if type == tokenize.OP and string == '[':
1136                     if last_string:
1137                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1138                         last_string = None
1139                     yield type, string, start, end, line
1140                     # everything inside brackets will be handled by _parse_filter
1141                     for type, string, start, end, line in tokens:
1142                         yield type, string, start, end, line
1143                         if type == tokenize.OP and string == ']':
1144                             break
1145                 elif type == tokenize.OP and string in ALLOWED_OPS:
1146                     if last_string:
1147                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1148                         last_string = None
1149                     yield type, string, start, end, line
1150                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1151                     if not last_string:
1152                         last_string = string
1153                         last_start = start
1154                         last_end = end
1155                     else:
1156                         last_string += string
1157             if last_string:
1158                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1159
1160         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1161             selectors = []
1162             current_selector = None
1163             for type, string, start, _, _ in tokens:
1164                 # ENCODING is only defined in python 3.x
1165                 if type == getattr(tokenize, 'ENCODING', None):
1166                     continue
1167                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1168                     current_selector = FormatSelector(SINGLE, string, [])
1169                 elif type == tokenize.OP:
1170                     if string == ')':
1171                         if not inside_group:
1172                             # ')' will be handled by the parentheses group
1173                             tokens.restore_last_token()
1174                         break
1175                     elif inside_merge and string in ['/', ',']:
1176                         tokens.restore_last_token()
1177                         break
1178                     elif inside_choice and string == ',':
1179                         tokens.restore_last_token()
1180                         break
1181                     elif string == ',':
1182                         if not current_selector:
1183                             raise syntax_error('"," must follow a format selector', start)
1184                         selectors.append(current_selector)
1185                         current_selector = None
1186                     elif string == '/':
1187                         if not current_selector:
1188                             raise syntax_error('"/" must follow a format selector', start)
1189                         first_choice = current_selector
1190                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1191                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1192                     elif string == '[':
1193                         if not current_selector:
1194                             current_selector = FormatSelector(SINGLE, 'best', [])
1195                         format_filter = _parse_filter(tokens)
1196                         current_selector.filters.append(format_filter)
1197                     elif string == '(':
1198                         if current_selector:
1199                             raise syntax_error('Unexpected "("', start)
1200                         group = _parse_format_selection(tokens, inside_group=True)
1201                         current_selector = FormatSelector(GROUP, group, [])
1202                     elif string == '+':
1203                         video_selector = current_selector
1204                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1205                         if not video_selector or not audio_selector:
1206                             raise syntax_error('"+" must be between two format selectors', start)
1207                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1208                     else:
1209                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1210                 elif type == tokenize.ENDMARKER:
1211                     break
1212             if current_selector:
1213                 selectors.append(current_selector)
1214             return selectors
1215
1216         def _build_selector_function(selector):
1217             if isinstance(selector, list):
1218                 fs = [_build_selector_function(s) for s in selector]
1219
1220                 def selector_function(ctx):
1221                     for f in fs:
1222                         for format in f(ctx):
1223                             yield format
1224                 return selector_function
1225             elif selector.type == GROUP:
1226                 selector_function = _build_selector_function(selector.selector)
1227             elif selector.type == PICKFIRST:
1228                 fs = [_build_selector_function(s) for s in selector.selector]
1229
1230                 def selector_function(ctx):
1231                     for f in fs:
1232                         picked_formats = list(f(ctx))
1233                         if picked_formats:
1234                             return picked_formats
1235                     return []
1236             elif selector.type == SINGLE:
1237                 format_spec = selector.selector
1238
1239                 def selector_function(ctx):
1240                     formats = list(ctx['formats'])
1241                     if not formats:
1242                         return
1243                     if format_spec == 'all':
1244                         for f in formats:
1245                             yield f
1246                     elif format_spec in ['best', 'worst', None]:
1247                         format_idx = 0 if format_spec == 'worst' else -1
1248                         audiovideo_formats = [
1249                             f for f in formats
1250                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1251                         if audiovideo_formats:
1252                             yield audiovideo_formats[format_idx]
1253                         # for extractors with incomplete formats (audio only (soundcloud)
1254                         # or video only (imgur)) we will fallback to best/worst
1255                         # {video,audio}-only format
1256                         elif ctx['incomplete_formats']:
1257                             yield formats[format_idx]
1258                     elif format_spec == 'bestaudio':
1259                         audio_formats = [
1260                             f for f in formats
1261                             if f.get('vcodec') == 'none']
1262                         if audio_formats:
1263                             yield audio_formats[-1]
1264                     elif format_spec == 'worstaudio':
1265                         audio_formats = [
1266                             f for f in formats
1267                             if f.get('vcodec') == 'none']
1268                         if audio_formats:
1269                             yield audio_formats[0]
1270                     elif format_spec == 'bestvideo':
1271                         video_formats = [
1272                             f for f in formats
1273                             if f.get('acodec') == 'none']
1274                         if video_formats:
1275                             yield video_formats[-1]
1276                     elif format_spec == 'worstvideo':
1277                         video_formats = [
1278                             f for f in formats
1279                             if f.get('acodec') == 'none']
1280                         if video_formats:
1281                             yield video_formats[0]
1282                     else:
1283                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1284                         if format_spec in extensions:
1285                             filter_f = lambda f: f['ext'] == format_spec
1286                         else:
1287                             filter_f = lambda f: f['format_id'] == format_spec
1288                         matches = list(filter(filter_f, formats))
1289                         if matches:
1290                             yield matches[-1]
1291             elif selector.type == MERGE:
1292                 def _merge(formats_info):
1293                     format_1, format_2 = [f['format_id'] for f in formats_info]
1294                     # The first format must contain the video and the
1295                     # second the audio
1296                     if formats_info[0].get('vcodec') == 'none':
1297                         self.report_error('The first format must '
1298                                           'contain the video, try using '
1299                                           '"-f %s+%s"' % (format_2, format_1))
1300                         return
1301                     # Formats must be opposite (video+audio)
1302                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1303                         self.report_error(
1304                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1305                             % (format_1, format_2))
1306                         return
1307                     output_ext = (
1308                         formats_info[0]['ext']
1309                         if self.params.get('merge_output_format') is None
1310                         else self.params['merge_output_format'])
1311                     return {
1312                         'requested_formats': formats_info,
1313                         'format': '%s+%s' % (formats_info[0].get('format'),
1314                                              formats_info[1].get('format')),
1315                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1316                                                 formats_info[1].get('format_id')),
1317                         'width': formats_info[0].get('width'),
1318                         'height': formats_info[0].get('height'),
1319                         'resolution': formats_info[0].get('resolution'),
1320                         'fps': formats_info[0].get('fps'),
1321                         'vcodec': formats_info[0].get('vcodec'),
1322                         'vbr': formats_info[0].get('vbr'),
1323                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1324                         'acodec': formats_info[1].get('acodec'),
1325                         'abr': formats_info[1].get('abr'),
1326                         'ext': output_ext,
1327                     }
1328                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1329
1330                 def selector_function(ctx):
1331                     for pair in itertools.product(
1332                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1333                         yield _merge(pair)
1334
1335             filters = [self._build_format_filter(f) for f in selector.filters]
1336
1337             def final_selector(ctx):
1338                 ctx_copy = copy.deepcopy(ctx)
1339                 for _filter in filters:
1340                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1341                 return selector_function(ctx_copy)
1342             return final_selector
1343
1344         stream = io.BytesIO(format_spec.encode('utf-8'))
1345         try:
1346             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1347         except tokenize.TokenError:
1348             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1349
1350         class TokenIterator(object):
1351             def __init__(self, tokens):
1352                 self.tokens = tokens
1353                 self.counter = 0
1354
1355             def __iter__(self):
1356                 return self
1357
1358             def __next__(self):
1359                 if self.counter >= len(self.tokens):
1360                     raise StopIteration()
1361                 value = self.tokens[self.counter]
1362                 self.counter += 1
1363                 return value
1364
1365             next = __next__
1366
1367             def restore_last_token(self):
1368                 self.counter -= 1
1369
1370         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1371         return _build_selector_function(parsed_selector)
1372
1373     def _calc_headers(self, info_dict):
1374         res = std_headers.copy()
1375
1376         add_headers = info_dict.get('http_headers')
1377         if add_headers:
1378             res.update(add_headers)
1379
1380         cookies = self._calc_cookies(info_dict)
1381         if cookies:
1382             res['Cookie'] = cookies
1383
1384         if 'X-Forwarded-For' not in res:
1385             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1386             if x_forwarded_for_ip:
1387                 res['X-Forwarded-For'] = x_forwarded_for_ip
1388
1389         return res
1390
1391     def _calc_cookies(self, info_dict):
1392         pr = sanitized_Request(info_dict['url'])
1393         self.cookiejar.add_cookie_header(pr)
1394         return pr.get_header('Cookie')
1395
    def process_video_result(self, info_dict, download=True):
        """Sanitize and complete a single-video extractor result, select the
        requested format(s) and, when download is True, hand each selected
        format to process_info().

        Mutates info_dict in place (thumbnails, subtitles, formats, derived
        fields) and returns it updated with the last selected format.
        Raises ExtractorError on missing mandatory fields, empty format
        lists, or when no format matches the requested spec.
        May return None early (without downloading) for the pure listing
        modes: list_thumbnails, listsubtitles, listformats.
        """
        assert info_dict.get('_type', 'video') == 'video'

        # 'id' and 'title' are the only fields every extractor must provide.
        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        def report_force_conversion(field, field_not, conversion):
            # Warn that an extractor returned a mistyped field which is being
            # coerced below; this indicates an extractor bug, not user error.
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a non-string field (e.g. an int id) to compat_str in place.
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce every known numeric field (self._NUMERIC_FIELDS) to int
            # in place; int_or_none leaves unparseable values as None.
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Normalize thumbnails: promote a lone 'thumbnail' URL into the
        # 'thumbnails' list, then sort worst-to-best and fill in derived
        # fields (sanitized url, 'WxH' resolution string, positional id).
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '', t.get('url')))
            for i, t in enumerate(thumbnails):
                t['url'] = sanitize_url(t['url'])
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        # Best thumbnail is last after the sort above.
        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
            # see http://bugs.python.org/issue1646728)
            try:
                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
            except (ValueError, OverflowError, OSError):
                pass

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle URLs and infer missing extensions from the URL.
        subtitles = info_dict.get('subtitles')
        if subtitles:
            for _, subtitle in subtitles.items():
                for subtitle_format in subtitle:
                    if subtitle_format.get('url'):
                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                    if subtitle_format.get('ext') is None:
                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles,
            info_dict.get('automatic_captions'))

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        def is_wellformed(f):
            # A format without a URL cannot be downloaded; drop it with a
            # warning instead of failing the whole extraction.
            url = f.get('url')
            if not url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
                return False
            if isinstance(url, bytes):
                sanitize_string_field(f, 'url')
            return True

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats))

        # format_id -> list of formats sharing that id (used below to
        # disambiguate duplicate ids).
        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if not format.get('format_id'):
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            req_format = self._default_format_spec(info_dict, download=download)
            if self.params.get('verbose'):
                self.to_stdout('[debug] Default format spec: %s' % req_format)

        format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/rg3/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/rg3/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
            # all formats are audio-only
            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1627
1628     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1629         """Select the requested subtitles and their format"""
1630         available_subs = {}
1631         if normal_subtitles and self.params.get('writesubtitles'):
1632             available_subs.update(normal_subtitles)
1633         if automatic_captions and self.params.get('writeautomaticsub'):
1634             for lang, cap_info in automatic_captions.items():
1635                 if lang not in available_subs:
1636                     available_subs[lang] = cap_info
1637
1638         if (not self.params.get('writesubtitles') and not
1639                 self.params.get('writeautomaticsub') or not
1640                 available_subs):
1641             return None
1642
1643         if self.params.get('allsubtitles', False):
1644             requested_langs = available_subs.keys()
1645         else:
1646             if self.params.get('subtitleslangs', False):
1647                 requested_langs = self.params.get('subtitleslangs')
1648             elif 'en' in available_subs:
1649                 requested_langs = ['en']
1650             else:
1651                 requested_langs = [list(available_subs.keys())[0]]
1652
1653         formats_query = self.params.get('subtitlesformat', 'best')
1654         formats_preference = formats_query.split('/') if formats_query else []
1655         subs = {}
1656         for lang in requested_langs:
1657             formats = available_subs.get(lang)
1658             if formats is None:
1659                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1660                 continue
1661             for ext in formats_preference:
1662                 if ext == 'best':
1663                     f = formats[-1]
1664                     break
1665                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1666                 if matches:
1667                     f = matches[-1]
1668                     break
1669             else:
1670                 f = formats[-1]
1671                 self.report_warning(
1672                     'No subtitle format found matching "%s" for language %s, '
1673                     'using %s' % (formats_query, lang, f['ext']))
1674             subs[lang] = f
1675         return subs
1676
1677     def process_info(self, info_dict):
1678         """Process a single resolved IE result."""
1679
1680         assert info_dict.get('_type', 'video') == 'video'
1681
1682         max_downloads = self.params.get('max_downloads')
1683         if max_downloads is not None:
1684             if self._num_downloads >= int(max_downloads):
1685                 raise MaxDownloadsReached()
1686
1687         info_dict['fulltitle'] = info_dict['title']
1688         if len(info_dict['title']) > 200:
1689             info_dict['title'] = info_dict['title'][:197] + '...'
1690
1691         if 'format' not in info_dict:
1692             info_dict['format'] = info_dict['ext']
1693
1694         reason = self._match_entry(info_dict, incomplete=False)
1695         if reason is not None:
1696             self.to_screen('[download] ' + reason)
1697             return
1698
1699         self._num_downloads += 1
1700
1701         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1702
1703         # Forced printings
1704         if self.params.get('forcetitle', False):
1705             self.to_stdout(info_dict['fulltitle'])
1706         if self.params.get('forceid', False):
1707             self.to_stdout(info_dict['id'])
1708         if self.params.get('forceurl', False):
1709             if info_dict.get('requested_formats') is not None:
1710                 for f in info_dict['requested_formats']:
1711                     self.to_stdout(f['url'] + f.get('play_path', ''))
1712             else:
1713                 # For RTMP URLs, also include the playpath
1714                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1715         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1716             self.to_stdout(info_dict['thumbnail'])
1717         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1718             self.to_stdout(info_dict['description'])
1719         if self.params.get('forcefilename', False) and filename is not None:
1720             self.to_stdout(filename)
1721         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1722             self.to_stdout(formatSeconds(info_dict['duration']))
1723         if self.params.get('forceformat', False):
1724             self.to_stdout(info_dict['format'])
1725         if self.params.get('forcejson', False):
1726             self.to_stdout(json.dumps(info_dict))
1727
1728         # Do nothing else if in simulate mode
1729         if self.params.get('simulate', False):
1730             return
1731
1732         if filename is None:
1733             return
1734
1735         def ensure_dir_exists(path):
1736             try:
1737                 dn = os.path.dirname(path)
1738                 if dn and not os.path.exists(dn):
1739                     os.makedirs(dn)
1740                 return True
1741             except (OSError, IOError) as err:
1742                 self.report_error('unable to create directory ' + error_to_compat_str(err))
1743                 return False
1744
1745         if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
1746             return
1747
1748         if self.params.get('writedescription', False):
1749             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1750             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1751                 self.to_screen('[info] Video description is already present')
1752             elif info_dict.get('description') is None:
1753                 self.report_warning('There\'s no description to write.')
1754             else:
1755                 try:
1756                     self.to_screen('[info] Writing video description to: ' + descfn)
1757                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1758                         descfile.write(info_dict['description'])
1759                 except (OSError, IOError):
1760                     self.report_error('Cannot write description file ' + descfn)
1761                     return
1762
1763         if self.params.get('writeannotations', False):
1764             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1765             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1766                 self.to_screen('[info] Video annotations are already present')
1767             else:
1768                 try:
1769                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1770                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1771                         annofile.write(info_dict['annotations'])
1772                 except (KeyError, TypeError):
1773                     self.report_warning('There are no annotations to write.')
1774                 except (OSError, IOError):
1775                     self.report_error('Cannot write annotations file: ' + annofn)
1776                     return
1777
1778         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1779                                        self.params.get('writeautomaticsub')])
1780
1781         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1782             # subtitles download errors are already managed as troubles in relevant IE
1783             # that way it will silently go on when used with unsupporting IE
1784             subtitles = info_dict['requested_subtitles']
1785             ie = self.get_info_extractor(info_dict['extractor_key'])
1786             for sub_lang, sub_info in subtitles.items():
1787                 sub_format = sub_info['ext']
1788                 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1789                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1790                     self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
1791                 else:
1792                     self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1793                     if sub_info.get('data') is not None:
1794                         try:
1795                             # Use newline='' to prevent conversion of newline characters
1796                             # See https://github.com/rg3/youtube-dl/issues/10268
1797                             with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1798                                 subfile.write(sub_info['data'])
1799                         except (OSError, IOError):
1800                             self.report_error('Cannot write subtitles file ' + sub_filename)
1801                             return
1802                     else:
1803                         try:
1804                             sub_data = ie._request_webpage(
1805                                 sub_info['url'], info_dict['id'], note=False).read()
1806                             with io.open(encodeFilename(sub_filename), 'wb') as subfile:
1807                                 subfile.write(sub_data)
1808                         except (ExtractorError, IOError, OSError, ValueError) as err:
1809                             self.report_warning('Unable to download subtitle for "%s": %s' %
1810                                                 (sub_lang, error_to_compat_str(err)))
1811                             continue
1812
1813         if self.params.get('writeinfojson', False):
1814             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1815             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1816                 self.to_screen('[info] Video description metadata is already present')
1817             else:
1818                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1819                 try:
1820                     write_json_file(self.filter_requested_info(info_dict), infofn)
1821                 except (OSError, IOError):
1822                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1823                     return
1824
1825         self._write_thumbnails(info_dict, filename)
1826
1827         if not self.params.get('skip_download', False):
1828             try:
1829                 def dl(name, info):
1830                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1831                     for ph in self._progress_hooks:
1832                         fd.add_progress_hook(ph)
1833                     if self.params.get('verbose'):
1834                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1835                     return fd.download(name, info)
1836
1837                 if info_dict.get('requested_formats') is not None:
1838                     downloaded = []
1839                     success = True
1840                     merger = FFmpegMergerPP(self)
1841                     if not merger.available:
1842                         postprocessors = []
1843                         self.report_warning('You have requested multiple '
1844                                             'formats but ffmpeg or avconv are not installed.'
1845                                             ' The formats won\'t be merged.')
1846                     else:
1847                         postprocessors = [merger]
1848
1849                     def compatible_formats(formats):
1850                         video, audio = formats
1851                         # Check extension
1852                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1853                         if video_ext and audio_ext:
1854                             COMPATIBLE_EXTS = (
1855                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1856                                 ('webm')
1857                             )
1858                             for exts in COMPATIBLE_EXTS:
1859                                 if video_ext in exts and audio_ext in exts:
1860                                     return True
1861                         # TODO: Check acodec/vcodec
1862                         return False
1863
1864                     filename_real_ext = os.path.splitext(filename)[1][1:]
1865                     filename_wo_ext = (
1866                         os.path.splitext(filename)[0]
1867                         if filename_real_ext == info_dict['ext']
1868                         else filename)
1869                     requested_formats = info_dict['requested_formats']
1870                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1871                         info_dict['ext'] = 'mkv'
1872                         self.report_warning(
1873                             'Requested formats are incompatible for merge and will be merged into mkv.')
1874                     # Ensure filename always has a correct extension for successful merge
1875                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1876                     if os.path.exists(encodeFilename(filename)):
1877                         self.to_screen(
1878                             '[download] %s has already been downloaded and '
1879                             'merged' % filename)
1880                     else:
1881                         for f in requested_formats:
1882                             new_info = dict(info_dict)
1883                             new_info.update(f)
1884                             fname = prepend_extension(
1885                                 self.prepare_filename(new_info),
1886                                 'f%s' % f['format_id'], new_info['ext'])
1887                             if not ensure_dir_exists(fname):
1888                                 return
1889                             downloaded.append(fname)
1890                             partial_success = dl(fname, new_info)
1891                             success = success and partial_success
1892                         info_dict['__postprocessors'] = postprocessors
1893                         info_dict['__files_to_merge'] = downloaded
1894                 else:
1895                     # Just a single file
1896                     success = dl(filename, info_dict)
1897             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1898                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1899                 return
1900             except (OSError, IOError) as err:
1901                 raise UnavailableVideoError(err)
1902             except (ContentTooShortError, ) as err:
1903                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1904                 return
1905
1906             if success and filename != '-':
1907                 # Fixup content
1908                 fixup_policy = self.params.get('fixup')
1909                 if fixup_policy is None:
1910                     fixup_policy = 'detect_or_warn'
1911
1912                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1913
1914                 stretched_ratio = info_dict.get('stretched_ratio')
1915                 if stretched_ratio is not None and stretched_ratio != 1:
1916                     if fixup_policy == 'warn':
1917                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1918                             info_dict['id'], stretched_ratio))
1919                     elif fixup_policy == 'detect_or_warn':
1920                         stretched_pp = FFmpegFixupStretchedPP(self)
1921                         if stretched_pp.available:
1922                             info_dict.setdefault('__postprocessors', [])
1923                             info_dict['__postprocessors'].append(stretched_pp)
1924                         else:
1925                             self.report_warning(
1926                                 '%s: Non-uniform pixel ratio (%s). %s'
1927                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1928                     else:
1929                         assert fixup_policy in ('ignore', 'never')
1930
1931                 if (info_dict.get('requested_formats') is None and
1932                         info_dict.get('container') == 'm4a_dash'):
1933                     if fixup_policy == 'warn':
1934                         self.report_warning(
1935                             '%s: writing DASH m4a. '
1936                             'Only some players support this container.'
1937                             % info_dict['id'])
1938                     elif fixup_policy == 'detect_or_warn':
1939                         fixup_pp = FFmpegFixupM4aPP(self)
1940                         if fixup_pp.available:
1941                             info_dict.setdefault('__postprocessors', [])
1942                             info_dict['__postprocessors'].append(fixup_pp)
1943                         else:
1944                             self.report_warning(
1945                                 '%s: writing DASH m4a. '
1946                                 'Only some players support this container. %s'
1947                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1948                     else:
1949                         assert fixup_policy in ('ignore', 'never')
1950
1951                 if (info_dict.get('protocol') == 'm3u8_native' or
1952                         info_dict.get('protocol') == 'm3u8' and
1953                         self.params.get('hls_prefer_native')):
1954                     if fixup_policy == 'warn':
1955                         self.report_warning('%s: malformed AAC bitstream detected.' % (
1956                             info_dict['id']))
1957                     elif fixup_policy == 'detect_or_warn':
1958                         fixup_pp = FFmpegFixupM3u8PP(self)
1959                         if fixup_pp.available:
1960                             info_dict.setdefault('__postprocessors', [])
1961                             info_dict['__postprocessors'].append(fixup_pp)
1962                         else:
1963                             self.report_warning(
1964                                 '%s: malformed AAC bitstream detected. %s'
1965                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1966                     else:
1967                         assert fixup_policy in ('ignore', 'never')
1968
1969                 try:
1970                     self.post_process(filename, info_dict)
1971                 except (PostProcessingError) as err:
1972                     self.report_error('postprocessing: %s' % str(err))
1973                     return
1974                 self.record_download_archive(info_dict)
1975
1976     def download(self, url_list):
1977         """Download a given list of URLs."""
1978         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1979         if (len(url_list) > 1 and
1980                 outtmpl != '-' and
1981                 '%' not in outtmpl and
1982                 self.params.get('max_downloads') != 1):
1983             raise SameFileError(outtmpl)
1984
1985         for url in url_list:
1986             try:
1987                 # It also downloads the videos
1988                 res = self.extract_info(
1989                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1990             except UnavailableVideoError:
1991                 self.report_error('unable to download video')
1992             except MaxDownloadsReached:
1993                 self.to_screen('[info] Maximum number of downloaded files reached.')
1994                 raise
1995             else:
1996                 if self.params.get('dump_single_json', False):
1997                     self.to_stdout(json.dumps(res))
1998
1999         return self._download_retcode
2000
2001     def download_with_info_file(self, info_filename):
2002         with contextlib.closing(fileinput.FileInput(
2003                 [info_filename], mode='r',
2004                 openhook=fileinput.hook_encoded('utf-8'))) as f:
2005             # FileInput doesn't have a read method, we can't call json.load
2006             info = self.filter_requested_info(json.loads('\n'.join(f)))
2007         try:
2008             self.process_ie_result(info, download=True)
2009         except DownloadError:
2010             webpage_url = info.get('webpage_url')
2011             if webpage_url is not None:
2012                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2013                 return self.download([webpage_url])
2014             else:
2015                 raise
2016         return self._download_retcode
2017
2018     @staticmethod
2019     def filter_requested_info(info_dict):
2020         return dict(
2021             (k, v) for k, v in info_dict.items()
2022             if k not in ['requested_formats', 'requested_subtitles'])
2023
2024     def post_process(self, filename, ie_info):
2025         """Run all the postprocessors on the given file."""
2026         info = dict(ie_info)
2027         info['filepath'] = filename
2028         pps_chain = []
2029         if ie_info.get('__postprocessors') is not None:
2030             pps_chain.extend(ie_info['__postprocessors'])
2031         pps_chain.extend(self._pps)
2032         for pp in pps_chain:
2033             files_to_delete = []
2034             try:
2035                 files_to_delete, info = pp.run(info)
2036             except PostProcessingError as e:
2037                 self.report_error(e.msg)
2038             if files_to_delete and not self.params.get('keepvideo', False):
2039                 for old_filename in files_to_delete:
2040                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2041                     try:
2042                         os.remove(encodeFilename(old_filename))
2043                     except (IOError, OSError):
2044                         self.report_warning('Unable to remove downloaded original file')
2045
2046     def _make_archive_id(self, info_dict):
2047         # Future-proof against any change in case
2048         # and backwards compatibility with prior versions
2049         extractor = info_dict.get('extractor_key')
2050         if extractor is None:
2051             if 'id' in info_dict:
2052                 extractor = info_dict.get('ie_key')  # key in a playlist
2053         if extractor is None:
2054             return None  # Incomplete video information
2055         return extractor.lower() + ' ' + info_dict['id']
2056
2057     def in_download_archive(self, info_dict):
2058         fn = self.params.get('download_archive')
2059         if fn is None:
2060             return False
2061
2062         vid_id = self._make_archive_id(info_dict)
2063         if vid_id is None:
2064             return False  # Incomplete video information
2065
2066         try:
2067             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
2068                 for line in archive_file:
2069                     if line.strip() == vid_id:
2070                         return True
2071         except IOError as ioe:
2072             if ioe.errno != errno.ENOENT:
2073                 raise
2074         return False
2075
2076     def record_download_archive(self, info_dict):
2077         fn = self.params.get('download_archive')
2078         if fn is None:
2079             return
2080         vid_id = self._make_archive_id(info_dict)
2081         assert vid_id
2082         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2083             archive_file.write(vid_id + '\n')
2084
2085     @staticmethod
2086     def format_resolution(format, default='unknown'):
2087         if format.get('vcodec') == 'none':
2088             return 'audio only'
2089         if format.get('resolution') is not None:
2090             return format['resolution']
2091         if format.get('height') is not None:
2092             if format.get('width') is not None:
2093                 res = '%sx%s' % (format['width'], format['height'])
2094             else:
2095                 res = '%sp' % format['height']
2096         elif format.get('width') is not None:
2097             res = '%dx?' % format['width']
2098         else:
2099             res = default
2100         return res
2101
    def _format_note(self, fdict):
        """Build the human-readable 'note' column shown by --list-formats.

        Pieces (language, bitrates, codecs, fps, sample rate, filesize, ...)
        are appended in a fixed order; the separator inserted before each
        piece depends on whether anything was emitted earlier, so the
        statement order below is significant.
        """
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None and
                fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            if fdict.get('vbr') is not None:
                # joins into 'vcodec@NNNNk' with the vbr appended below
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # video bitrate known but video codec unknown
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                # left-pad to 5 chars so codec names line up in the table
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            # joins into 'acodec@NNNk' / 'audio@NNNk'
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            # '~' marks the size as an estimate
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
2157
2158     def list_formats(self, info_dict):
2159         formats = info_dict.get('formats', [info_dict])
2160         table = [
2161             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2162             for f in formats
2163             if f.get('preference') is None or f['preference'] >= -1000]
2164         if len(formats) > 1:
2165             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2166
2167         header_line = ['format code', 'extension', 'resolution', 'note']
2168         self.to_screen(
2169             '[info] Available formats for %s:\n%s' %
2170             (info_dict['id'], render_table(header_line, table)))
2171
2172     def list_thumbnails(self, info_dict):
2173         thumbnails = info_dict.get('thumbnails')
2174         if not thumbnails:
2175             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2176             return
2177
2178         self.to_screen(
2179             '[info] Thumbnails for %s:' % info_dict['id'])
2180         self.to_screen(render_table(
2181             ['ID', 'width', 'height', 'URL'],
2182             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2183
2184     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2185         if not subtitles:
2186             self.to_screen('%s has no %s' % (video_id, name))
2187             return
2188         self.to_screen(
2189             'Available %s for %s:' % (name, video_id))
2190         self.to_screen(render_table(
2191             ['Language', 'formats'],
2192             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2193                 for lang, formats in subtitles.items()]))
2194
2195     def urlopen(self, req):
2196         """ Start an HTTP download """
2197         if isinstance(req, compat_basestring):
2198             req = sanitized_Request(req)
2199         return self._opener.open(req, timeout=self._socket_timeout)
2200
    def print_debug_header(self):
        """Write verbose debug information (versions, encodings, proxy map)
        to the debug stream; no-op unless the 'verbose' param is set."""
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # sys.stdout may lack an encoding attribute (e.g. when replaced)
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        try:
            # Report the git commit when running from a source checkout
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            # git missing or not a checkout — best effort only; on Python 2,
            # clear the pending exception state before continuing
            try:
                sys.exc_clear()
            except Exception:
                pass

        def python_implementation():
            # Append the PyPy version triple when available
            impl_name = platform.python_implementation()
            if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
                return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
            return impl_name

        self._write_string('[debug] Python version %s (%s) - %s\n' % (
            platform.python_version(), python_implementation(),
            platform_name()))

        # Versions of the external programs youtube-dl can invoke
        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_versions['phantomjs'] = PhantomJSwrapper._version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Collect the effective proxies from every opener handler that has any
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # Opt-in: report the public IP and check for a newer release
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2276
    def _setup_opener(self):
        """Build the urllib opener (proxies, cookies, HTTPS, data: URLs) used
        for all HTTP requests and store it on self._opener."""
        timeout_val = self.params.get('socket_timeout')
        # Default socket timeout: 10 minutes
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # In-memory cookies only
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            opts_cookiefile = expand_path(opts_cookiefile)
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Load existing cookies only when the file is readable
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An explicit empty --proxy disables proxying entirely
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            # Fall back to the environment's proxy settings
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2329
2330     def encode(self, s):
2331         if isinstance(s, bytes):
2332             return s  # Already encoded
2333
2334         try:
2335             return s.encode(self.get_encoding())
2336         except UnicodeEncodeError as err:
2337             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2338             raise
2339
2340     def get_encoding(self):
2341         encoding = self.params.get('encoding')
2342         if encoding is None:
2343             encoding = preferredencoding()
2344         return encoding
2345
2346     def _write_thumbnails(self, info_dict, filename):
2347         if self.params.get('writethumbnail', False):
2348             thumbnails = info_dict.get('thumbnails')
2349             if thumbnails:
2350                 thumbnails = [thumbnails[-1]]
2351         elif self.params.get('write_all_thumbnails', False):
2352             thumbnails = info_dict.get('thumbnails')
2353         else:
2354             return
2355
2356         if not thumbnails:
2357             # No thumbnails present, so return immediately
2358             return
2359
2360         for t in thumbnails:
2361             thumb_ext = determine_ext(t['url'], 'jpg')
2362             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2363             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2364             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2365
2366             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2367                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2368                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2369             else:
2370                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2371                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2372                 try:
2373                     uf = self.urlopen(t['url'])
2374                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2375                         shutil.copyfileobj(uf, thumbf)
2376                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2377                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2378                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2379                     self.report_warning('Unable to download thumbnail "%s": %s' %
2380                                         (t['url'], error_to_compat_str(err)))