[YoutubeDL] PEP 8
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from string import ascii_letters
30
31 from .compat import (
32     compat_basestring,
33     compat_cookiejar,
34     compat_get_terminal_size,
35     compat_http_client,
36     compat_kwargs,
37     compat_numeric_types,
38     compat_os_name,
39     compat_str,
40     compat_tokenize_tokenize,
41     compat_urllib_error,
42     compat_urllib_request,
43     compat_urllib_request_DataHandler,
44 )
45 from .utils import (
46     age_restricted,
47     args_to_str,
48     ContentTooShortError,
49     date_from_str,
50     DateRange,
51     DEFAULT_OUTTMPL,
52     determine_ext,
53     determine_protocol,
54     DownloadError,
55     encode_compat_str,
56     encodeFilename,
57     error_to_compat_str,
58     expand_path,
59     ExtractorError,
60     format_bytes,
61     formatSeconds,
62     GeoRestrictedError,
63     int_or_none,
64     ISO3166Utils,
65     locked_file,
66     make_HTTPS_handler,
67     MaxDownloadsReached,
68     PagedList,
69     parse_filesize,
70     PerRequestProxyHandler,
71     platform_name,
72     PostProcessingError,
73     preferredencoding,
74     prepend_extension,
75     register_socks_protocols,
76     render_table,
77     replace_extension,
78     SameFileError,
79     sanitize_filename,
80     sanitize_path,
81     sanitize_url,
82     sanitized_Request,
83     std_headers,
84     subtitles_filename,
85     UnavailableVideoError,
86     url_basename,
87     version_tuple,
88     write_json_file,
89     write_string,
90     YoutubeDLCookieProcessor,
91     YoutubeDLHandler,
92 )
93 from .cache import Cache
94 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
95 from .extractor.openload import PhantomJSwrapper
96 from .downloader import get_suitable_downloader
97 from .downloader.rtmp import rtmpdump_version
98 from .postprocessor import (
99     FFmpegFixupM3u8PP,
100     FFmpegFixupM4aPP,
101     FFmpegFixupStretchedPP,
102     FFmpegMergerPP,
103     FFmpegPostProcessor,
104     get_postprocessor,
105 )
106 from .version import __version__
107
108 if compat_os_name == 'nt':
109     import ctypes
110
111
112 class YoutubeDL(object):
113     """YoutubeDL class.
114
115     YoutubeDL objects are the ones responsible for downloading the
116     actual video file and writing it to disk if the user has requested
117     it, among some other tasks. In most cases there should be one per
118     program. Since, given a video URL, the downloader doesn't know how to
119     extract all the needed information (a task that InfoExtractors
120     perform), it has to pass the URL to one of them.
121
122     For this, YoutubeDL objects have a method that allows
123     InfoExtractors to be registered in a given order. When it is passed
124     a URL, the YoutubeDL object hands it to the first InfoExtractor it
125     finds that reports being able to handle it. The InfoExtractor extracts
126     all the information about the video or videos the URL refers to, and
127     YoutubeDL processes the extracted information, possibly using a File
128     Downloader to download the video.
129
130     YoutubeDL objects accept a lot of parameters. In order not to saturate
131     the object constructor with arguments, it receives a dictionary of
132     options instead. These options are available through the params
133     attribute for the InfoExtractors to use. The YoutubeDL also
134     registers itself as the downloader in charge for the InfoExtractors
135     that are added to it, so this is a "mutual registration".
136
137     Available options:
138
139     username:          Username for authentication purposes.
140     password:          Password for authentication purposes.
141     videopassword:     Password for accessing a video.
142     ap_mso:            Adobe Pass multiple-system operator identifier.
143     ap_username:       Multiple-system operator account username.
144     ap_password:       Multiple-system operator account password.
145     usenetrc:          Use netrc for authentication instead.
146     verbose:           Print additional info to stdout.
147     quiet:             Do not print messages to stdout.
148     no_warnings:       Do not print out anything for warnings.
149     forceurl:          Force printing final URL.
150     forcetitle:        Force printing title.
151     forceid:           Force printing ID.
152     forcethumbnail:    Force printing thumbnail URL.
153     forcedescription:  Force printing description.
154     forcefilename:     Force printing final filename.
155     forceduration:     Force printing duration.
156     forcejson:         Force printing info_dict as JSON.
157     dump_single_json:  Force printing the info_dict of the whole playlist
158                        (or video) as a single JSON line.
159     simulate:          Do not download the video files.
160     format:            Video format code. See options.py for more information.
161     outtmpl:           Template for output names.
162     restrictfilenames: Do not allow "&" and spaces in file names
163     ignoreerrors:      Do not stop on download errors.
164     force_generic_extractor: Force downloader to use the generic extractor
165     nooverwrites:      Prevent overwriting files.
166     playliststart:     Playlist item to start at.
167     playlistend:       Playlist item to end at.
168     playlist_items:    Specific indices of playlist to download.
169     playlistreverse:   Download playlist items in reverse order.
170     playlistrandom:    Download playlist items in random order.
171     matchtitle:        Download only matching titles.
172     rejecttitle:       Reject downloads for matching titles.
173     logger:            Log messages to a logging.Logger instance.
174     logtostderr:       Log messages to stderr instead of stdout.
175     writedescription:  Write the video description to a .description file
176     writeinfojson:     Write the video description to a .info.json file
177     writeannotations:  Write the video annotations to a .annotations.xml file
178     writethumbnail:    Write the thumbnail image to a file
179     write_all_thumbnails:  Write all thumbnail formats to files
180     writesubtitles:    Write the video subtitles to a file
181     writeautomaticsub: Write the automatically generated subtitles to a file
182     allsubtitles:      Downloads all the subtitles of the video
183                        (requires writesubtitles or writeautomaticsub)
184     listsubtitles:     Lists all available subtitles for the video
185     subtitlesformat:   The format code for subtitles
186     subtitleslangs:    List of languages of the subtitles to download
187     keepvideo:         Keep the video file after post-processing
188     daterange:         A DateRange object, download only if the upload_date is in the range.
189     skip_download:     Skip the actual download of the video file
190     cachedir:          Location of the cache files in the filesystem.
191                        False to disable filesystem cache.
192     noplaylist:        Download single video instead of a playlist if in doubt.
193     age_limit:         An integer representing the user's age in years.
194                        Unsuitable videos for the given age are skipped.
195     min_views:         An integer representing the minimum view count the video
196                        must have in order to not be skipped.
197                        Videos without view count information are always
198                        downloaded. None for no limit.
199     max_views:         An integer representing the maximum view count.
200                        Videos that are more popular than that are not
201                        downloaded.
202                        Videos without view count information are always
203                        downloaded. None for no limit.
204     download_archive:  File name of a file where all downloads are recorded.
205                        Videos already present in the file are not downloaded
206                        again.
207     cookiefile:        File name where cookies should be read from and dumped to.
208     nocheckcertificate:Do not verify SSL certificates
209     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
210                        At the moment, this is only supported by YouTube.
211     proxy:             URL of the proxy server to use
212     geo_verification_proxy:  URL of the proxy to use for IP address verification
213                        on geo-restricted sites. (Experimental)
214     socket_timeout:    Time to wait for unresponsive hosts, in seconds
215     bidi_workaround:   Work around buggy terminals without bidirectional text
216                        support, using bidiv or fribidi
217     debug_printtraffic:Print out sent and received HTTP traffic
218     include_ads:       Download ads as well
219     default_search:    Prepend this string if an input url is not valid.
220                        'auto' for elaborate guessing
221     encoding:          Use this encoding instead of the system-specified.
222     extract_flat:      Do not resolve URLs, return the immediate result.
223                        Pass in 'in_playlist' to only show this behavior for
224                        playlist items.
225     postprocessors:    A list of dictionaries, each with an entry
226                        * key:  The name of the postprocessor. See
227                                youtube_dl/postprocessor/__init__.py for a list.
228                        as well as any further keyword arguments for the
229                        postprocessor.
230     progress_hooks:    A list of functions that get called on download
231                        progress, with a dictionary with the entries
232                        * status: One of "downloading", "error", or "finished".
233                                  Check this first and ignore unknown values.
234
235                        If status is one of "downloading", or "finished", the
236                        following properties may also be present:
237                        * filename: The final filename (always present)
238                        * tmpfilename: The filename we're currently writing to
239                        * downloaded_bytes: Bytes on disk
240                        * total_bytes: Size of the whole file, None if unknown
241                        * total_bytes_estimate: Guess of the eventual file size,
242                                                None if unavailable.
243                        * elapsed: The number of seconds since download started.
244                        * eta: The estimated time in seconds, None if unknown
245                        * speed: The download speed in bytes/second, None if
246                                 unknown
247                        * fragment_index: The counter of the currently
248                                          downloaded video fragment.
249                        * fragment_count: The number of fragments (= individual
250                                          files that will be merged)
251
252                        Progress hooks are guaranteed to be called at least once
253                        (with status "finished") if the download is successful.
254     merge_output_format: Extension to use when merging formats.
255     fixup:             Automatically correct known faults of the file.
256                        One of:
257                        - "never": do nothing
258                        - "warn": only emit a warning
259                        - "detect_or_warn": check whether we can do anything
260                                            about it, warn otherwise (default)
261     source_address:    (Experimental) Client-side IP address to bind to.
262     call_home:         Boolean, true iff we are allowed to contact the
263                        youtube-dl servers for debugging.
264     sleep_interval:    Number of seconds to sleep before each download when
265                        used alone or a lower bound of a range for randomized
266                        sleep before each download (minimum possible number
267                        of seconds to sleep) when used along with
268                        max_sleep_interval.
269     max_sleep_interval:Upper bound of a range for randomized sleep before each
270                        download (maximum possible number of seconds to sleep).
271                        Must only be used along with sleep_interval.
272                        Actual sleep time will be a random float from range
273                        [sleep_interval; max_sleep_interval].
274     listformats:       Print an overview of available video formats and exit.
275     list_thumbnails:   Print a table of all thumbnails and exit.
276     match_filter:      A function that gets called with the info_dict of
277                        every video.
278                        If it returns a message, the video is ignored.
279                        If it returns None, the video is downloaded.
280                        match_filter_func in utils.py is one example for this.
281     no_color:          Do not emit color codes in output.
282     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
283                        HTTP header (experimental)
284     geo_bypass_country:
285                        Two-letter ISO 3166-2 country code that will be used for
286                        explicit geographic restriction bypassing via faking
287                        X-Forwarded-For HTTP header (experimental)
288
289     The following options determine which downloader is picked:
290     external_downloader: Executable of the external downloader to call.
291                        None or unset for standard (built-in) downloader.
292     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
293                        if True, otherwise use ffmpeg/avconv if False, otherwise
294                        use downloader suggested by extractor if None.
295
296     The following parameters are not used by YoutubeDL itself, they are used by
297     the downloader (see youtube_dl/downloader/common.py):
298     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
299     noresizebuffer, retries, continuedl, noprogress, consoletitle,
300     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
301
302     The following options are used by the post processors:
303     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
304                        otherwise prefer avconv.
305     postprocessor_args: A list of additional command-line arguments for the
306                         postprocessor.
307
308     The following options are used by the Youtube extractor:
309     youtube_include_dash_manifest: If True (default), DASH manifests and related
310                         data will be downloaded and processed by extractor.
311                         You can reduce network I/O by disabling it if you don't
312                         care about DASH.
313     """
314
    # Names of info_dict fields that prepare_filename treats as numeric: when
    # one of them is missing from the template dict, its format specifier in
    # the output template is rewritten to a plain string conversion.
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level placeholders; __init__ rebinds every one of these on the
    # instance, so the mutable lists here are not shared between objects
    # created through the constructor.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
332
    def __init__(self, params=None, auto_init=True):
        """Create a YoutubeDL object with the given options.

        params is an optional dictionary of options (see the class
        docstring); it is merged over the built-in defaults.  When
        auto_init is true, the debug header is printed and the default
        info extractors are registered.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Index 0 (stdout) unless 'logtostderr' is truthy, which selects stderr
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        # Warn when a deprecated parameter is set; returns whether it was set.
        # Must run after self.params and self._err_file exist because
        # report_warning reads both.
        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        # Carry the deprecated proxy option over to its replacement unless the
        # replacement was given explicitly
        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            try:
                # Imported lazily: only needed for the bidi workaround
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                # The helper reads from a pipe and writes to the pty slave;
                # _bidi_workaround reads the result back from the master end
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    # bidiv could not be started; fall back to fribidi
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                # A missing executable only disables the workaround; any other
                # OS error is propagated
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Each postprocessor definition is a dict whose 'key' entry names the
        # class; the remaining entries become constructor keyword arguments.
        # The definition is copied so the caller's dict is left untouched.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
426
427     def warn_if_short_id(self, argv):
428         # short YouTube ID starting with dash?
429         idxs = [
430             i for i, a in enumerate(argv)
431             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
432         if idxs:
433             correct_argv = (
434                 ['youtube-dl'] +
435                 [a for i, a in enumerate(argv) if i not in idxs] +
436                 ['--'] + [argv[i] for i in idxs]
437             )
438             self.report_warning(
439                 'Long argument string detected. '
440                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
441                 args_to_str(correct_argv))
442
443     def add_info_extractor(self, ie):
444         """Add an InfoExtractor object to the end of the list."""
445         self._ies.append(ie)
446         if not isinstance(ie, type):
447             self._ies_instances[ie.ie_key()] = ie
448             ie.set_downloader(self)
449
450     def get_info_extractor(self, ie_key):
451         """
452         Get an instance of an IE with name ie_key, it will try to get one from
453         the _ies list, if there's no instance it will create a new one and add
454         it to the extractor list.
455         """
456         ie = self._ies_instances.get(ie_key)
457         if ie is None:
458             ie = get_info_extractor(ie_key)()
459             self.add_info_extractor(ie)
460         return ie
461
462     def add_default_info_extractors(self):
463         """
464         Add the InfoExtractors returned by gen_extractors to the end of the list
465         """
466         for ie in gen_extractor_classes():
467             self.add_info_extractor(ie)
468
469     def add_post_processor(self, pp):
470         """Add a PostProcessor object to the end of the chain."""
471         self._pps.append(pp)
472         pp.set_downloader(self)
473
474     def add_progress_hook(self, ph):
475         """Add the progress hook (currently only for the file downloader)"""
476         self._progress_hooks.append(ph)
477
478     def _bidi_workaround(self, message):
479         if not hasattr(self, '_output_channel'):
480             return message
481
482         assert hasattr(self, '_output_process')
483         assert isinstance(message, compat_str)
484         line_count = message.count('\n') + 1
485         self._output_process.stdin.write((message + '\n').encode('utf-8'))
486         self._output_process.stdin.flush()
487         res = ''.join(self._output_channel.readline().decode('utf-8')
488                       for _ in range(line_count))
489         return res[:-len('\n')]
490
491     def to_screen(self, message, skip_eol=False):
492         """Print message to stdout if not in quiet mode."""
493         return self.to_stdout(message, skip_eol, check_quiet=True)
494
495     def _write_string(self, s, out=None):
496         write_string(s, out=out, encoding=self.params.get('encoding'))
497
498     def to_stdout(self, message, skip_eol=False, check_quiet=False):
499         """Print message to stdout if not in quiet mode."""
500         if self.params.get('logger'):
501             self.params['logger'].debug(message)
502         elif not check_quiet or not self.params.get('quiet', False):
503             message = self._bidi_workaround(message)
504             terminator = ['\n', ''][skip_eol]
505             output = message + terminator
506
507             self._write_string(output, self._screen_file)
508
509     def to_stderr(self, message):
510         """Print message to stderr."""
511         assert isinstance(message, compat_str)
512         if self.params.get('logger'):
513             self.params['logger'].error(message)
514         else:
515             message = self._bidi_workaround(message)
516             output = message + '\n'
517             self._write_string(output, self._err_file)
518
519     def to_console_title(self, message):
520         if not self.params.get('consoletitle', False):
521             return
522         if compat_os_name == 'nt':
523             if ctypes.windll.kernel32.GetConsoleWindow():
524                 # c_wchar_p() might not be necessary if `message` is
525                 # already of type unicode()
526                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
527         elif 'TERM' in os.environ:
528             self._write_string('\033]0;%s\007' % message, self._screen_file)
529
530     def save_console_title(self):
531         if not self.params.get('consoletitle', False):
532             return
533         if compat_os_name != 'nt' and 'TERM' in os.environ:
534             # Save the title on stack
535             self._write_string('\033[22;0t', self._screen_file)
536
537     def restore_console_title(self):
538         if not self.params.get('consoletitle', False):
539             return
540         if compat_os_name != 'nt' and 'TERM' in os.environ:
541             # Restore the title from stack
542             self._write_string('\033[23;0t', self._screen_file)
543
544     def __enter__(self):
545         self.save_console_title()
546         return self
547
548     def __exit__(self, *args):
549         self.restore_console_title()
550
551         if self.params.get('cookiefile') is not None:
552             self.cookiejar.save()
553
554     def trouble(self, message=None, tb=None):
555         """Determine action to take when a download problem appears.
556
557         Depending on if the downloader has been configured to ignore
558         download errors or not, this method may throw an exception or
559         not when errors are found, after printing the message.
560
561         tb, if given, is additional traceback information.
562         """
563         if message is not None:
564             self.to_stderr(message)
565         if self.params.get('verbose'):
566             if tb is None:
567                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
568                     tb = ''
569                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
570                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
571                     tb += encode_compat_str(traceback.format_exc())
572                 else:
573                     tb_data = traceback.format_list(traceback.extract_stack())
574                     tb = ''.join(tb_data)
575             self.to_stderr(tb)
576         if not self.params.get('ignoreerrors', False):
577             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
578                 exc_info = sys.exc_info()[1].exc_info
579             else:
580                 exc_info = sys.exc_info()
581             raise DownloadError(message, exc_info)
582         self._download_retcode = 1
583
584     def report_warning(self, message):
585         '''
586         Print the message to stderr, it will be prefixed with 'WARNING:'
587         If stderr is a tty file the 'WARNING:' will be colored
588         '''
589         if self.params.get('logger') is not None:
590             self.params['logger'].warning(message)
591         else:
592             if self.params.get('no_warnings'):
593                 return
594             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
595                 _msg_header = '\033[0;33mWARNING:\033[0m'
596             else:
597                 _msg_header = 'WARNING:'
598             warning_message = '%s %s' % (_msg_header, message)
599             self.to_stderr(warning_message)
600
601     def report_error(self, message, tb=None):
602         '''
603         Do the same as trouble, but prefixes the message with 'ERROR:', colored
604         in red if stderr is a tty file.
605         '''
606         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
607             _msg_header = '\033[0;31mERROR:\033[0m'
608         else:
609             _msg_header = 'ERROR:'
610         error_message = '%s %s' % (_msg_header, message)
611         self.trouble(error_message, tb)
612
613     def report_file_already_downloaded(self, file_name):
614         """Report file has already been fully downloaded."""
615         try:
616             self.to_screen('[download] %s has already been downloaded' % file_name)
617         except UnicodeEncodeError:
618             self.to_screen('[download] The file has already been downloaded')
619
620     def prepare_filename(self, info_dict):
621         """Generate the output filename."""
622         try:
623             template_dict = dict(info_dict)
624
625             template_dict['epoch'] = int(time.time())
626             autonumber_size = self.params.get('autonumber_size')
627             if autonumber_size is None:
628                 autonumber_size = 5
629             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
630             if template_dict.get('resolution') is None:
631                 if template_dict.get('width') and template_dict.get('height'):
632                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
633                 elif template_dict.get('height'):
634                     template_dict['resolution'] = '%sp' % template_dict['height']
635                 elif template_dict.get('width'):
636                     template_dict['resolution'] = '%dx?' % template_dict['width']
637
638             sanitize = lambda k, v: sanitize_filename(
639                 compat_str(v),
640                 restricted=self.params.get('restrictfilenames'),
641                 is_id=(k == 'id' or k.endswith('_id')))
642             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
643                                  for k, v in template_dict.items()
644                                  if v is not None and not isinstance(v, (list, tuple, dict)))
645             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
646
647             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
648
649             # For fields playlist_index and autonumber convert all occurrences
650             # of %(field)s to %(field)0Nd for backward compatibility
651             field_size_compat_map = {
652                 'playlist_index': len(str(template_dict['n_entries'])),
653                 'autonumber': autonumber_size,
654             }
655             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
656             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
657             if mobj:
658                 outtmpl = re.sub(
659                     FIELD_SIZE_COMPAT_RE,
660                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
661                     outtmpl)
662
663             # Missing numeric fields used together with integer presentation types
664             # in format specification will break the argument substitution since
665             # string 'NA' is returned for missing fields. We will patch output
666             # template for missing fields to meet string presentation type.
667             for numeric_field in self._NUMERIC_FIELDS:
668                 if numeric_field not in template_dict:
669                     # As of [1] format syntax is:
670                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
671                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
672                     FORMAT_RE = r'''(?x)
673                         (?<!%)
674                         %
675                         \({0}\)  # mapping key
676                         (?:[#0\-+ ]+)?  # conversion flags (optional)
677                         (?:\d+)?  # minimum field width (optional)
678                         (?:\.\d+)?  # precision (optional)
679                         [hlL]?  # length modifier (optional)
680                         [diouxXeEfFgGcrs%]  # conversion type
681                     '''
682                     outtmpl = re.sub(
683                         FORMAT_RE.format(numeric_field),
684                         r'%({0})s'.format(numeric_field), outtmpl)
685
686             # expand_path translates '%%' into '%' and '$$' into '$'
687             # correspondingly that is not what we want since we need to keep
688             # '%%' intact for template dict substitution step. Working around
689             # with boundary-alike separator hack.
690             sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
691             outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
692
693             # outtmpl should be expand_path'ed before template dict substitution
694             # because meta fields may contain env variables we don't want to
695             # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
696             # title "Hello $PATH", we don't want `$PATH` to be expanded.
697             filename = expand_path(outtmpl).replace(sep, '') % template_dict
698
699             # Temporary fix for #4787
700             # 'Treat' all problem characters by passing filename through preferredencoding
701             # to workaround encoding issues with subprocess on python2 @ Windows
702             if sys.version_info < (3, 0) and sys.platform == 'win32':
703                 filename = encodeFilename(filename, True).decode(preferredencoding())
704             return sanitize_path(filename)
705         except ValueError as err:
706             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
707             return None
708
709     def _match_entry(self, info_dict, incomplete):
710         """ Returns None iff the file should be downloaded """
711
712         video_title = info_dict.get('title', info_dict.get('id', 'video'))
713         if 'title' in info_dict:
714             # This can happen when we're just evaluating the playlist
715             title = info_dict['title']
716             matchtitle = self.params.get('matchtitle', False)
717             if matchtitle:
718                 if not re.search(matchtitle, title, re.IGNORECASE):
719                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
720             rejecttitle = self.params.get('rejecttitle', False)
721             if rejecttitle:
722                 if re.search(rejecttitle, title, re.IGNORECASE):
723                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
724         date = info_dict.get('upload_date')
725         if date is not None:
726             dateRange = self.params.get('daterange', DateRange())
727             if date not in dateRange:
728                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
729         view_count = info_dict.get('view_count')
730         if view_count is not None:
731             min_views = self.params.get('min_views')
732             if min_views is not None and view_count < min_views:
733                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
734             max_views = self.params.get('max_views')
735             if max_views is not None and view_count > max_views:
736                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
737         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
738             return 'Skipping "%s" because it is age restricted' % video_title
739         if self.in_download_archive(info_dict):
740             return '%s has already been recorded in archive' % video_title
741
742         if not incomplete:
743             match_filter = self.params.get('match_filter')
744             if match_filter is not None:
745                 ret = match_filter(info_dict)
746                 if ret is not None:
747                     return ret
748
749         return None
750
751     @staticmethod
752     def add_extra_info(info_dict, extra_info):
753         '''Set the keys from extra_info in info dict if they are missing'''
754         for key, value in extra_info.items():
755             info_dict.setdefault(key, value)
756
757     def extract_info(self, url, download=True, ie_key=None, extra_info={},
758                      process=True, force_generic_extractor=False):
759         '''
760         Returns a list with a dictionary for each video we find.
761         If 'download', also downloads the videos.
762         extra_info is a dict containing the extra values to add to each result
763         '''
764
765         if not ie_key and force_generic_extractor:
766             ie_key = 'Generic'
767
768         if ie_key:
769             ies = [self.get_info_extractor(ie_key)]
770         else:
771             ies = self._ies
772
773         for ie in ies:
774             if not ie.suitable(url):
775                 continue
776
777             ie = self.get_info_extractor(ie.ie_key())
778             if not ie.working():
779                 self.report_warning('The program functionality for this site has been marked as broken, '
780                                     'and will probably not work.')
781
782             try:
783                 ie_result = ie.extract(url)
784                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
785                     break
786                 if isinstance(ie_result, list):
787                     # Backwards compatibility: old IE result format
788                     ie_result = {
789                         '_type': 'compat_list',
790                         'entries': ie_result,
791                     }
792                 self.add_default_extra_info(ie_result, ie, url)
793                 if process:
794                     return self.process_ie_result(ie_result, download, extra_info)
795                 else:
796                     return ie_result
797             except GeoRestrictedError as e:
798                 msg = e.msg
799                 if e.countries:
800                     msg += '\nThis video is available in %s.' % ', '.join(
801                         map(ISO3166Utils.short2full, e.countries))
802                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
803                 self.report_error(msg)
804                 break
805             except ExtractorError as e:  # An error we somewhat expected
806                 self.report_error(compat_str(e), e.format_traceback())
807                 break
808             except MaxDownloadsReached:
809                 raise
810             except Exception as e:
811                 if self.params.get('ignoreerrors', False):
812                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
813                     break
814                 else:
815                     raise
816         else:
817             self.report_error('no suitable InfoExtractor for URL %s' % url)
818
819     def add_default_extra_info(self, ie_result, ie, url):
820         self.add_extra_info(ie_result, {
821             'extractor': ie.IE_NAME,
822             'webpage_url': url,
823             'webpage_url_basename': url_basename(url),
824             'extractor_key': ie.ie_key(),
825         })
826
827     def process_ie_result(self, ie_result, download=True, extra_info={}):
828         """
829         Take the result of the ie(may be modified) and resolve all unresolved
830         references (URLs, playlist items).
831
832         It will also download the videos if 'download'.
833         Returns the resolved ie_result.
834         """
835         result_type = ie_result.get('_type', 'video')
836
837         if result_type in ('url', 'url_transparent'):
838             ie_result['url'] = sanitize_url(ie_result['url'])
839             extract_flat = self.params.get('extract_flat', False)
840             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
841                     extract_flat is True):
842                 if self.params.get('forcejson', False):
843                     self.to_stdout(json.dumps(ie_result))
844                 return ie_result
845
846         if result_type == 'video':
847             self.add_extra_info(ie_result, extra_info)
848             return self.process_video_result(ie_result, download=download)
849         elif result_type == 'url':
850             # We have to add extra_info to the results because it may be
851             # contained in a playlist
852             return self.extract_info(ie_result['url'],
853                                      download,
854                                      ie_key=ie_result.get('ie_key'),
855                                      extra_info=extra_info)
856         elif result_type == 'url_transparent':
857             # Use the information from the embedding page
858             info = self.extract_info(
859                 ie_result['url'], ie_key=ie_result.get('ie_key'),
860                 extra_info=extra_info, download=False, process=False)
861
862             # extract_info may return None when ignoreerrors is enabled and
863             # extraction failed with an error, don't crash and return early
864             # in this case
865             if not info:
866                 return info
867
868             force_properties = dict(
869                 (k, v) for k, v in ie_result.items() if v is not None)
870             for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
871                 if f in force_properties:
872                     del force_properties[f]
873             new_result = info.copy()
874             new_result.update(force_properties)
875
876             # Extracted info may not be a video result (i.e.
877             # info.get('_type', 'video') != video) but rather an url or
878             # url_transparent. In such cases outer metadata (from ie_result)
879             # should be propagated to inner one (info). For this to happen
880             # _type of info should be overridden with url_transparent. This
881             # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
882             if new_result.get('_type') == 'url':
883                 new_result['_type'] = 'url_transparent'
884
885             return self.process_ie_result(
886                 new_result, download=download, extra_info=extra_info)
887         elif result_type in ('playlist', 'multi_video'):
888             # We process each entry in the playlist
889             playlist = ie_result.get('title') or ie_result.get('id')
890             self.to_screen('[download] Downloading playlist: %s' % playlist)
891
892             playlist_results = []
893
894             playliststart = self.params.get('playliststart', 1) - 1
895             playlistend = self.params.get('playlistend')
896             # For backwards compatibility, interpret -1 as whole list
897             if playlistend == -1:
898                 playlistend = None
899
900             playlistitems_str = self.params.get('playlist_items')
901             playlistitems = None
902             if playlistitems_str is not None:
903                 def iter_playlistitems(format):
904                     for string_segment in format.split(','):
905                         if '-' in string_segment:
906                             start, end = string_segment.split('-')
907                             for item in range(int(start), int(end) + 1):
908                                 yield int(item)
909                         else:
910                             yield int(string_segment)
911                 playlistitems = iter_playlistitems(playlistitems_str)
912
913             ie_entries = ie_result['entries']
914             if isinstance(ie_entries, list):
915                 n_all_entries = len(ie_entries)
916                 if playlistitems:
917                     entries = [
918                         ie_entries[i - 1] for i in playlistitems
919                         if -n_all_entries <= i - 1 < n_all_entries]
920                 else:
921                     entries = ie_entries[playliststart:playlistend]
922                 n_entries = len(entries)
923                 self.to_screen(
924                     '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
925                     (ie_result['extractor'], playlist, n_all_entries, n_entries))
926             elif isinstance(ie_entries, PagedList):
927                 if playlistitems:
928                     entries = []
929                     for item in playlistitems:
930                         entries.extend(ie_entries.getslice(
931                             item - 1, item
932                         ))
933                 else:
934                     entries = ie_entries.getslice(
935                         playliststart, playlistend)
936                 n_entries = len(entries)
937                 self.to_screen(
938                     '[%s] playlist %s: Downloading %d videos' %
939                     (ie_result['extractor'], playlist, n_entries))
940             else:  # iterable
941                 if playlistitems:
942                     entry_list = list(ie_entries)
943                     entries = [entry_list[i - 1] for i in playlistitems]
944                 else:
945                     entries = list(itertools.islice(
946                         ie_entries, playliststart, playlistend))
947                 n_entries = len(entries)
948                 self.to_screen(
949                     '[%s] playlist %s: Downloading %d videos' %
950                     (ie_result['extractor'], playlist, n_entries))
951
952             if self.params.get('playlistreverse', False):
953                 entries = entries[::-1]
954
955             if self.params.get('playlistrandom', False):
956                 random.shuffle(entries)
957
958             x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
959
960             for i, entry in enumerate(entries, 1):
961                 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
962                 # This __x_forwarded_for_ip thing is a bit ugly but requires
963                 # minimal changes
964                 if x_forwarded_for:
965                     entry['__x_forwarded_for_ip'] = x_forwarded_for
966                 extra = {
967                     'n_entries': n_entries,
968                     'playlist': playlist,
969                     'playlist_id': ie_result.get('id'),
970                     'playlist_title': ie_result.get('title'),
971                     'playlist_index': i + playliststart,
972                     'extractor': ie_result['extractor'],
973                     'webpage_url': ie_result['webpage_url'],
974                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
975                     'extractor_key': ie_result['extractor_key'],
976                 }
977
978                 reason = self._match_entry(entry, incomplete=True)
979                 if reason is not None:
980                     self.to_screen('[download] ' + reason)
981                     continue
982
983                 entry_result = self.process_ie_result(entry,
984                                                       download=download,
985                                                       extra_info=extra)
986                 playlist_results.append(entry_result)
987             ie_result['entries'] = playlist_results
988             self.to_screen('[download] Finished downloading playlist: %s' % playlist)
989             return ie_result
990         elif result_type == 'compat_list':
991             self.report_warning(
992                 'Extractor %s returned a compat_list result. '
993                 'It needs to be updated.' % ie_result.get('extractor'))
994
995             def _fixup(r):
996                 self.add_extra_info(
997                     r,
998                     {
999                         'extractor': ie_result['extractor'],
1000                         'webpage_url': ie_result['webpage_url'],
1001                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
1002                         'extractor_key': ie_result['extractor_key'],
1003                     }
1004                 )
1005                 return r
1006             ie_result['entries'] = [
1007                 self.process_ie_result(_fixup(r), download, extra_info)
1008                 for r in ie_result['entries']
1009             ]
1010             return ie_result
1011         else:
1012             raise Exception('Invalid result type: %s' % result_type)
1013
1014     def _build_format_filter(self, filter_spec):
1015         " Returns a function to filter the formats according to the filter_spec "
1016
1017         OPERATORS = {
1018             '<': operator.lt,
1019             '<=': operator.le,
1020             '>': operator.gt,
1021             '>=': operator.ge,
1022             '=': operator.eq,
1023             '!=': operator.ne,
1024         }
1025         operator_rex = re.compile(r'''(?x)\s*
1026             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
1027             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1028             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1029             $
1030             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1031         m = operator_rex.search(filter_spec)
1032         if m:
1033             try:
1034                 comparison_value = int(m.group('value'))
1035             except ValueError:
1036                 comparison_value = parse_filesize(m.group('value'))
1037                 if comparison_value is None:
1038                     comparison_value = parse_filesize(m.group('value') + 'B')
1039                 if comparison_value is None:
1040                     raise ValueError(
1041                         'Invalid value %r in format specification %r' % (
1042                             m.group('value'), filter_spec))
1043             op = OPERATORS[m.group('op')]
1044
1045         if not m:
1046             STR_OPERATORS = {
1047                 '=': operator.eq,
1048                 '!=': operator.ne,
1049                 '^=': lambda attr, value: attr.startswith(value),
1050                 '$=': lambda attr, value: attr.endswith(value),
1051                 '*=': lambda attr, value: value in attr,
1052             }
1053             str_operator_rex = re.compile(r'''(?x)
1054                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1055                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1056                 \s*(?P<value>[a-zA-Z0-9._-]+)
1057                 \s*$
1058                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1059             m = str_operator_rex.search(filter_spec)
1060             if m:
1061                 comparison_value = m.group('value')
1062                 op = STR_OPERATORS[m.group('op')]
1063
1064         if not m:
1065             raise ValueError('Invalid filter specification %r' % filter_spec)
1066
1067         def _filter(f):
1068             actual_value = f.get(m.group('key'))
1069             if actual_value is None:
1070                 return m.group('none_inclusive')
1071             return op(actual_value, comparison_value)
1072         return _filter
1073
1074     def _default_format_spec(self, info_dict, download=True):
1075         req_format_list = []
1076
1077         def can_have_partial_formats():
1078             if self.params.get('simulate', False):
1079                 return True
1080             if not download:
1081                 return True
1082             if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
1083                 return False
1084             if info_dict.get('is_live'):
1085                 return False
1086             merger = FFmpegMergerPP(self)
1087             return merger.available and merger.can_merge()
1088         if can_have_partial_formats():
1089             req_format_list.append('bestvideo+bestaudio')
1090         req_format_list.append('best')
1091         return '/'.join(req_format_list)
1092
1093     def build_format_selector(self, format_spec):
1094         def syntax_error(note, start):
1095             message = (
1096                 'Invalid format specification: '
1097                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1098             return SyntaxError(message)
1099
1100         PICKFIRST = 'PICKFIRST'
1101         MERGE = 'MERGE'
1102         SINGLE = 'SINGLE'
1103         GROUP = 'GROUP'
1104         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1105
1106         def _parse_filter(tokens):
1107             filter_parts = []
1108             for type, string, start, _, _ in tokens:
1109                 if type == tokenize.OP and string == ']':
1110                     return ''.join(filter_parts)
1111                 else:
1112                     filter_parts.append(string)
1113
1114         def _remove_unused_ops(tokens):
1115             # Remove operators that we don't use and join them with the surrounding strings
1116             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1117             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1118             last_string, last_start, last_end, last_line = None, None, None, None
1119             for type, string, start, end, line in tokens:
1120                 if type == tokenize.OP and string == '[':
1121                     if last_string:
1122                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1123                         last_string = None
1124                     yield type, string, start, end, line
1125                     # everything inside brackets will be handled by _parse_filter
1126                     for type, string, start, end, line in tokens:
1127                         yield type, string, start, end, line
1128                         if type == tokenize.OP and string == ']':
1129                             break
1130                 elif type == tokenize.OP and string in ALLOWED_OPS:
1131                     if last_string:
1132                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1133                         last_string = None
1134                     yield type, string, start, end, line
1135                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1136                     if not last_string:
1137                         last_string = string
1138                         last_start = start
1139                         last_end = end
1140                     else:
1141                         last_string += string
1142             if last_string:
1143                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1144
1145         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1146             selectors = []
1147             current_selector = None
1148             for type, string, start, _, _ in tokens:
1149                 # ENCODING is only defined in python 3.x
1150                 if type == getattr(tokenize, 'ENCODING', None):
1151                     continue
1152                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1153                     current_selector = FormatSelector(SINGLE, string, [])
1154                 elif type == tokenize.OP:
1155                     if string == ')':
1156                         if not inside_group:
1157                             # ')' will be handled by the parentheses group
1158                             tokens.restore_last_token()
1159                         break
1160                     elif inside_merge and string in ['/', ',']:
1161                         tokens.restore_last_token()
1162                         break
1163                     elif inside_choice and string == ',':
1164                         tokens.restore_last_token()
1165                         break
1166                     elif string == ',':
1167                         if not current_selector:
1168                             raise syntax_error('"," must follow a format selector', start)
1169                         selectors.append(current_selector)
1170                         current_selector = None
1171                     elif string == '/':
1172                         if not current_selector:
1173                             raise syntax_error('"/" must follow a format selector', start)
1174                         first_choice = current_selector
1175                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1176                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1177                     elif string == '[':
1178                         if not current_selector:
1179                             current_selector = FormatSelector(SINGLE, 'best', [])
1180                         format_filter = _parse_filter(tokens)
1181                         current_selector.filters.append(format_filter)
1182                     elif string == '(':
1183                         if current_selector:
1184                             raise syntax_error('Unexpected "("', start)
1185                         group = _parse_format_selection(tokens, inside_group=True)
1186                         current_selector = FormatSelector(GROUP, group, [])
1187                     elif string == '+':
1188                         video_selector = current_selector
1189                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1190                         if not video_selector or not audio_selector:
1191                             raise syntax_error('"+" must be between two format selectors', start)
1192                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1193                     else:
1194                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1195                 elif type == tokenize.ENDMARKER:
1196                     break
1197             if current_selector:
1198                 selectors.append(current_selector)
1199             return selectors
1200
1201         def _build_selector_function(selector):
1202             if isinstance(selector, list):
1203                 fs = [_build_selector_function(s) for s in selector]
1204
1205                 def selector_function(ctx):
1206                     for f in fs:
1207                         for format in f(ctx):
1208                             yield format
1209                 return selector_function
1210             elif selector.type == GROUP:
1211                 selector_function = _build_selector_function(selector.selector)
1212             elif selector.type == PICKFIRST:
1213                 fs = [_build_selector_function(s) for s in selector.selector]
1214
1215                 def selector_function(ctx):
1216                     for f in fs:
1217                         picked_formats = list(f(ctx))
1218                         if picked_formats:
1219                             return picked_formats
1220                     return []
1221             elif selector.type == SINGLE:
1222                 format_spec = selector.selector
1223
1224                 def selector_function(ctx):
1225                     formats = list(ctx['formats'])
1226                     if not formats:
1227                         return
1228                     if format_spec == 'all':
1229                         for f in formats:
1230                             yield f
1231                     elif format_spec in ['best', 'worst', None]:
1232                         format_idx = 0 if format_spec == 'worst' else -1
1233                         audiovideo_formats = [
1234                             f for f in formats
1235                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1236                         if audiovideo_formats:
1237                             yield audiovideo_formats[format_idx]
1238                         # for extractors with incomplete formats (audio only (soundcloud)
1239                         # or video only (imgur)) we will fallback to best/worst
1240                         # {video,audio}-only format
1241                         elif ctx['incomplete_formats']:
1242                             yield formats[format_idx]
1243                     elif format_spec == 'bestaudio':
1244                         audio_formats = [
1245                             f for f in formats
1246                             if f.get('vcodec') == 'none']
1247                         if audio_formats:
1248                             yield audio_formats[-1]
1249                     elif format_spec == 'worstaudio':
1250                         audio_formats = [
1251                             f for f in formats
1252                             if f.get('vcodec') == 'none']
1253                         if audio_formats:
1254                             yield audio_formats[0]
1255                     elif format_spec == 'bestvideo':
1256                         video_formats = [
1257                             f for f in formats
1258                             if f.get('acodec') == 'none']
1259                         if video_formats:
1260                             yield video_formats[-1]
1261                     elif format_spec == 'worstvideo':
1262                         video_formats = [
1263                             f for f in formats
1264                             if f.get('acodec') == 'none']
1265                         if video_formats:
1266                             yield video_formats[0]
1267                     else:
1268                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1269                         if format_spec in extensions:
1270                             filter_f = lambda f: f['ext'] == format_spec
1271                         else:
1272                             filter_f = lambda f: f['format_id'] == format_spec
1273                         matches = list(filter(filter_f, formats))
1274                         if matches:
1275                             yield matches[-1]
1276             elif selector.type == MERGE:
1277                 def _merge(formats_info):
1278                     format_1, format_2 = [f['format_id'] for f in formats_info]
1279                     # The first format must contain the video and the
1280                     # second the audio
1281                     if formats_info[0].get('vcodec') == 'none':
1282                         self.report_error('The first format must '
1283                                           'contain the video, try using '
1284                                           '"-f %s+%s"' % (format_2, format_1))
1285                         return
1286                     # Formats must be opposite (video+audio)
1287                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1288                         self.report_error(
1289                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1290                             % (format_1, format_2))
1291                         return
1292                     output_ext = (
1293                         formats_info[0]['ext']
1294                         if self.params.get('merge_output_format') is None
1295                         else self.params['merge_output_format'])
1296                     return {
1297                         'requested_formats': formats_info,
1298                         'format': '%s+%s' % (formats_info[0].get('format'),
1299                                              formats_info[1].get('format')),
1300                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1301                                                 formats_info[1].get('format_id')),
1302                         'width': formats_info[0].get('width'),
1303                         'height': formats_info[0].get('height'),
1304                         'resolution': formats_info[0].get('resolution'),
1305                         'fps': formats_info[0].get('fps'),
1306                         'vcodec': formats_info[0].get('vcodec'),
1307                         'vbr': formats_info[0].get('vbr'),
1308                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1309                         'acodec': formats_info[1].get('acodec'),
1310                         'abr': formats_info[1].get('abr'),
1311                         'ext': output_ext,
1312                     }
1313                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1314
1315                 def selector_function(ctx):
1316                     for pair in itertools.product(
1317                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1318                         yield _merge(pair)
1319
1320             filters = [self._build_format_filter(f) for f in selector.filters]
1321
1322             def final_selector(ctx):
1323                 ctx_copy = copy.deepcopy(ctx)
1324                 for _filter in filters:
1325                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1326                 return selector_function(ctx_copy)
1327             return final_selector
1328
1329         stream = io.BytesIO(format_spec.encode('utf-8'))
1330         try:
1331             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1332         except tokenize.TokenError:
1333             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1334
1335         class TokenIterator(object):
1336             def __init__(self, tokens):
1337                 self.tokens = tokens
1338                 self.counter = 0
1339
1340             def __iter__(self):
1341                 return self
1342
1343             def __next__(self):
1344                 if self.counter >= len(self.tokens):
1345                     raise StopIteration()
1346                 value = self.tokens[self.counter]
1347                 self.counter += 1
1348                 return value
1349
1350             next = __next__
1351
1352             def restore_last_token(self):
1353                 self.counter -= 1
1354
1355         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1356         return _build_selector_function(parsed_selector)
1357
1358     def _calc_headers(self, info_dict):
1359         res = std_headers.copy()
1360
1361         add_headers = info_dict.get('http_headers')
1362         if add_headers:
1363             res.update(add_headers)
1364
1365         cookies = self._calc_cookies(info_dict)
1366         if cookies:
1367             res['Cookie'] = cookies
1368
1369         if 'X-Forwarded-For' not in res:
1370             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1371             if x_forwarded_for_ip:
1372                 res['X-Forwarded-For'] = x_forwarded_for_ip
1373
1374         return res
1375
1376     def _calc_cookies(self, info_dict):
1377         pr = sanitized_Request(info_dict['url'])
1378         self.cookiejar.add_cookie_header(pr)
1379         return pr.get_header('Cookie')
1380
1381     def process_video_result(self, info_dict, download=True):
1382         assert info_dict.get('_type', 'video') == 'video'
1383
1384         if 'id' not in info_dict:
1385             raise ExtractorError('Missing "id" field in extractor result')
1386         if 'title' not in info_dict:
1387             raise ExtractorError('Missing "title" field in extractor result')
1388
1389         def report_force_conversion(field, field_not, conversion):
1390             self.report_warning(
1391                 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
1392                 % (field, field_not, conversion))
1393
1394         def sanitize_string_field(info, string_field):
1395             field = info.get(string_field)
1396             if field is None or isinstance(field, compat_str):
1397                 return
1398             report_force_conversion(string_field, 'a string', 'string')
1399             info[string_field] = compat_str(field)
1400
1401         def sanitize_numeric_fields(info):
1402             for numeric_field in self._NUMERIC_FIELDS:
1403                 field = info.get(numeric_field)
1404                 if field is None or isinstance(field, compat_numeric_types):
1405                     continue
1406                 report_force_conversion(numeric_field, 'numeric', 'int')
1407                 info[numeric_field] = int_or_none(field)
1408
1409         sanitize_string_field(info_dict, 'id')
1410         sanitize_numeric_fields(info_dict)
1411
1412         if 'playlist' not in info_dict:
1413             # It isn't part of a playlist
1414             info_dict['playlist'] = None
1415             info_dict['playlist_index'] = None
1416
1417         thumbnails = info_dict.get('thumbnails')
1418         if thumbnails is None:
1419             thumbnail = info_dict.get('thumbnail')
1420             if thumbnail:
1421                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1422         if thumbnails:
1423             thumbnails.sort(key=lambda t: (
1424                 t.get('preference') if t.get('preference') is not None else -1,
1425                 t.get('width') if t.get('width') is not None else -1,
1426                 t.get('height') if t.get('height') is not None else -1,
1427                 t.get('id') if t.get('id') is not None else '', t.get('url')))
1428             for i, t in enumerate(thumbnails):
1429                 t['url'] = sanitize_url(t['url'])
1430                 if t.get('width') and t.get('height'):
1431                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
1432                 if t.get('id') is None:
1433                     t['id'] = '%d' % i
1434
1435         if self.params.get('list_thumbnails'):
1436             self.list_thumbnails(info_dict)
1437             return
1438
1439         thumbnail = info_dict.get('thumbnail')
1440         if thumbnail:
1441             info_dict['thumbnail'] = sanitize_url(thumbnail)
1442         elif thumbnails:
1443             info_dict['thumbnail'] = thumbnails[-1]['url']
1444
1445         if 'display_id' not in info_dict and 'id' in info_dict:
1446             info_dict['display_id'] = info_dict['id']
1447
1448         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1449             # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1450             # see http://bugs.python.org/issue1646728)
1451             try:
1452                 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1453                 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1454             except (ValueError, OverflowError, OSError):
1455                 pass
1456
1457         # Auto generate title fields corresponding to the *_number fields when missing
1458         # in order to always have clean titles. This is very common for TV series.
1459         for field in ('chapter', 'season', 'episode'):
1460             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1461                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1462
1463         subtitles = info_dict.get('subtitles')
1464         if subtitles:
1465             for _, subtitle in subtitles.items():
1466                 for subtitle_format in subtitle:
1467                     if subtitle_format.get('url'):
1468                         subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1469                     if subtitle_format.get('ext') is None:
1470                         subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1471
1472         if self.params.get('listsubtitles', False):
1473             if 'automatic_captions' in info_dict:
1474                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1475             self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1476             return
1477         info_dict['requested_subtitles'] = self.process_subtitles(
1478             info_dict['id'], subtitles,
1479             info_dict.get('automatic_captions'))
1480
1481         # We now pick which formats have to be downloaded
1482         if info_dict.get('formats') is None:
1483             # There's only one format available
1484             formats = [info_dict]
1485         else:
1486             formats = info_dict['formats']
1487
1488         if not formats:
1489             raise ExtractorError('No video formats found!')
1490
1491         def is_wellformed(f):
1492             url = f.get('url')
1493             if not url:
1494                 self.report_warning(
1495                     '"url" field is missing or empty - skipping format, '
1496                     'there is an error in extractor')
1497                 return False
1498             if isinstance(url, bytes):
1499                 sanitize_string_field(f, 'url')
1500             return True
1501
1502         # Filter out malformed formats for better extraction robustness
1503         formats = list(filter(is_wellformed, formats))
1504
1505         formats_dict = {}
1506
1507         # We check that all the formats have the format and format_id fields
1508         for i, format in enumerate(formats):
1509             sanitize_string_field(format, 'format_id')
1510             sanitize_numeric_fields(format)
1511             format['url'] = sanitize_url(format['url'])
1512             if not format.get('format_id'):
1513                 format['format_id'] = compat_str(i)
1514             else:
1515                 # Sanitize format_id from characters used in format selector expression
1516                 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
1517             format_id = format['format_id']
1518             if format_id not in formats_dict:
1519                 formats_dict[format_id] = []
1520             formats_dict[format_id].append(format)
1521
1522         # Make sure all formats have unique format_id
1523         for format_id, ambiguous_formats in formats_dict.items():
1524             if len(ambiguous_formats) > 1:
1525                 for i, format in enumerate(ambiguous_formats):
1526                     format['format_id'] = '%s-%d' % (format_id, i)
1527
1528         for i, format in enumerate(formats):
1529             if format.get('format') is None:
1530                 format['format'] = '{id} - {res}{note}'.format(
1531                     id=format['format_id'],
1532                     res=self.format_resolution(format),
1533                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1534                 )
1535             # Automatically determine file extension if missing
1536             if format.get('ext') is None:
1537                 format['ext'] = determine_ext(format['url']).lower()
1538             # Automatically determine protocol if missing (useful for format
1539             # selection purposes)
1540             if format.get('protocol') is None:
1541                 format['protocol'] = determine_protocol(format)
1542             # Add HTTP headers, so that external programs can use them from the
1543             # json output
1544             full_format_info = info_dict.copy()
1545             full_format_info.update(format)
1546             format['http_headers'] = self._calc_headers(full_format_info)
1547         # Remove private housekeeping stuff
1548         if '__x_forwarded_for_ip' in info_dict:
1549             del info_dict['__x_forwarded_for_ip']
1550
1551         # TODO Central sorting goes here
1552
1553         if formats[0] is not info_dict:
1554             # only set the 'formats' fields if the original info_dict list them
1555             # otherwise we end up with a circular reference, the first (and unique)
1556             # element in the 'formats' field in info_dict is info_dict itself,
1557             # which can't be exported to json
1558             info_dict['formats'] = formats
1559         if self.params.get('listformats'):
1560             self.list_formats(info_dict)
1561             return
1562
1563         req_format = self.params.get('format')
1564         if req_format is None:
1565             req_format = self._default_format_spec(info_dict, download=download)
1566             if self.params.get('verbose'):
1567                 self.to_stdout('[debug] Default format spec: %s' % req_format)
1568
1569         format_selector = self.build_format_selector(req_format)
1570
1571         # While in format selection we may need to have an access to the original
1572         # format set in order to calculate some metrics or do some processing.
1573         # For now we need to be able to guess whether original formats provided
1574         # by extractor are incomplete or not (i.e. whether extractor provides only
1575         # video-only or audio-only formats) for proper formats selection for
1576         # extractors with such incomplete formats (see
1577         # https://github.com/rg3/youtube-dl/pull/5556).
1578         # Since formats may be filtered during format selection and may not match
1579         # the original formats the results may be incorrect. Thus original formats
1580         # or pre-calculated metrics should be passed to format selection routines
1581         # as well.
1582         # We will pass a context object containing all necessary additional data
1583         # instead of just formats.
1584         # This fixes incorrect format selection issue (see
1585         # https://github.com/rg3/youtube-dl/issues/10083).
1586         incomplete_formats = (
1587             # All formats are video-only or
1588             all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
1589             # all formats are audio-only
1590             all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
1591
1592         ctx = {
1593             'formats': formats,
1594             'incomplete_formats': incomplete_formats,
1595         }
1596
1597         formats_to_download = list(format_selector(ctx))
1598         if not formats_to_download:
1599             raise ExtractorError('requested format not available',
1600                                  expected=True)
1601
1602         if download:
1603             if len(formats_to_download) > 1:
1604                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1605             for format in formats_to_download:
1606                 new_info = dict(info_dict)
1607                 new_info.update(format)
1608                 self.process_info(new_info)
1609         # We update the info dict with the best quality format (backwards compatibility)
1610         info_dict.update(formats_to_download[-1])
1611         return info_dict
1612
1613     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1614         """Select the requested subtitles and their format"""
1615         available_subs = {}
1616         if normal_subtitles and self.params.get('writesubtitles'):
1617             available_subs.update(normal_subtitles)
1618         if automatic_captions and self.params.get('writeautomaticsub'):
1619             for lang, cap_info in automatic_captions.items():
1620                 if lang not in available_subs:
1621                     available_subs[lang] = cap_info
1622
1623         if (not self.params.get('writesubtitles') and not
1624                 self.params.get('writeautomaticsub') or not
1625                 available_subs):
1626             return None
1627
1628         if self.params.get('allsubtitles', False):
1629             requested_langs = available_subs.keys()
1630         else:
1631             if self.params.get('subtitleslangs', False):
1632                 requested_langs = self.params.get('subtitleslangs')
1633             elif 'en' in available_subs:
1634                 requested_langs = ['en']
1635             else:
1636                 requested_langs = [list(available_subs.keys())[0]]
1637
1638         formats_query = self.params.get('subtitlesformat', 'best')
1639         formats_preference = formats_query.split('/') if formats_query else []
1640         subs = {}
1641         for lang in requested_langs:
1642             formats = available_subs.get(lang)
1643             if formats is None:
1644                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1645                 continue
1646             for ext in formats_preference:
1647                 if ext == 'best':
1648                     f = formats[-1]
1649                     break
1650                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1651                 if matches:
1652                     f = matches[-1]
1653                     break
1654             else:
1655                 f = formats[-1]
1656                 self.report_warning(
1657                     'No subtitle format found matching "%s" for language %s, '
1658                     'using %s' % (formats_query, lang, f['ext']))
1659             subs[lang] = f
1660         return subs
1661
1662     def process_info(self, info_dict):
1663         """Process a single resolved IE result."""
1664
1665         assert info_dict.get('_type', 'video') == 'video'
1666
1667         max_downloads = self.params.get('max_downloads')
1668         if max_downloads is not None:
1669             if self._num_downloads >= int(max_downloads):
1670                 raise MaxDownloadsReached()
1671
1672         info_dict['fulltitle'] = info_dict['title']
1673         if len(info_dict['title']) > 200:
1674             info_dict['title'] = info_dict['title'][:197] + '...'
1675
1676         if 'format' not in info_dict:
1677             info_dict['format'] = info_dict['ext']
1678
1679         reason = self._match_entry(info_dict, incomplete=False)
1680         if reason is not None:
1681             self.to_screen('[download] ' + reason)
1682             return
1683
1684         self._num_downloads += 1
1685
1686         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1687
1688         # Forced printings
1689         if self.params.get('forcetitle', False):
1690             self.to_stdout(info_dict['fulltitle'])
1691         if self.params.get('forceid', False):
1692             self.to_stdout(info_dict['id'])
1693         if self.params.get('forceurl', False):
1694             if info_dict.get('requested_formats') is not None:
1695                 for f in info_dict['requested_formats']:
1696                     self.to_stdout(f['url'] + f.get('play_path', ''))
1697             else:
1698                 # For RTMP URLs, also include the playpath
1699                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1700         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1701             self.to_stdout(info_dict['thumbnail'])
1702         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1703             self.to_stdout(info_dict['description'])
1704         if self.params.get('forcefilename', False) and filename is not None:
1705             self.to_stdout(filename)
1706         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1707             self.to_stdout(formatSeconds(info_dict['duration']))
1708         if self.params.get('forceformat', False):
1709             self.to_stdout(info_dict['format'])
1710         if self.params.get('forcejson', False):
1711             self.to_stdout(json.dumps(info_dict))
1712
1713         # Do nothing else if in simulate mode
1714         if self.params.get('simulate', False):
1715             return
1716
1717         if filename is None:
1718             return
1719
1720         def ensure_dir_exists(path):
1721             try:
1722                 dn = os.path.dirname(path)
1723                 if dn and not os.path.exists(dn):
1724                     os.makedirs(dn)
1725                 return True
1726             except (OSError, IOError) as err:
1727                 self.report_error('unable to create directory ' + error_to_compat_str(err))
1728                 return False
1729
1730         if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
1731             return
1732
1733         if self.params.get('writedescription', False):
1734             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1735             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1736                 self.to_screen('[info] Video description is already present')
1737             elif info_dict.get('description') is None:
1738                 self.report_warning('There\'s no description to write.')
1739             else:
1740                 try:
1741                     self.to_screen('[info] Writing video description to: ' + descfn)
1742                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1743                         descfile.write(info_dict['description'])
1744                 except (OSError, IOError):
1745                     self.report_error('Cannot write description file ' + descfn)
1746                     return
1747
1748         if self.params.get('writeannotations', False):
1749             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1750             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1751                 self.to_screen('[info] Video annotations are already present')
1752             else:
1753                 try:
1754                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1755                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1756                         annofile.write(info_dict['annotations'])
1757                 except (KeyError, TypeError):
1758                     self.report_warning('There are no annotations to write.')
1759                 except (OSError, IOError):
1760                     self.report_error('Cannot write annotations file: ' + annofn)
1761                     return
1762
1763         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1764                                        self.params.get('writeautomaticsub')])
1765
1766         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1767             # subtitles download errors are already managed as troubles in relevant IE
1768             # that way it will silently go on when used with unsupporting IE
1769             subtitles = info_dict['requested_subtitles']
1770             ie = self.get_info_extractor(info_dict['extractor_key'])
1771             for sub_lang, sub_info in subtitles.items():
1772                 sub_format = sub_info['ext']
1773                 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1774                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1775                     self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
1776                 else:
1777                     self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1778                     if sub_info.get('data') is not None:
1779                         try:
1780                             # Use newline='' to prevent conversion of newline characters
1781                             # See https://github.com/rg3/youtube-dl/issues/10268
1782                             with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1783                                 subfile.write(sub_info['data'])
1784                         except (OSError, IOError):
1785                             self.report_error('Cannot write subtitles file ' + sub_filename)
1786                             return
1787                     else:
1788                         try:
1789                             sub_data = ie._request_webpage(
1790                                 sub_info['url'], info_dict['id'], note=False).read()
1791                             with io.open(encodeFilename(sub_filename), 'wb') as subfile:
1792                                 subfile.write(sub_data)
1793                         except (ExtractorError, IOError, OSError, ValueError) as err:
1794                             self.report_warning('Unable to download subtitle for "%s": %s' %
1795                                                 (sub_lang, error_to_compat_str(err)))
1796                             continue
1797
1798         if self.params.get('writeinfojson', False):
1799             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1800             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1801                 self.to_screen('[info] Video description metadata is already present')
1802             else:
1803                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1804                 try:
1805                     write_json_file(self.filter_requested_info(info_dict), infofn)
1806                 except (OSError, IOError):
1807                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1808                     return
1809
1810         self._write_thumbnails(info_dict, filename)
1811
1812         if not self.params.get('skip_download', False):
1813             try:
1814                 def dl(name, info):
1815                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1816                     for ph in self._progress_hooks:
1817                         fd.add_progress_hook(ph)
1818                     if self.params.get('verbose'):
1819                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1820                     return fd.download(name, info)
1821
1822                 if info_dict.get('requested_formats') is not None:
1823                     downloaded = []
1824                     success = True
1825                     merger = FFmpegMergerPP(self)
1826                     if not merger.available:
1827                         postprocessors = []
1828                         self.report_warning('You have requested multiple '
1829                                             'formats but ffmpeg or avconv are not installed.'
1830                                             ' The formats won\'t be merged.')
1831                     else:
1832                         postprocessors = [merger]
1833
1834                     def compatible_formats(formats):
1835                         video, audio = formats
1836                         # Check extension
1837                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1838                         if video_ext and audio_ext:
1839                             COMPATIBLE_EXTS = (
1840                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1841                                 ('webm')
1842                             )
1843                             for exts in COMPATIBLE_EXTS:
1844                                 if video_ext in exts and audio_ext in exts:
1845                                     return True
1846                         # TODO: Check acodec/vcodec
1847                         return False
1848
1849                     filename_real_ext = os.path.splitext(filename)[1][1:]
1850                     filename_wo_ext = (
1851                         os.path.splitext(filename)[0]
1852                         if filename_real_ext == info_dict['ext']
1853                         else filename)
1854                     requested_formats = info_dict['requested_formats']
1855                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1856                         info_dict['ext'] = 'mkv'
1857                         self.report_warning(
1858                             'Requested formats are incompatible for merge and will be merged into mkv.')
1859                     # Ensure filename always has a correct extension for successful merge
1860                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1861                     if os.path.exists(encodeFilename(filename)):
1862                         self.to_screen(
1863                             '[download] %s has already been downloaded and '
1864                             'merged' % filename)
1865                     else:
1866                         for f in requested_formats:
1867                             new_info = dict(info_dict)
1868                             new_info.update(f)
1869                             fname = prepend_extension(
1870                                 self.prepare_filename(new_info),
1871                                 'f%s' % f['format_id'], new_info['ext'])
1872                             if not ensure_dir_exists(fname):
1873                                 return
1874                             downloaded.append(fname)
1875                             partial_success = dl(fname, new_info)
1876                             success = success and partial_success
1877                         info_dict['__postprocessors'] = postprocessors
1878                         info_dict['__files_to_merge'] = downloaded
1879                 else:
1880                     # Just a single file
1881                     success = dl(filename, info_dict)
1882             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1883                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1884                 return
1885             except (OSError, IOError) as err:
1886                 raise UnavailableVideoError(err)
1887             except (ContentTooShortError, ) as err:
1888                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1889                 return
1890
1891             if success and filename != '-':
1892                 # Fixup content
1893                 fixup_policy = self.params.get('fixup')
1894                 if fixup_policy is None:
1895                     fixup_policy = 'detect_or_warn'
1896
1897                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1898
1899                 stretched_ratio = info_dict.get('stretched_ratio')
1900                 if stretched_ratio is not None and stretched_ratio != 1:
1901                     if fixup_policy == 'warn':
1902                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1903                             info_dict['id'], stretched_ratio))
1904                     elif fixup_policy == 'detect_or_warn':
1905                         stretched_pp = FFmpegFixupStretchedPP(self)
1906                         if stretched_pp.available:
1907                             info_dict.setdefault('__postprocessors', [])
1908                             info_dict['__postprocessors'].append(stretched_pp)
1909                         else:
1910                             self.report_warning(
1911                                 '%s: Non-uniform pixel ratio (%s). %s'
1912                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1913                     else:
1914                         assert fixup_policy in ('ignore', 'never')
1915
1916                 if (info_dict.get('requested_formats') is None and
1917                         info_dict.get('container') == 'm4a_dash'):
1918                     if fixup_policy == 'warn':
1919                         self.report_warning(
1920                             '%s: writing DASH m4a. '
1921                             'Only some players support this container.'
1922                             % info_dict['id'])
1923                     elif fixup_policy == 'detect_or_warn':
1924                         fixup_pp = FFmpegFixupM4aPP(self)
1925                         if fixup_pp.available:
1926                             info_dict.setdefault('__postprocessors', [])
1927                             info_dict['__postprocessors'].append(fixup_pp)
1928                         else:
1929                             self.report_warning(
1930                                 '%s: writing DASH m4a. '
1931                                 'Only some players support this container. %s'
1932                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1933                     else:
1934                         assert fixup_policy in ('ignore', 'never')
1935
1936                 if (info_dict.get('protocol') == 'm3u8_native' or
1937                         info_dict.get('protocol') == 'm3u8' and
1938                         self.params.get('hls_prefer_native')):
1939                     if fixup_policy == 'warn':
1940                         self.report_warning('%s: malformed AAC bitstream detected.' % (
1941                             info_dict['id']))
1942                     elif fixup_policy == 'detect_or_warn':
1943                         fixup_pp = FFmpegFixupM3u8PP(self)
1944                         if fixup_pp.available:
1945                             info_dict.setdefault('__postprocessors', [])
1946                             info_dict['__postprocessors'].append(fixup_pp)
1947                         else:
1948                             self.report_warning(
1949                                 '%s: malformed AAC bitstream detected. %s'
1950                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1951                     else:
1952                         assert fixup_policy in ('ignore', 'never')
1953
1954                 try:
1955                     self.post_process(filename, info_dict)
1956                 except (PostProcessingError) as err:
1957                     self.report_error('postprocessing: %s' % str(err))
1958                     return
1959                 self.record_download_archive(info_dict)
1960
1961     def download(self, url_list):
1962         """Download a given list of URLs."""
1963         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1964         if (len(url_list) > 1 and
1965                 outtmpl != '-' and
1966                 '%' not in outtmpl and
1967                 self.params.get('max_downloads') != 1):
1968             raise SameFileError(outtmpl)
1969
1970         for url in url_list:
1971             try:
1972                 # It also downloads the videos
1973                 res = self.extract_info(
1974                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1975             except UnavailableVideoError:
1976                 self.report_error('unable to download video')
1977             except MaxDownloadsReached:
1978                 self.to_screen('[info] Maximum number of downloaded files reached.')
1979                 raise
1980             else:
1981                 if self.params.get('dump_single_json', False):
1982                     self.to_stdout(json.dumps(res))
1983
1984         return self._download_retcode
1985
1986     def download_with_info_file(self, info_filename):
1987         with contextlib.closing(fileinput.FileInput(
1988                 [info_filename], mode='r',
1989                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1990             # FileInput doesn't have a read method, we can't call json.load
1991             info = self.filter_requested_info(json.loads('\n'.join(f)))
1992         try:
1993             self.process_ie_result(info, download=True)
1994         except DownloadError:
1995             webpage_url = info.get('webpage_url')
1996             if webpage_url is not None:
1997                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1998                 return self.download([webpage_url])
1999             else:
2000                 raise
2001         return self._download_retcode
2002
2003     @staticmethod
2004     def filter_requested_info(info_dict):
2005         return dict(
2006             (k, v) for k, v in info_dict.items()
2007             if k not in ['requested_formats', 'requested_subtitles'])
2008
2009     def post_process(self, filename, ie_info):
2010         """Run all the postprocessors on the given file."""
2011         info = dict(ie_info)
2012         info['filepath'] = filename
2013         pps_chain = []
2014         if ie_info.get('__postprocessors') is not None:
2015             pps_chain.extend(ie_info['__postprocessors'])
2016         pps_chain.extend(self._pps)
2017         for pp in pps_chain:
2018             files_to_delete = []
2019             try:
2020                 files_to_delete, info = pp.run(info)
2021             except PostProcessingError as e:
2022                 self.report_error(e.msg)
2023             if files_to_delete and not self.params.get('keepvideo', False):
2024                 for old_filename in files_to_delete:
2025                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2026                     try:
2027                         os.remove(encodeFilename(old_filename))
2028                     except (IOError, OSError):
2029                         self.report_warning('Unable to remove downloaded original file')
2030
2031     def _make_archive_id(self, info_dict):
2032         # Future-proof against any change in case
2033         # and backwards compatibility with prior versions
2034         extractor = info_dict.get('extractor_key')
2035         if extractor is None:
2036             if 'id' in info_dict:
2037                 extractor = info_dict.get('ie_key')  # key in a playlist
2038         if extractor is None:
2039             return None  # Incomplete video information
2040         return extractor.lower() + ' ' + info_dict['id']
2041
2042     def in_download_archive(self, info_dict):
2043         fn = self.params.get('download_archive')
2044         if fn is None:
2045             return False
2046
2047         vid_id = self._make_archive_id(info_dict)
2048         if vid_id is None:
2049             return False  # Incomplete video information
2050
2051         try:
2052             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
2053                 for line in archive_file:
2054                     if line.strip() == vid_id:
2055                         return True
2056         except IOError as ioe:
2057             if ioe.errno != errno.ENOENT:
2058                 raise
2059         return False
2060
2061     def record_download_archive(self, info_dict):
2062         fn = self.params.get('download_archive')
2063         if fn is None:
2064             return
2065         vid_id = self._make_archive_id(info_dict)
2066         assert vid_id
2067         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2068             archive_file.write(vid_id + '\n')
2069
2070     @staticmethod
2071     def format_resolution(format, default='unknown'):
2072         if format.get('vcodec') == 'none':
2073             return 'audio only'
2074         if format.get('resolution') is not None:
2075             return format['resolution']
2076         if format.get('height') is not None:
2077             if format.get('width') is not None:
2078                 res = '%sx%s' % (format['width'], format['height'])
2079             else:
2080                 res = '%sp' % format['height']
2081         elif format.get('width') is not None:
2082             res = '%dx?' % format['width']
2083         else:
2084             res = default
2085         return res
2086
2087     def _format_note(self, fdict):
2088         res = ''
2089         if fdict.get('ext') in ['f4f', 'f4m']:
2090             res += '(unsupported) '
2091         if fdict.get('language'):
2092             if res:
2093                 res += ' '
2094             res += '[%s] ' % fdict['language']
2095         if fdict.get('format_note') is not None:
2096             res += fdict['format_note'] + ' '
2097         if fdict.get('tbr') is not None:
2098             res += '%4dk ' % fdict['tbr']
2099         if fdict.get('container') is not None:
2100             if res:
2101                 res += ', '
2102             res += '%s container' % fdict['container']
2103         if (fdict.get('vcodec') is not None and
2104                 fdict.get('vcodec') != 'none'):
2105             if res:
2106                 res += ', '
2107             res += fdict['vcodec']
2108             if fdict.get('vbr') is not None:
2109                 res += '@'
2110         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
2111             res += 'video@'
2112         if fdict.get('vbr') is not None:
2113             res += '%4dk' % fdict['vbr']
2114         if fdict.get('fps') is not None:
2115             if res:
2116                 res += ', '
2117             res += '%sfps' % fdict['fps']
2118         if fdict.get('acodec') is not None:
2119             if res:
2120                 res += ', '
2121             if fdict['acodec'] == 'none':
2122                 res += 'video only'
2123             else:
2124                 res += '%-5s' % fdict['acodec']
2125         elif fdict.get('abr') is not None:
2126             if res:
2127                 res += ', '
2128             res += 'audio'
2129         if fdict.get('abr') is not None:
2130             res += '@%3dk' % fdict['abr']
2131         if fdict.get('asr') is not None:
2132             res += ' (%5dHz)' % fdict['asr']
2133         if fdict.get('filesize') is not None:
2134             if res:
2135                 res += ', '
2136             res += format_bytes(fdict['filesize'])
2137         elif fdict.get('filesize_approx') is not None:
2138             if res:
2139                 res += ', '
2140             res += '~' + format_bytes(fdict['filesize_approx'])
2141         return res
2142
2143     def list_formats(self, info_dict):
2144         formats = info_dict.get('formats', [info_dict])
2145         table = [
2146             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2147             for f in formats
2148             if f.get('preference') is None or f['preference'] >= -1000]
2149         if len(formats) > 1:
2150             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2151
2152         header_line = ['format code', 'extension', 'resolution', 'note']
2153         self.to_screen(
2154             '[info] Available formats for %s:\n%s' %
2155             (info_dict['id'], render_table(header_line, table)))
2156
2157     def list_thumbnails(self, info_dict):
2158         thumbnails = info_dict.get('thumbnails')
2159         if not thumbnails:
2160             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2161             return
2162
2163         self.to_screen(
2164             '[info] Thumbnails for %s:' % info_dict['id'])
2165         self.to_screen(render_table(
2166             ['ID', 'width', 'height', 'URL'],
2167             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2168
2169     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2170         if not subtitles:
2171             self.to_screen('%s has no %s' % (video_id, name))
2172             return
2173         self.to_screen(
2174             'Available %s for %s:' % (name, video_id))
2175         self.to_screen(render_table(
2176             ['Language', 'formats'],
2177             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2178                 for lang, formats in subtitles.items()]))
2179
2180     def urlopen(self, req):
2181         """ Start an HTTP download """
2182         if isinstance(req, compat_basestring):
2183             req = sanitized_Request(req)
2184         return self._opener.open(req, timeout=self._socket_timeout)
2185
2186     def print_debug_header(self):
2187         if not self.params.get('verbose'):
2188             return
2189
2190         if type('') is not compat_str:
2191             # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
2192             self.report_warning(
2193                 'Your Python is broken! Update to a newer and supported version')
2194
2195         stdout_encoding = getattr(
2196             sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
2197         encoding_str = (
2198             '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
2199                 locale.getpreferredencoding(),
2200                 sys.getfilesystemencoding(),
2201                 stdout_encoding,
2202                 self.get_encoding()))
2203         write_string(encoding_str, encoding=None)
2204
2205         self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
2206         if _LAZY_LOADER:
2207             self._write_string('[debug] Lazy loading extractors enabled' + '\n')
2208         try:
2209             sp = subprocess.Popen(
2210                 ['git', 'rev-parse', '--short', 'HEAD'],
2211                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
2212                 cwd=os.path.dirname(os.path.abspath(__file__)))
2213             out, err = sp.communicate()
2214             out = out.decode().strip()
2215             if re.match('[0-9a-f]+', out):
2216                 self._write_string('[debug] Git HEAD: ' + out + '\n')
2217         except Exception:
2218             try:
2219                 sys.exc_clear()
2220             except Exception:
2221                 pass
2222         self._write_string('[debug] Python version %s - %s\n' % (
2223             platform.python_version(), platform_name()))
2224
2225         exe_versions = FFmpegPostProcessor.get_versions(self)
2226         exe_versions['rtmpdump'] = rtmpdump_version()
2227         exe_versions['phantomjs'] = PhantomJSwrapper._version()
2228         exe_str = ', '.join(
2229             '%s %s' % (exe, v)
2230             for exe, v in sorted(exe_versions.items())
2231             if v
2232         )
2233         if not exe_str:
2234             exe_str = 'none'
2235         self._write_string('[debug] exe versions: %s\n' % exe_str)
2236
2237         proxy_map = {}
2238         for handler in self._opener.handlers:
2239             if hasattr(handler, 'proxies'):
2240                 proxy_map.update(handler.proxies)
2241         self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
2242
2243         if self.params.get('call_home', False):
2244             ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
2245             self._write_string('[debug] Public IP address: %s\n' % ipaddr)
2246             latest_version = self.urlopen(
2247                 'https://yt-dl.org/latest/version').read().decode('utf-8')
2248             if version_tuple(latest_version) > version_tuple(__version__):
2249                 self.report_warning(
2250                     'You are using an outdated version (newest version: %s)! '
2251                     'See https://yt-dl.org/update if you need help updating.' %
2252                     latest_version)
2253
2254     def _setup_opener(self):
2255         timeout_val = self.params.get('socket_timeout')
2256         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2257
2258         opts_cookiefile = self.params.get('cookiefile')
2259         opts_proxy = self.params.get('proxy')
2260
2261         if opts_cookiefile is None:
2262             self.cookiejar = compat_cookiejar.CookieJar()
2263         else:
2264             opts_cookiefile = expand_path(opts_cookiefile)
2265             self.cookiejar = compat_cookiejar.MozillaCookieJar(
2266                 opts_cookiefile)
2267             if os.access(opts_cookiefile, os.R_OK):
2268                 self.cookiejar.load()
2269
2270         cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2271         if opts_proxy is not None:
2272             if opts_proxy == '':
2273                 proxies = {}
2274             else:
2275                 proxies = {'http': opts_proxy, 'https': opts_proxy}
2276         else:
2277             proxies = compat_urllib_request.getproxies()
2278             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
2279             if 'http' in proxies and 'https' not in proxies:
2280                 proxies['https'] = proxies['http']
2281         proxy_handler = PerRequestProxyHandler(proxies)
2282
2283         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2284         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2285         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2286         data_handler = compat_urllib_request_DataHandler()
2287
2288         # When passing our own FileHandler instance, build_opener won't add the
2289         # default FileHandler and allows us to disable the file protocol, which
2290         # can be used for malicious purposes (see
2291         # https://github.com/rg3/youtube-dl/issues/8227)
2292         file_handler = compat_urllib_request.FileHandler()
2293
2294         def file_open(*args, **kwargs):
2295             raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
2296         file_handler.file_open = file_open
2297
2298         opener = compat_urllib_request.build_opener(
2299             proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
2300
2301         # Delete the default user-agent header, which would otherwise apply in
2302         # cases where our custom HTTP handler doesn't come into play
2303         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
2304         opener.addheaders = []
2305         self._opener = opener
2306
2307     def encode(self, s):
2308         if isinstance(s, bytes):
2309             return s  # Already encoded
2310
2311         try:
2312             return s.encode(self.get_encoding())
2313         except UnicodeEncodeError as err:
2314             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2315             raise
2316
2317     def get_encoding(self):
2318         encoding = self.params.get('encoding')
2319         if encoding is None:
2320             encoding = preferredencoding()
2321         return encoding
2322
2323     def _write_thumbnails(self, info_dict, filename):
2324         if self.params.get('writethumbnail', False):
2325             thumbnails = info_dict.get('thumbnails')
2326             if thumbnails:
2327                 thumbnails = [thumbnails[-1]]
2328         elif self.params.get('write_all_thumbnails', False):
2329             thumbnails = info_dict.get('thumbnails')
2330         else:
2331             return
2332
2333         if not thumbnails:
2334             # No thumbnails present, so return immediately
2335             return
2336
2337         for t in thumbnails:
2338             thumb_ext = determine_ext(t['url'], 'jpg')
2339             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2340             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2341             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2342
2343             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2344                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2345                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2346             else:
2347                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2348                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2349                 try:
2350                     uf = self.urlopen(t['url'])
2351                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2352                         shutil.copyfileobj(uf, thumbf)
2353                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2354                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2355                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2356                     self.report_warning('Unable to download thumbnail "%s": %s' %
2357                                         (t['url'], error_to_compat_str(err)))