9036f0f94b8eb3259dcc3eb22b56e96b8371ee9c
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from string import ascii_letters
30
31 from .compat import (
32     compat_basestring,
33     compat_cookiejar,
34     compat_get_terminal_size,
35     compat_http_client,
36     compat_kwargs,
37     compat_numeric_types,
38     compat_os_name,
39     compat_str,
40     compat_tokenize_tokenize,
41     compat_urllib_error,
42     compat_urllib_request,
43     compat_urllib_request_DataHandler,
44 )
45 from .utils import (
46     age_restricted,
47     args_to_str,
48     ContentTooShortError,
49     date_from_str,
50     DateRange,
51     DEFAULT_OUTTMPL,
52     determine_ext,
53     determine_protocol,
54     DownloadError,
55     encode_compat_str,
56     encodeFilename,
57     error_to_compat_str,
58     expand_path,
59     ExtractorError,
60     format_bytes,
61     formatSeconds,
62     GeoRestrictedError,
63     int_or_none,
64     ISO3166Utils,
65     locked_file,
66     make_HTTPS_handler,
67     MaxDownloadsReached,
68     PagedList,
69     parse_filesize,
70     PerRequestProxyHandler,
71     platform_name,
72     PostProcessingError,
73     preferredencoding,
74     prepend_extension,
75     register_socks_protocols,
76     render_table,
77     replace_extension,
78     SameFileError,
79     sanitize_filename,
80     sanitize_path,
81     sanitize_url,
82     sanitized_Request,
83     std_headers,
84     subtitles_filename,
85     UnavailableVideoError,
86     url_basename,
87     version_tuple,
88     write_json_file,
89     write_string,
90     YoutubeDLCookieProcessor,
91     YoutubeDLHandler,
92 )
93 from .cache import Cache
94 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
95 from .extractor.openload import PhantomJSwrapper
96 from .downloader import get_suitable_downloader
97 from .downloader.rtmp import rtmpdump_version
98 from .postprocessor import (
99     FFmpegFixupM3u8PP,
100     FFmpegFixupM4aPP,
101     FFmpegFixupStretchedPP,
102     FFmpegMergerPP,
103     FFmpegPostProcessor,
104     get_postprocessor,
105 )
106 from .version import __version__
107
108 if compat_os_name == 'nt':
109     import ctypes
110
111
112 class YoutubeDL(object):
113     """YoutubeDL class.
114
115     YoutubeDL objects are the ones responsible of downloading the
116     actual video file and writing it to disk if the user has requested
117     it, among some other tasks. In most cases there should be one per
118     program. As, given a video URL, the downloader doesn't know how to
119     extract all the needed information, task that InfoExtractors do, it
120     has to pass the URL to one of them.
121
122     For this, YoutubeDL objects have a method that allows
123     InfoExtractors to be registered in a given order. When it is passed
124     a URL, the YoutubeDL object handles it to the first InfoExtractor it
125     finds that reports being able to handle it. The InfoExtractor extracts
126     all the information about the video or videos the URL refers to, and
127     YoutubeDL process the extracted information, possibly using a File
128     Downloader to download the video.
129
130     YoutubeDL objects accept a lot of parameters. In order not to saturate
131     the object constructor with arguments, it receives a dictionary of
132     options instead. These options are available through the params
133     attribute for the InfoExtractors to use. The YoutubeDL also
134     registers itself as the downloader in charge for the InfoExtractors
135     that are added to it, so this is a "mutual registration".
136
137     Available options:
138
139     username:          Username for authentication purposes.
140     password:          Password for authentication purposes.
141     videopassword:     Password for accessing a video.
142     ap_mso:            Adobe Pass multiple-system operator identifier.
143     ap_username:       Multiple-system operator account username.
144     ap_password:       Multiple-system operator account password.
145     usenetrc:          Use netrc for authentication instead.
146     verbose:           Print additional info to stdout.
147     quiet:             Do not print messages to stdout.
148     no_warnings:       Do not print out anything for warnings.
149     forceurl:          Force printing final URL.
150     forcetitle:        Force printing title.
151     forceid:           Force printing ID.
152     forcethumbnail:    Force printing thumbnail URL.
153     forcedescription:  Force printing description.
154     forcefilename:     Force printing final filename.
155     forceduration:     Force printing duration.
156     forcejson:         Force printing info_dict as JSON.
157     dump_single_json:  Force printing the info_dict of the whole playlist
158                        (or video) as a single JSON line.
159     simulate:          Do not download the video files.
160     format:            Video format code. See options.py for more information.
161     outtmpl:           Template for output names.
162     restrictfilenames: Do not allow "&" and spaces in file names
163     ignoreerrors:      Do not stop on download errors.
164     force_generic_extractor: Force downloader to use the generic extractor
165     nooverwrites:      Prevent overwriting files.
166     playliststart:     Playlist item to start at.
167     playlistend:       Playlist item to end at.
168     playlist_items:    Specific indices of playlist to download.
169     playlistreverse:   Download playlist items in reverse order.
170     playlistrandom:    Download playlist items in random order.
171     matchtitle:        Download only matching titles.
172     rejecttitle:       Reject downloads for matching titles.
173     logger:            Log messages to a logging.Logger instance.
174     logtostderr:       Log messages to stderr instead of stdout.
175     writedescription:  Write the video description to a .description file
176     writeinfojson:     Write the video description to a .info.json file
177     writeannotations:  Write the video annotations to a .annotations.xml file
178     writethumbnail:    Write the thumbnail image to a file
179     write_all_thumbnails:  Write all thumbnail formats to files
180     writesubtitles:    Write the video subtitles to a file
181     writeautomaticsub: Write the automatically generated subtitles to a file
182     allsubtitles:      Downloads all the subtitles of the video
183                        (requires writesubtitles or writeautomaticsub)
184     listsubtitles:     Lists all available subtitles for the video
185     subtitlesformat:   The format code for subtitles
186     subtitleslangs:    List of languages of the subtitles to download
187     keepvideo:         Keep the video file after post-processing
188     daterange:         A DateRange object, download only if the upload_date is in the range.
189     skip_download:     Skip the actual download of the video file
190     cachedir:          Location of the cache files in the filesystem.
191                        False to disable filesystem cache.
192     noplaylist:        Download single video instead of a playlist if in doubt.
193     age_limit:         An integer representing the user's age in years.
194                        Unsuitable videos for the given age are skipped.
195     min_views:         An integer representing the minimum view count the video
196                        must have in order to not be skipped.
197                        Videos without view count information are always
198                        downloaded. None for no limit.
199     max_views:         An integer representing the maximum view count.
200                        Videos that are more popular than that are not
201                        downloaded.
202                        Videos without view count information are always
203                        downloaded. None for no limit.
204     download_archive:  File name of a file where all downloads are recorded.
205                        Videos already present in the file are not downloaded
206                        again.
207     cookiefile:        File name where cookies should be read from and dumped to.
208     nocheckcertificate:Do not verify SSL certificates
209     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
210                        At the moment, this is only supported by YouTube.
211     proxy:             URL of the proxy server to use
212     geo_verification_proxy:  URL of the proxy to use for IP address verification
213                        on geo-restricted sites. (Experimental)
214     socket_timeout:    Time to wait for unresponsive hosts, in seconds
215     bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
217     debug_printtraffic:Print out sent and received HTTP traffic
218     include_ads:       Download ads as well
219     default_search:    Prepend this string if an input url is not valid.
220                        'auto' for elaborate guessing
221     encoding:          Use this encoding instead of the system-specified.
222     extract_flat:      Do not resolve URLs, return the immediate result.
223                        Pass in 'in_playlist' to only show this behavior for
224                        playlist items.
225     postprocessors:    A list of dictionaries, each with an entry
226                        * key:  The name of the postprocessor. See
227                                youtube_dl/postprocessor/__init__.py for a list.
228                        as well as any further keyword arguments for the
229                        postprocessor.
230     progress_hooks:    A list of functions that get called on download
231                        progress, with a dictionary with the entries
232                        * status: One of "downloading", "error", or "finished".
233                                  Check this first and ignore unknown values.
234
235                        If status is one of "downloading", or "finished", the
236                        following properties may also be present:
237                        * filename: The final filename (always present)
238                        * tmpfilename: The filename we're currently writing to
239                        * downloaded_bytes: Bytes on disk
240                        * total_bytes: Size of the whole file, None if unknown
241                        * total_bytes_estimate: Guess of the eventual file size,
242                                                None if unavailable.
243                        * elapsed: The number of seconds since download started.
244                        * eta: The estimated time in seconds, None if unknown
245                        * speed: The download speed in bytes/second, None if
246                                 unknown
247                        * fragment_index: The counter of the currently
248                                          downloaded video fragment.
249                        * fragment_count: The number of fragments (= individual
250                                          files that will be merged)
251
252                        Progress hooks are guaranteed to be called at least once
253                        (with status "finished") if the download is successful.
254     merge_output_format: Extension to use when merging formats.
255     fixup:             Automatically correct known faults of the file.
256                        One of:
257                        - "never": do nothing
258                        - "warn": only emit a warning
259                        - "detect_or_warn": check whether we can do anything
260                                            about it, warn otherwise (default)
261     source_address:    (Experimental) Client-side IP address to bind to.
262     call_home:         Boolean, true iff we are allowed to contact the
263                        youtube-dl servers for debugging.
264     sleep_interval:    Number of seconds to sleep before each download when
265                        used alone or a lower bound of a range for randomized
266                        sleep before each download (minimum possible number
267                        of seconds to sleep) when used along with
268                        max_sleep_interval.
269     max_sleep_interval:Upper bound of a range for randomized sleep before each
270                        download (maximum possible number of seconds to sleep).
271                        Must only be used along with sleep_interval.
272                        Actual sleep time will be a random float from range
273                        [sleep_interval; max_sleep_interval].
274     listformats:       Print an overview of available video formats and exit.
275     list_thumbnails:   Print a table of all thumbnails and exit.
276     match_filter:      A function that gets called with the info_dict of
277                        every video.
278                        If it returns a message, the video is ignored.
279                        If it returns None, the video is downloaded.
280                        match_filter_func in utils.py is one example for this.
281     no_color:          Do not emit color codes in output.
282     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
283                        HTTP header (experimental)
284     geo_bypass_country:
285                        Two-letter ISO 3166-2 country code that will be used for
286                        explicit geographic restriction bypassing via faking
287                        X-Forwarded-For HTTP header (experimental)
288
289     The following options determine which downloader is picked:
290     external_downloader: Executable of the external downloader to call.
291                        None or unset for standard (built-in) downloader.
292     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
293                        if True, otherwise use ffmpeg/avconv if False, otherwise
294                        use downloader suggested by extractor if None.
295
296     The following parameters are not used by YoutubeDL itself, they are used by
297     the downloader (see youtube_dl/downloader/common.py):
298     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
299     noresizebuffer, retries, continuedl, noprogress, consoletitle,
300     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
301
302     The following options are used by the post processors:
303     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
304                        otherwise prefer avconv.
305     postprocessor_args: A list of additional command-line arguments for the
306                         postprocessor.
307
308     The following options are used by the Youtube extractor:
309     youtube_include_dash_manifest: If True (default), DASH manifests and related
310                         data will be downloaded and processed by extractor.
311                         You can reduce network I/O by disabling it if you don't
312                         care about DASH.
313     """
314
    # Info-dict fields treated as numeric when building the output filename:
    # values of these keys skip string sanitization in prepare_filename so
    # numeric printf-style conversions (%(height)d etc.) keep working.
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level defaults; every one of these is re-assigned per instance
    # in __init__ (the mutable [] defaults are never mutated at class level).
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
332
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params:    dictionary of options (see the class docstring for the
                   available keys); None is treated as an empty dict.
        auto_init: when True, print the debug header and register all the
                   default info extractors immediately.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Screen output goes to stderr instead of stdout when logtostderr is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        # Warn about an option kept only for backward compatibility; returns
        # whether the deprecated option was actually supplied.
        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            try:
                import pty
                # Spawn an external bidi filter (bidiv, falling back to
                # fribidi) and route screen output through it via a pty;
                # consumed by _bidi_workaround.
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    # bidiv not found; try fribidi with the same arguments.
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate configured postprocessors: each dict names the PP class
        # under 'key'; all remaining entries are its keyword arguments.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
426
427     def warn_if_short_id(self, argv):
428         # short YouTube ID starting with dash?
429         idxs = [
430             i for i, a in enumerate(argv)
431             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
432         if idxs:
433             correct_argv = (
434                 ['youtube-dl'] +
435                 [a for i, a in enumerate(argv) if i not in idxs] +
436                 ['--'] + [argv[i] for i in idxs]
437             )
438             self.report_warning(
439                 'Long argument string detected. '
440                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
441                 args_to_str(correct_argv))
442
443     def add_info_extractor(self, ie):
444         """Add an InfoExtractor object to the end of the list."""
445         self._ies.append(ie)
446         if not isinstance(ie, type):
447             self._ies_instances[ie.ie_key()] = ie
448             ie.set_downloader(self)
449
450     def get_info_extractor(self, ie_key):
451         """
452         Get an instance of an IE with name ie_key, it will try to get one from
453         the _ies list, if there's no instance it will create a new one and add
454         it to the extractor list.
455         """
456         ie = self._ies_instances.get(ie_key)
457         if ie is None:
458             ie = get_info_extractor(ie_key)()
459             self.add_info_extractor(ie)
460         return ie
461
462     def add_default_info_extractors(self):
463         """
464         Add the InfoExtractors returned by gen_extractors to the end of the list
465         """
466         for ie in gen_extractor_classes():
467             self.add_info_extractor(ie)
468
469     def add_post_processor(self, pp):
470         """Add a PostProcessor object to the end of the chain."""
471         self._pps.append(pp)
472         pp.set_downloader(self)
473
474     def add_progress_hook(self, ph):
475         """Add the progress hook (currently only for the file downloader)"""
476         self._progress_hooks.append(ph)
477
478     def _bidi_workaround(self, message):
479         if not hasattr(self, '_output_channel'):
480             return message
481
482         assert hasattr(self, '_output_process')
483         assert isinstance(message, compat_str)
484         line_count = message.count('\n') + 1
485         self._output_process.stdin.write((message + '\n').encode('utf-8'))
486         self._output_process.stdin.flush()
487         res = ''.join(self._output_channel.readline().decode('utf-8')
488                       for _ in range(line_count))
489         return res[:-len('\n')]
490
491     def to_screen(self, message, skip_eol=False):
492         """Print message to stdout if not in quiet mode."""
493         return self.to_stdout(message, skip_eol, check_quiet=True)
494
495     def _write_string(self, s, out=None):
496         write_string(s, out=out, encoding=self.params.get('encoding'))
497
498     def to_stdout(self, message, skip_eol=False, check_quiet=False):
499         """Print message to stdout if not in quiet mode."""
500         if self.params.get('logger'):
501             self.params['logger'].debug(message)
502         elif not check_quiet or not self.params.get('quiet', False):
503             message = self._bidi_workaround(message)
504             terminator = ['\n', ''][skip_eol]
505             output = message + terminator
506
507             self._write_string(output, self._screen_file)
508
509     def to_stderr(self, message):
510         """Print message to stderr."""
511         assert isinstance(message, compat_str)
512         if self.params.get('logger'):
513             self.params['logger'].error(message)
514         else:
515             message = self._bidi_workaround(message)
516             output = message + '\n'
517             self._write_string(output, self._err_file)
518
519     def to_console_title(self, message):
520         if not self.params.get('consoletitle', False):
521             return
522         if compat_os_name == 'nt':
523             if ctypes.windll.kernel32.GetConsoleWindow():
524                 # c_wchar_p() might not be necessary if `message` is
525                 # already of type unicode()
526                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
527         elif 'TERM' in os.environ:
528             self._write_string('\033]0;%s\007' % message, self._screen_file)
529
530     def save_console_title(self):
531         if not self.params.get('consoletitle', False):
532             return
533         if compat_os_name != 'nt' and 'TERM' in os.environ:
534             # Save the title on stack
535             self._write_string('\033[22;0t', self._screen_file)
536
537     def restore_console_title(self):
538         if not self.params.get('consoletitle', False):
539             return
540         if compat_os_name != 'nt' and 'TERM' in os.environ:
541             # Restore the title from stack
542             self._write_string('\033[23;0t', self._screen_file)
543
544     def __enter__(self):
545         self.save_console_title()
546         return self
547
548     def __exit__(self, *args):
549         self.restore_console_title()
550
551         if self.params.get('cookiefile') is not None:
552             self.cookiejar.save()
553
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Some wrapped exceptions carry the exc_info of their
                    # original cause; include that traceback first.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an exception handler: show the current stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, preferring the wrapped exception's
            # own exc_info (when present) so the root cause is preserved.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1
583
584     def report_warning(self, message):
585         '''
586         Print the message to stderr, it will be prefixed with 'WARNING:'
587         If stderr is a tty file the 'WARNING:' will be colored
588         '''
589         if self.params.get('logger') is not None:
590             self.params['logger'].warning(message)
591         else:
592             if self.params.get('no_warnings'):
593                 return
594             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
595                 _msg_header = '\033[0;33mWARNING:\033[0m'
596             else:
597                 _msg_header = 'WARNING:'
598             warning_message = '%s %s' % (_msg_header, message)
599             self.to_stderr(warning_message)
600
601     def report_error(self, message, tb=None):
602         '''
603         Do the same as trouble, but prefixes the message with 'ERROR:', colored
604         in red if stderr is a tty file.
605         '''
606         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
607             _msg_header = '\033[0;31mERROR:\033[0m'
608         else:
609             _msg_header = 'ERROR:'
610         error_message = '%s %s' % (_msg_header, message)
611         self.trouble(error_message, tb)
612
613     def report_file_already_downloaded(self, file_name):
614         """Report file has already been fully downloaded."""
615         try:
616             self.to_screen('[download] %s has already been downloaded' % file_name)
617         except UnicodeEncodeError:
618             self.to_screen('[download] The file has already been downloaded')
619
620     def prepare_filename(self, info_dict):
621         """Generate the output filename."""
622         try:
623             template_dict = dict(info_dict)
624
625             template_dict['epoch'] = int(time.time())
626             autonumber_size = self.params.get('autonumber_size')
627             if autonumber_size is None:
628                 autonumber_size = 5
629             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
630             if template_dict.get('resolution') is None:
631                 if template_dict.get('width') and template_dict.get('height'):
632                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
633                 elif template_dict.get('height'):
634                     template_dict['resolution'] = '%sp' % template_dict['height']
635                 elif template_dict.get('width'):
636                     template_dict['resolution'] = '%dx?' % template_dict['width']
637
638             sanitize = lambda k, v: sanitize_filename(
639                 compat_str(v),
640                 restricted=self.params.get('restrictfilenames'),
641                 is_id=(k == 'id' or k.endswith('_id')))
642             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
643                                  for k, v in template_dict.items()
644                                  if v is not None and not isinstance(v, (list, tuple, dict)))
645             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
646
647             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
648
649             # For fields playlist_index and autonumber convert all occurrences
650             # of %(field)s to %(field)0Nd for backward compatibility
651             field_size_compat_map = {
652                 'playlist_index': len(str(template_dict['n_entries'])),
653                 'autonumber': autonumber_size,
654             }
655             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
656             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
657             if mobj:
658                 outtmpl = re.sub(
659                     FIELD_SIZE_COMPAT_RE,
660                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
661                     outtmpl)
662
663             # Missing numeric fields used together with integer presentation types
664             # in format specification will break the argument substitution since
665             # string 'NA' is returned for missing fields. We will patch output
666             # template for missing fields to meet string presentation type.
667             for numeric_field in self._NUMERIC_FIELDS:
668                 if numeric_field not in template_dict:
669                     # As of [1] format syntax is:
670                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
671                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
672                     FORMAT_RE = r'''(?x)
673                         (?<!%)
674                         %
675                         \({0}\)  # mapping key
676                         (?:[#0\-+ ]+)?  # conversion flags (optional)
677                         (?:\d+)?  # minimum field width (optional)
678                         (?:\.\d+)?  # precision (optional)
679                         [hlL]?  # length modifier (optional)
680                         [diouxXeEfFgGcrs%]  # conversion type
681                     '''
682                     outtmpl = re.sub(
683                         FORMAT_RE.format(numeric_field),
684                         r'%({0})s'.format(numeric_field), outtmpl)
685
686             # expand_path translates '%%' into '%' and '$$' into '$'
687             # correspondingly that is not what we want since we need to keep
688             # '%%' intact for template dict substitution step. Working around
689             # with boundary-alike separator hack.
690             sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
691             outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
692
693             # outtmpl should be expand_path'ed before template dict substitution
694             # because meta fields may contain env variables we don't want to
695             # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
696             # title "Hello $PATH", we don't want `$PATH` to be expanded.
697             filename = expand_path(outtmpl).replace(sep, '') % template_dict
698
699             # Temporary fix for #4787
700             # 'Treat' all problem characters by passing filename through preferredencoding
701             # to workaround encoding issues with subprocess on python2 @ Windows
702             if sys.version_info < (3, 0) and sys.platform == 'win32':
703                 filename = encodeFilename(filename, True).decode(preferredencoding())
704             return sanitize_path(filename)
705         except ValueError as err:
706             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
707             return None
708
709     def _match_entry(self, info_dict, incomplete):
710         """ Returns None iff the file should be downloaded """
711
712         video_title = info_dict.get('title', info_dict.get('id', 'video'))
713         if 'title' in info_dict:
714             # This can happen when we're just evaluating the playlist
715             title = info_dict['title']
716             matchtitle = self.params.get('matchtitle', False)
717             if matchtitle:
718                 if not re.search(matchtitle, title, re.IGNORECASE):
719                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
720             rejecttitle = self.params.get('rejecttitle', False)
721             if rejecttitle:
722                 if re.search(rejecttitle, title, re.IGNORECASE):
723                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
724         date = info_dict.get('upload_date')
725         if date is not None:
726             dateRange = self.params.get('daterange', DateRange())
727             if date not in dateRange:
728                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
729         view_count = info_dict.get('view_count')
730         if view_count is not None:
731             min_views = self.params.get('min_views')
732             if min_views is not None and view_count < min_views:
733                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
734             max_views = self.params.get('max_views')
735             if max_views is not None and view_count > max_views:
736                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
737         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
738             return 'Skipping "%s" because it is age restricted' % video_title
739         if self.in_download_archive(info_dict):
740             return '%s has already been recorded in archive' % video_title
741
742         if not incomplete:
743             match_filter = self.params.get('match_filter')
744             if match_filter is not None:
745                 ret = match_filter(info_dict)
746                 if ret is not None:
747                     return ret
748
749         return None
750
751     @staticmethod
752     def add_extra_info(info_dict, extra_info):
753         '''Set the keys from extra_info in info dict if they are missing'''
754         for key, value in extra_info.items():
755             info_dict.setdefault(key, value)
756
757     def extract_info(self, url, download=True, ie_key=None, extra_info={},
758                      process=True, force_generic_extractor=False):
759         '''
760         Returns a list with a dictionary for each video we find.
761         If 'download', also downloads the videos.
762         extra_info is a dict containing the extra values to add to each result
763         '''
764
765         if not ie_key and force_generic_extractor:
766             ie_key = 'Generic'
767
768         if ie_key:
769             ies = [self.get_info_extractor(ie_key)]
770         else:
771             ies = self._ies
772
773         for ie in ies:
774             if not ie.suitable(url):
775                 continue
776
777             ie = self.get_info_extractor(ie.ie_key())
778             if not ie.working():
779                 self.report_warning('The program functionality for this site has been marked as broken, '
780                                     'and will probably not work.')
781
782             try:
783                 ie_result = ie.extract(url)
784                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
785                     break
786                 if isinstance(ie_result, list):
787                     # Backwards compatibility: old IE result format
788                     ie_result = {
789                         '_type': 'compat_list',
790                         'entries': ie_result,
791                     }
792                 self.add_default_extra_info(ie_result, ie, url)
793                 if process:
794                     return self.process_ie_result(ie_result, download, extra_info)
795                 else:
796                     return ie_result
797             except GeoRestrictedError as e:
798                 msg = e.msg
799                 if e.countries:
800                     msg += '\nThis video is available in %s.' % ', '.join(
801                         map(ISO3166Utils.short2full, e.countries))
802                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
803                 self.report_error(msg)
804                 break
805             except ExtractorError as e:  # An error we somewhat expected
806                 self.report_error(compat_str(e), e.format_traceback())
807                 break
808             except MaxDownloadsReached:
809                 raise
810             except Exception as e:
811                 if self.params.get('ignoreerrors', False):
812                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
813                     break
814                 else:
815                     raise
816         else:
817             self.report_error('no suitable InfoExtractor for URL %s' % url)
818
819     def add_default_extra_info(self, ie_result, ie, url):
820         self.add_extra_info(ie_result, {
821             'extractor': ie.IE_NAME,
822             'webpage_url': url,
823             'webpage_url_basename': url_basename(url),
824             'extractor_key': ie.ie_key(),
825         })
826
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        # _type defaults to 'video' for plain single-video results.
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            extract_flat = self.params.get('extract_flat', False)
            # With --flat-playlist ('in_playlist' applies only while inside a
            # playlist) the URL reference is returned as-is, unresolved.
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # extract_info may return None when ignoreerrors is enabled and
            # extraction failed with an error, don't crash and return early
            # in this case
            if not info:
                return info

            # Non-None fields from the embedding page override those from the
            # embedded page, except for the identity fields deleted below.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Extracted info may not be a video result (i.e.
            # info.get('_type', 'video') != video) but rather an url or
            # url_transparent. In such cases outer metadata (from ie_result)
            # should be propagated to inner one (info). For this to happen
            # _type of info should be overridden with url_transparent. This
            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
            if new_result.get('_type') == 'url':
                new_result['_type'] = 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type in ('playlist', 'multi_video'):
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # playliststart is 1-based in the params; convert to a 0-based
            # slice start.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                # Parse specs like "1-3,7,10-13" into a stream of 1-based
                # indices. NOTE: this is a one-shot generator; each branch
                # below consumes it at most once.
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            ie_entries = ie_result['entries']

            # Select entries by 1-based playlist_items indices; indices out of
            # the [-num_entries, num_entries) range are silently dropped.
            def make_playlistitems_entries(list_ie_entries):
                num_entries = len(list_ie_entries)
                return [
                    list_ie_entries[i - 1] for i in playlistitems
                    if -num_entries <= i - 1 < num_entries]

            def report_download(num_entries):
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, num_entries))

            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = make_playlistitems_entries(ie_entries)
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                # PagedList fetches lazily; getslice only retrieves the pages
                # actually needed.
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                report_download(n_entries)
            else:  # iterable
                if playlistitems:
                    entries = make_playlistitems_entries(list(ie_entries))
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                report_download(n_entries)

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            if self.params.get('playlistrandom', False):
                random.shuffle(entries)

            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # This __x_forwarded_for_ip thing is a bit ugly but requires
                # minimal changes
                if x_forwarded_for:
                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    # 1-based index within the full playlist (i is 1-based
                    # within the selected slice).
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: entry metadata may still be partial here.
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            # Upgrade each legacy entry in place with the standard metadata
            # before processing it.
            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
1018
1019     def _build_format_filter(self, filter_spec):
1020         " Returns a function to filter the formats according to the filter_spec "
1021
1022         OPERATORS = {
1023             '<': operator.lt,
1024             '<=': operator.le,
1025             '>': operator.gt,
1026             '>=': operator.ge,
1027             '=': operator.eq,
1028             '!=': operator.ne,
1029         }
1030         operator_rex = re.compile(r'''(?x)\s*
1031             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
1032             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1033             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1034             $
1035             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1036         m = operator_rex.search(filter_spec)
1037         if m:
1038             try:
1039                 comparison_value = int(m.group('value'))
1040             except ValueError:
1041                 comparison_value = parse_filesize(m.group('value'))
1042                 if comparison_value is None:
1043                     comparison_value = parse_filesize(m.group('value') + 'B')
1044                 if comparison_value is None:
1045                     raise ValueError(
1046                         'Invalid value %r in format specification %r' % (
1047                             m.group('value'), filter_spec))
1048             op = OPERATORS[m.group('op')]
1049
1050         if not m:
1051             STR_OPERATORS = {
1052                 '=': operator.eq,
1053                 '!=': operator.ne,
1054                 '^=': lambda attr, value: attr.startswith(value),
1055                 '$=': lambda attr, value: attr.endswith(value),
1056                 '*=': lambda attr, value: value in attr,
1057             }
1058             str_operator_rex = re.compile(r'''(?x)
1059                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1060                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1061                 \s*(?P<value>[a-zA-Z0-9._-]+)
1062                 \s*$
1063                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1064             m = str_operator_rex.search(filter_spec)
1065             if m:
1066                 comparison_value = m.group('value')
1067                 op = STR_OPERATORS[m.group('op')]
1068
1069         if not m:
1070             raise ValueError('Invalid filter specification %r' % filter_spec)
1071
1072         def _filter(f):
1073             actual_value = f.get(m.group('key'))
1074             if actual_value is None:
1075                 return m.group('none_inclusive')
1076             return op(actual_value, comparison_value)
1077         return _filter
1078
1079     def _default_format_spec(self, info_dict, download=True):
1080         req_format_list = []
1081
1082         def can_have_partial_formats():
1083             if self.params.get('simulate', False):
1084                 return True
1085             if not download:
1086                 return True
1087             if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
1088                 return False
1089             if info_dict.get('is_live'):
1090                 return False
1091             merger = FFmpegMergerPP(self)
1092             return merger.available and merger.can_merge()
1093         if can_have_partial_formats():
1094             req_format_list.append('bestvideo+bestaudio')
1095         req_format_list.append('best')
1096         return '/'.join(req_format_list)
1097
1098     def build_format_selector(self, format_spec):
1099         def syntax_error(note, start):
1100             message = (
1101                 'Invalid format specification: '
1102                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1103             return SyntaxError(message)
1104
1105         PICKFIRST = 'PICKFIRST'
1106         MERGE = 'MERGE'
1107         SINGLE = 'SINGLE'
1108         GROUP = 'GROUP'
1109         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1110
1111         def _parse_filter(tokens):
1112             filter_parts = []
1113             for type, string, start, _, _ in tokens:
1114                 if type == tokenize.OP and string == ']':
1115                     return ''.join(filter_parts)
1116                 else:
1117                     filter_parts.append(string)
1118
1119         def _remove_unused_ops(tokens):
1120             # Remove operators that we don't use and join them with the surrounding strings
1121             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1122             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1123             last_string, last_start, last_end, last_line = None, None, None, None
1124             for type, string, start, end, line in tokens:
1125                 if type == tokenize.OP and string == '[':
1126                     if last_string:
1127                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1128                         last_string = None
1129                     yield type, string, start, end, line
1130                     # everything inside brackets will be handled by _parse_filter
1131                     for type, string, start, end, line in tokens:
1132                         yield type, string, start, end, line
1133                         if type == tokenize.OP and string == ']':
1134                             break
1135                 elif type == tokenize.OP and string in ALLOWED_OPS:
1136                     if last_string:
1137                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1138                         last_string = None
1139                     yield type, string, start, end, line
1140                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1141                     if not last_string:
1142                         last_string = string
1143                         last_start = start
1144                         last_end = end
1145                     else:
1146                         last_string += string
1147             if last_string:
1148                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1149
1150         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1151             selectors = []
1152             current_selector = None
1153             for type, string, start, _, _ in tokens:
1154                 # ENCODING is only defined in python 3.x
1155                 if type == getattr(tokenize, 'ENCODING', None):
1156                     continue
1157                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1158                     current_selector = FormatSelector(SINGLE, string, [])
1159                 elif type == tokenize.OP:
1160                     if string == ')':
1161                         if not inside_group:
1162                             # ')' will be handled by the parentheses group
1163                             tokens.restore_last_token()
1164                         break
1165                     elif inside_merge and string in ['/', ',']:
1166                         tokens.restore_last_token()
1167                         break
1168                     elif inside_choice and string == ',':
1169                         tokens.restore_last_token()
1170                         break
1171                     elif string == ',':
1172                         if not current_selector:
1173                             raise syntax_error('"," must follow a format selector', start)
1174                         selectors.append(current_selector)
1175                         current_selector = None
1176                     elif string == '/':
1177                         if not current_selector:
1178                             raise syntax_error('"/" must follow a format selector', start)
1179                         first_choice = current_selector
1180                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1181                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1182                     elif string == '[':
1183                         if not current_selector:
1184                             current_selector = FormatSelector(SINGLE, 'best', [])
1185                         format_filter = _parse_filter(tokens)
1186                         current_selector.filters.append(format_filter)
1187                     elif string == '(':
1188                         if current_selector:
1189                             raise syntax_error('Unexpected "("', start)
1190                         group = _parse_format_selection(tokens, inside_group=True)
1191                         current_selector = FormatSelector(GROUP, group, [])
1192                     elif string == '+':
1193                         video_selector = current_selector
1194                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1195                         if not video_selector or not audio_selector:
1196                             raise syntax_error('"+" must be between two format selectors', start)
1197                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1198                     else:
1199                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1200                 elif type == tokenize.ENDMARKER:
1201                     break
1202             if current_selector:
1203                 selectors.append(current_selector)
1204             return selectors
1205
1206         def _build_selector_function(selector):
1207             if isinstance(selector, list):
1208                 fs = [_build_selector_function(s) for s in selector]
1209
1210                 def selector_function(ctx):
1211                     for f in fs:
1212                         for format in f(ctx):
1213                             yield format
1214                 return selector_function
1215             elif selector.type == GROUP:
1216                 selector_function = _build_selector_function(selector.selector)
1217             elif selector.type == PICKFIRST:
1218                 fs = [_build_selector_function(s) for s in selector.selector]
1219
1220                 def selector_function(ctx):
1221                     for f in fs:
1222                         picked_formats = list(f(ctx))
1223                         if picked_formats:
1224                             return picked_formats
1225                     return []
1226             elif selector.type == SINGLE:
1227                 format_spec = selector.selector
1228
1229                 def selector_function(ctx):
1230                     formats = list(ctx['formats'])
1231                     if not formats:
1232                         return
1233                     if format_spec == 'all':
1234                         for f in formats:
1235                             yield f
1236                     elif format_spec in ['best', 'worst', None]:
1237                         format_idx = 0 if format_spec == 'worst' else -1
1238                         audiovideo_formats = [
1239                             f for f in formats
1240                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1241                         if audiovideo_formats:
1242                             yield audiovideo_formats[format_idx]
1243                         # for extractors with incomplete formats (audio only (soundcloud)
1244                         # or video only (imgur)) we will fallback to best/worst
1245                         # {video,audio}-only format
1246                         elif ctx['incomplete_formats']:
1247                             yield formats[format_idx]
1248                     elif format_spec == 'bestaudio':
1249                         audio_formats = [
1250                             f for f in formats
1251                             if f.get('vcodec') == 'none']
1252                         if audio_formats:
1253                             yield audio_formats[-1]
1254                     elif format_spec == 'worstaudio':
1255                         audio_formats = [
1256                             f for f in formats
1257                             if f.get('vcodec') == 'none']
1258                         if audio_formats:
1259                             yield audio_formats[0]
1260                     elif format_spec == 'bestvideo':
1261                         video_formats = [
1262                             f for f in formats
1263                             if f.get('acodec') == 'none']
1264                         if video_formats:
1265                             yield video_formats[-1]
1266                     elif format_spec == 'worstvideo':
1267                         video_formats = [
1268                             f for f in formats
1269                             if f.get('acodec') == 'none']
1270                         if video_formats:
1271                             yield video_formats[0]
1272                     else:
1273                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1274                         if format_spec in extensions:
1275                             filter_f = lambda f: f['ext'] == format_spec
1276                         else:
1277                             filter_f = lambda f: f['format_id'] == format_spec
1278                         matches = list(filter(filter_f, formats))
1279                         if matches:
1280                             yield matches[-1]
1281             elif selector.type == MERGE:
1282                 def _merge(formats_info):
1283                     format_1, format_2 = [f['format_id'] for f in formats_info]
1284                     # The first format must contain the video and the
1285                     # second the audio
1286                     if formats_info[0].get('vcodec') == 'none':
1287                         self.report_error('The first format must '
1288                                           'contain the video, try using '
1289                                           '"-f %s+%s"' % (format_2, format_1))
1290                         return
1291                     # Formats must be opposite (video+audio)
1292                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1293                         self.report_error(
1294                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1295                             % (format_1, format_2))
1296                         return
1297                     output_ext = (
1298                         formats_info[0]['ext']
1299                         if self.params.get('merge_output_format') is None
1300                         else self.params['merge_output_format'])
1301                     return {
1302                         'requested_formats': formats_info,
1303                         'format': '%s+%s' % (formats_info[0].get('format'),
1304                                              formats_info[1].get('format')),
1305                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1306                                                 formats_info[1].get('format_id')),
1307                         'width': formats_info[0].get('width'),
1308                         'height': formats_info[0].get('height'),
1309                         'resolution': formats_info[0].get('resolution'),
1310                         'fps': formats_info[0].get('fps'),
1311                         'vcodec': formats_info[0].get('vcodec'),
1312                         'vbr': formats_info[0].get('vbr'),
1313                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1314                         'acodec': formats_info[1].get('acodec'),
1315                         'abr': formats_info[1].get('abr'),
1316                         'ext': output_ext,
1317                     }
1318                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1319
1320                 def selector_function(ctx):
1321                     for pair in itertools.product(
1322                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1323                         yield _merge(pair)
1324
1325             filters = [self._build_format_filter(f) for f in selector.filters]
1326
1327             def final_selector(ctx):
1328                 ctx_copy = copy.deepcopy(ctx)
1329                 for _filter in filters:
1330                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1331                 return selector_function(ctx_copy)
1332             return final_selector
1333
1334         stream = io.BytesIO(format_spec.encode('utf-8'))
1335         try:
1336             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1337         except tokenize.TokenError:
1338             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1339
1340         class TokenIterator(object):
1341             def __init__(self, tokens):
1342                 self.tokens = tokens
1343                 self.counter = 0
1344
1345             def __iter__(self):
1346                 return self
1347
1348             def __next__(self):
1349                 if self.counter >= len(self.tokens):
1350                     raise StopIteration()
1351                 value = self.tokens[self.counter]
1352                 self.counter += 1
1353                 return value
1354
1355             next = __next__
1356
1357             def restore_last_token(self):
1358                 self.counter -= 1
1359
1360         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1361         return _build_selector_function(parsed_selector)
1362
1363     def _calc_headers(self, info_dict):
1364         res = std_headers.copy()
1365
1366         add_headers = info_dict.get('http_headers')
1367         if add_headers:
1368             res.update(add_headers)
1369
1370         cookies = self._calc_cookies(info_dict)
1371         if cookies:
1372             res['Cookie'] = cookies
1373
1374         if 'X-Forwarded-For' not in res:
1375             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1376             if x_forwarded_for_ip:
1377                 res['X-Forwarded-For'] = x_forwarded_for_ip
1378
1379         return res
1380
1381     def _calc_cookies(self, info_dict):
1382         pr = sanitized_Request(info_dict['url'])
1383         self.cookiejar.add_cookie_header(pr)
1384         return pr.get_header('Cookie')
1385
    def process_video_result(self, info_dict, download=True):
        """Sanitize a single extractor video result, pick the requested
        formats and (when download=True) hand each of them to process_info.

        Mutates info_dict in place and returns it, updated with the best
        selected format for backward compatibility.  Raises ExtractorError
        when mandatory fields ('id', 'title') are missing, when no formats
        were extracted, or when no format matches the requested spec.
        May return early (None) for the pure listing modes
        (list_thumbnails / listsubtitles / listformats).
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        def report_force_conversion(field, field_not, conversion):
            # Extractors are expected to emit correctly-typed fields; a
            # forced conversion indicates an extractor bug worth surfacing.
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a non-string field (e.g. a numeric id) to compat_str,
            # warning about the extractor bug.
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce every known numeric field (self._NUMERIC_FIELDS) to an
            # int; int_or_none leaves None for values that do not convert.
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Normalize thumbnails: promote a lone 'thumbnail' URL into the
        # 'thumbnails' list, sort worst-to-best, and fill in derived fields.
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Missing preference/width/height sort as -1 so unknown entries
            # come first; the last entry is treated as the best one below.
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '', t.get('url')))
            for i, t in enumerate(thumbnails):
                t['url'] = sanitize_url(t['url'])
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    # Fall back to the post-sort index as the thumbnail id
                    t['id'] = '%d' % i

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # thumbnails are sorted ascending, so the last one is the best
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
            # see http://bugs.python.org/issue1646728)
            try:
                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
            except (ValueError, OverflowError, OSError):
                pass

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle URLs and derive missing extensions from the URL
        subtitles = info_dict.get('subtitles')
        if subtitles:
            for _, subtitle in subtitles.items():
                for subtitle_format in subtitle:
                    if subtitle_format.get('url'):
                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                    if subtitle_format.get('ext') is None:
                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles,
            info_dict.get('automatic_captions'))

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        def is_wellformed(f):
            # A format without a 'url' cannot be downloaded; drop it with a
            # warning instead of failing the whole extraction.
            url = f.get('url')
            if not url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
                return False
            if isinstance(url, bytes):
                sanitize_string_field(f, 'url')
            return True

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats))

        # Maps format_id -> list of formats claiming that id, used below to
        # disambiguate duplicates.
        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if not format.get('format_id'):
                # Fall back to the positional index as the format id
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                # Human-readable description shown by --list-formats etc.
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            req_format = self._default_format_spec(info_dict, download=download)
            if self.params.get('verbose'):
                self.to_stdout('[debug] Default format spec: %s' % req_format)

        format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/rg3/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/rg3/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
            # all formats are audio-only
            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1617
1618     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1619         """Select the requested subtitles and their format"""
1620         available_subs = {}
1621         if normal_subtitles and self.params.get('writesubtitles'):
1622             available_subs.update(normal_subtitles)
1623         if automatic_captions and self.params.get('writeautomaticsub'):
1624             for lang, cap_info in automatic_captions.items():
1625                 if lang not in available_subs:
1626                     available_subs[lang] = cap_info
1627
1628         if (not self.params.get('writesubtitles') and not
1629                 self.params.get('writeautomaticsub') or not
1630                 available_subs):
1631             return None
1632
1633         if self.params.get('allsubtitles', False):
1634             requested_langs = available_subs.keys()
1635         else:
1636             if self.params.get('subtitleslangs', False):
1637                 requested_langs = self.params.get('subtitleslangs')
1638             elif 'en' in available_subs:
1639                 requested_langs = ['en']
1640             else:
1641                 requested_langs = [list(available_subs.keys())[0]]
1642
1643         formats_query = self.params.get('subtitlesformat', 'best')
1644         formats_preference = formats_query.split('/') if formats_query else []
1645         subs = {}
1646         for lang in requested_langs:
1647             formats = available_subs.get(lang)
1648             if formats is None:
1649                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1650                 continue
1651             for ext in formats_preference:
1652                 if ext == 'best':
1653                     f = formats[-1]
1654                     break
1655                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1656                 if matches:
1657                     f = matches[-1]
1658                     break
1659             else:
1660                 f = formats[-1]
1661                 self.report_warning(
1662                     'No subtitle format found matching "%s" for language %s, '
1663                     'using %s' % (formats_query, lang, f['ext']))
1664             subs[lang] = f
1665         return subs
1666
1667     def process_info(self, info_dict):
1668         """Process a single resolved IE result."""
1669
1670         assert info_dict.get('_type', 'video') == 'video'
1671
1672         max_downloads = self.params.get('max_downloads')
1673         if max_downloads is not None:
1674             if self._num_downloads >= int(max_downloads):
1675                 raise MaxDownloadsReached()
1676
1677         info_dict['fulltitle'] = info_dict['title']
1678         if len(info_dict['title']) > 200:
1679             info_dict['title'] = info_dict['title'][:197] + '...'
1680
1681         if 'format' not in info_dict:
1682             info_dict['format'] = info_dict['ext']
1683
1684         reason = self._match_entry(info_dict, incomplete=False)
1685         if reason is not None:
1686             self.to_screen('[download] ' + reason)
1687             return
1688
1689         self._num_downloads += 1
1690
1691         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1692
1693         # Forced printings
1694         if self.params.get('forcetitle', False):
1695             self.to_stdout(info_dict['fulltitle'])
1696         if self.params.get('forceid', False):
1697             self.to_stdout(info_dict['id'])
1698         if self.params.get('forceurl', False):
1699             if info_dict.get('requested_formats') is not None:
1700                 for f in info_dict['requested_formats']:
1701                     self.to_stdout(f['url'] + f.get('play_path', ''))
1702             else:
1703                 # For RTMP URLs, also include the playpath
1704                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1705         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1706             self.to_stdout(info_dict['thumbnail'])
1707         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1708             self.to_stdout(info_dict['description'])
1709         if self.params.get('forcefilename', False) and filename is not None:
1710             self.to_stdout(filename)
1711         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1712             self.to_stdout(formatSeconds(info_dict['duration']))
1713         if self.params.get('forceformat', False):
1714             self.to_stdout(info_dict['format'])
1715         if self.params.get('forcejson', False):
1716             self.to_stdout(json.dumps(info_dict))
1717
1718         # Do nothing else if in simulate mode
1719         if self.params.get('simulate', False):
1720             return
1721
1722         if filename is None:
1723             return
1724
1725         def ensure_dir_exists(path):
1726             try:
1727                 dn = os.path.dirname(path)
1728                 if dn and not os.path.exists(dn):
1729                     os.makedirs(dn)
1730                 return True
1731             except (OSError, IOError) as err:
1732                 self.report_error('unable to create directory ' + error_to_compat_str(err))
1733                 return False
1734
1735         if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
1736             return
1737
1738         if self.params.get('writedescription', False):
1739             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1740             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1741                 self.to_screen('[info] Video description is already present')
1742             elif info_dict.get('description') is None:
1743                 self.report_warning('There\'s no description to write.')
1744             else:
1745                 try:
1746                     self.to_screen('[info] Writing video description to: ' + descfn)
1747                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1748                         descfile.write(info_dict['description'])
1749                 except (OSError, IOError):
1750                     self.report_error('Cannot write description file ' + descfn)
1751                     return
1752
1753         if self.params.get('writeannotations', False):
1754             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1755             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1756                 self.to_screen('[info] Video annotations are already present')
1757             else:
1758                 try:
1759                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1760                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1761                         annofile.write(info_dict['annotations'])
1762                 except (KeyError, TypeError):
1763                     self.report_warning('There are no annotations to write.')
1764                 except (OSError, IOError):
1765                     self.report_error('Cannot write annotations file: ' + annofn)
1766                     return
1767
1768         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1769                                        self.params.get('writeautomaticsub')])
1770
1771         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1772             # subtitles download errors are already managed as troubles in relevant IE
1773             # that way it will silently go on when used with unsupporting IE
1774             subtitles = info_dict['requested_subtitles']
1775             ie = self.get_info_extractor(info_dict['extractor_key'])
1776             for sub_lang, sub_info in subtitles.items():
1777                 sub_format = sub_info['ext']
1778                 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1779                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1780                     self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
1781                 else:
1782                     self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1783                     if sub_info.get('data') is not None:
1784                         try:
1785                             # Use newline='' to prevent conversion of newline characters
1786                             # See https://github.com/rg3/youtube-dl/issues/10268
1787                             with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1788                                 subfile.write(sub_info['data'])
1789                         except (OSError, IOError):
1790                             self.report_error('Cannot write subtitles file ' + sub_filename)
1791                             return
1792                     else:
1793                         try:
1794                             sub_data = ie._request_webpage(
1795                                 sub_info['url'], info_dict['id'], note=False).read()
1796                             with io.open(encodeFilename(sub_filename), 'wb') as subfile:
1797                                 subfile.write(sub_data)
1798                         except (ExtractorError, IOError, OSError, ValueError) as err:
1799                             self.report_warning('Unable to download subtitle for "%s": %s' %
1800                                                 (sub_lang, error_to_compat_str(err)))
1801                             continue
1802
1803         if self.params.get('writeinfojson', False):
1804             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1805             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1806                 self.to_screen('[info] Video description metadata is already present')
1807             else:
1808                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1809                 try:
1810                     write_json_file(self.filter_requested_info(info_dict), infofn)
1811                 except (OSError, IOError):
1812                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1813                     return
1814
1815         self._write_thumbnails(info_dict, filename)
1816
1817         if not self.params.get('skip_download', False):
1818             try:
1819                 def dl(name, info):
1820                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1821                     for ph in self._progress_hooks:
1822                         fd.add_progress_hook(ph)
1823                     if self.params.get('verbose'):
1824                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1825                     return fd.download(name, info)
1826
1827                 if info_dict.get('requested_formats') is not None:
1828                     downloaded = []
1829                     success = True
1830                     merger = FFmpegMergerPP(self)
1831                     if not merger.available:
1832                         postprocessors = []
1833                         self.report_warning('You have requested multiple '
1834                                             'formats but ffmpeg or avconv are not installed.'
1835                                             ' The formats won\'t be merged.')
1836                     else:
1837                         postprocessors = [merger]
1838
1839                     def compatible_formats(formats):
1840                         video, audio = formats
1841                         # Check extension
1842                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1843                         if video_ext and audio_ext:
1844                             COMPATIBLE_EXTS = (
1845                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1846                                 ('webm')
1847                             )
1848                             for exts in COMPATIBLE_EXTS:
1849                                 if video_ext in exts and audio_ext in exts:
1850                                     return True
1851                         # TODO: Check acodec/vcodec
1852                         return False
1853
1854                     filename_real_ext = os.path.splitext(filename)[1][1:]
1855                     filename_wo_ext = (
1856                         os.path.splitext(filename)[0]
1857                         if filename_real_ext == info_dict['ext']
1858                         else filename)
1859                     requested_formats = info_dict['requested_formats']
1860                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1861                         info_dict['ext'] = 'mkv'
1862                         self.report_warning(
1863                             'Requested formats are incompatible for merge and will be merged into mkv.')
1864                     # Ensure filename always has a correct extension for successful merge
1865                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1866                     if os.path.exists(encodeFilename(filename)):
1867                         self.to_screen(
1868                             '[download] %s has already been downloaded and '
1869                             'merged' % filename)
1870                     else:
1871                         for f in requested_formats:
1872                             new_info = dict(info_dict)
1873                             new_info.update(f)
1874                             fname = prepend_extension(
1875                                 self.prepare_filename(new_info),
1876                                 'f%s' % f['format_id'], new_info['ext'])
1877                             if not ensure_dir_exists(fname):
1878                                 return
1879                             downloaded.append(fname)
1880                             partial_success = dl(fname, new_info)
1881                             success = success and partial_success
1882                         info_dict['__postprocessors'] = postprocessors
1883                         info_dict['__files_to_merge'] = downloaded
1884                 else:
1885                     # Just a single file
1886                     success = dl(filename, info_dict)
1887             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1888                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1889                 return
1890             except (OSError, IOError) as err:
1891                 raise UnavailableVideoError(err)
1892             except (ContentTooShortError, ) as err:
1893                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1894                 return
1895
1896             if success and filename != '-':
1897                 # Fixup content
1898                 fixup_policy = self.params.get('fixup')
1899                 if fixup_policy is None:
1900                     fixup_policy = 'detect_or_warn'
1901
1902                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1903
1904                 stretched_ratio = info_dict.get('stretched_ratio')
1905                 if stretched_ratio is not None and stretched_ratio != 1:
1906                     if fixup_policy == 'warn':
1907                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1908                             info_dict['id'], stretched_ratio))
1909                     elif fixup_policy == 'detect_or_warn':
1910                         stretched_pp = FFmpegFixupStretchedPP(self)
1911                         if stretched_pp.available:
1912                             info_dict.setdefault('__postprocessors', [])
1913                             info_dict['__postprocessors'].append(stretched_pp)
1914                         else:
1915                             self.report_warning(
1916                                 '%s: Non-uniform pixel ratio (%s). %s'
1917                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1918                     else:
1919                         assert fixup_policy in ('ignore', 'never')
1920
1921                 if (info_dict.get('requested_formats') is None and
1922                         info_dict.get('container') == 'm4a_dash'):
1923                     if fixup_policy == 'warn':
1924                         self.report_warning(
1925                             '%s: writing DASH m4a. '
1926                             'Only some players support this container.'
1927                             % info_dict['id'])
1928                     elif fixup_policy == 'detect_or_warn':
1929                         fixup_pp = FFmpegFixupM4aPP(self)
1930                         if fixup_pp.available:
1931                             info_dict.setdefault('__postprocessors', [])
1932                             info_dict['__postprocessors'].append(fixup_pp)
1933                         else:
1934                             self.report_warning(
1935                                 '%s: writing DASH m4a. '
1936                                 'Only some players support this container. %s'
1937                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1938                     else:
1939                         assert fixup_policy in ('ignore', 'never')
1940
1941                 if (info_dict.get('protocol') == 'm3u8_native' or
1942                         info_dict.get('protocol') == 'm3u8' and
1943                         self.params.get('hls_prefer_native')):
1944                     if fixup_policy == 'warn':
1945                         self.report_warning('%s: malformed AAC bitstream detected.' % (
1946                             info_dict['id']))
1947                     elif fixup_policy == 'detect_or_warn':
1948                         fixup_pp = FFmpegFixupM3u8PP(self)
1949                         if fixup_pp.available:
1950                             info_dict.setdefault('__postprocessors', [])
1951                             info_dict['__postprocessors'].append(fixup_pp)
1952                         else:
1953                             self.report_warning(
1954                                 '%s: malformed AAC bitstream detected. %s'
1955                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1956                     else:
1957                         assert fixup_policy in ('ignore', 'never')
1958
1959                 try:
1960                     self.post_process(filename, info_dict)
1961                 except (PostProcessingError) as err:
1962                     self.report_error('postprocessing: %s' % str(err))
1963                     return
1964                 self.record_download_archive(info_dict)
1965
1966     def download(self, url_list):
1967         """Download a given list of URLs."""
1968         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1969         if (len(url_list) > 1 and
1970                 outtmpl != '-' and
1971                 '%' not in outtmpl and
1972                 self.params.get('max_downloads') != 1):
1973             raise SameFileError(outtmpl)
1974
1975         for url in url_list:
1976             try:
1977                 # It also downloads the videos
1978                 res = self.extract_info(
1979                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1980             except UnavailableVideoError:
1981                 self.report_error('unable to download video')
1982             except MaxDownloadsReached:
1983                 self.to_screen('[info] Maximum number of downloaded files reached.')
1984                 raise
1985             else:
1986                 if self.params.get('dump_single_json', False):
1987                     self.to_stdout(json.dumps(res))
1988
1989         return self._download_retcode
1990
1991     def download_with_info_file(self, info_filename):
1992         with contextlib.closing(fileinput.FileInput(
1993                 [info_filename], mode='r',
1994                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1995             # FileInput doesn't have a read method, we can't call json.load
1996             info = self.filter_requested_info(json.loads('\n'.join(f)))
1997         try:
1998             self.process_ie_result(info, download=True)
1999         except DownloadError:
2000             webpage_url = info.get('webpage_url')
2001             if webpage_url is not None:
2002                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2003                 return self.download([webpage_url])
2004             else:
2005                 raise
2006         return self._download_retcode
2007
2008     @staticmethod
2009     def filter_requested_info(info_dict):
2010         return dict(
2011             (k, v) for k, v in info_dict.items()
2012             if k not in ['requested_formats', 'requested_subtitles'])
2013
2014     def post_process(self, filename, ie_info):
2015         """Run all the postprocessors on the given file."""
2016         info = dict(ie_info)
2017         info['filepath'] = filename
2018         pps_chain = []
2019         if ie_info.get('__postprocessors') is not None:
2020             pps_chain.extend(ie_info['__postprocessors'])
2021         pps_chain.extend(self._pps)
2022         for pp in pps_chain:
2023             files_to_delete = []
2024             try:
2025                 files_to_delete, info = pp.run(info)
2026             except PostProcessingError as e:
2027                 self.report_error(e.msg)
2028             if files_to_delete and not self.params.get('keepvideo', False):
2029                 for old_filename in files_to_delete:
2030                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2031                     try:
2032                         os.remove(encodeFilename(old_filename))
2033                     except (IOError, OSError):
2034                         self.report_warning('Unable to remove downloaded original file')
2035
2036     def _make_archive_id(self, info_dict):
2037         # Future-proof against any change in case
2038         # and backwards compatibility with prior versions
2039         extractor = info_dict.get('extractor_key')
2040         if extractor is None:
2041             if 'id' in info_dict:
2042                 extractor = info_dict.get('ie_key')  # key in a playlist
2043         if extractor is None:
2044             return None  # Incomplete video information
2045         return extractor.lower() + ' ' + info_dict['id']
2046
2047     def in_download_archive(self, info_dict):
2048         fn = self.params.get('download_archive')
2049         if fn is None:
2050             return False
2051
2052         vid_id = self._make_archive_id(info_dict)
2053         if vid_id is None:
2054             return False  # Incomplete video information
2055
2056         try:
2057             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
2058                 for line in archive_file:
2059                     if line.strip() == vid_id:
2060                         return True
2061         except IOError as ioe:
2062             if ioe.errno != errno.ENOENT:
2063                 raise
2064         return False
2065
2066     def record_download_archive(self, info_dict):
2067         fn = self.params.get('download_archive')
2068         if fn is None:
2069             return
2070         vid_id = self._make_archive_id(info_dict)
2071         assert vid_id
2072         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2073             archive_file.write(vid_id + '\n')
2074
2075     @staticmethod
2076     def format_resolution(format, default='unknown'):
2077         if format.get('vcodec') == 'none':
2078             return 'audio only'
2079         if format.get('resolution') is not None:
2080             return format['resolution']
2081         if format.get('height') is not None:
2082             if format.get('width') is not None:
2083                 res = '%sx%s' % (format['width'], format['height'])
2084             else:
2085                 res = '%sp' % format['height']
2086         elif format.get('width') is not None:
2087             res = '%dx?' % format['width']
2088         else:
2089             res = default
2090         return res
2091
    def _format_note(self, fdict):
        """Build the free-form 'note' column shown by --list-formats.

        Pieces are appended in a fixed order: unsupported-extension marker,
        language, extractor note, total bitrate, container, video
        codec/bitrate, fps, audio codec/bitrate, sample rate and
        (approximate) filesize.  A ', ' separator is inserted once some
        text is already present.
        """
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None and
                fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            if fdict.get('vbr') is not None:
                # '@' glues the codec name to the bitrate appended below
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # video bitrate known but the codec is not
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            # audio present (bitrate known) but codec name is not
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            # '~' marks the size as an estimate
            if res:
                res += ', '
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
2147
2148     def list_formats(self, info_dict):
2149         formats = info_dict.get('formats', [info_dict])
2150         table = [
2151             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2152             for f in formats
2153             if f.get('preference') is None or f['preference'] >= -1000]
2154         if len(formats) > 1:
2155             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2156
2157         header_line = ['format code', 'extension', 'resolution', 'note']
2158         self.to_screen(
2159             '[info] Available formats for %s:\n%s' %
2160             (info_dict['id'], render_table(header_line, table)))
2161
2162     def list_thumbnails(self, info_dict):
2163         thumbnails = info_dict.get('thumbnails')
2164         if not thumbnails:
2165             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2166             return
2167
2168         self.to_screen(
2169             '[info] Thumbnails for %s:' % info_dict['id'])
2170         self.to_screen(render_table(
2171             ['ID', 'width', 'height', 'URL'],
2172             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2173
2174     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2175         if not subtitles:
2176             self.to_screen('%s has no %s' % (video_id, name))
2177             return
2178         self.to_screen(
2179             'Available %s for %s:' % (name, video_id))
2180         self.to_screen(render_table(
2181             ['Language', 'formats'],
2182             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2183                 for lang, formats in subtitles.items()]))
2184
2185     def urlopen(self, req):
2186         """ Start an HTTP download """
2187         if isinstance(req, compat_basestring):
2188             req = sanitized_Request(req)
2189         return self._opener.open(req, timeout=self._socket_timeout)
2190
    def print_debug_header(self):
        """Write version/encoding/environment diagnostics when --verbose is set."""
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        try:
            # Best-effort: report the git revision when running from a checkout
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            try:
                # Python 2 only; raises (and is ignored) on Python 3
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_versions['phantomjs'] = PhantomJSwrapper._version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Collect the proxy settings of every handler that carries any
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # Opt-in: fetch our public IP and check for a newer release
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2258
    def _setup_opener(self):
        """Build the urllib opener (cookies, proxies, TLS, data: URLs) used for all requests."""
        timeout_val = self.params.get('socket_timeout')
        # Default socket timeout is 10 minutes
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # No persistence requested: keep cookies in memory only
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            opts_cookiefile = expand_path(opts_cookiefile)
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Only load when the file already exists and is readable
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            if opts_proxy == '':
                # An explicit empty --proxy disables all proxying
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            # Fall back to the environment's proxy settings
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2311
2312     def encode(self, s):
2313         if isinstance(s, bytes):
2314             return s  # Already encoded
2315
2316         try:
2317             return s.encode(self.get_encoding())
2318         except UnicodeEncodeError as err:
2319             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2320             raise
2321
2322     def get_encoding(self):
2323         encoding = self.params.get('encoding')
2324         if encoding is None:
2325             encoding = preferredencoding()
2326         return encoding
2327
    def _write_thumbnails(self, info_dict, filename):
        """Download thumbnail image(s) next to the video file.

        With 'writethumbnail' only the last thumbnail in the list is
        fetched; with 'write_all_thumbnails' every known thumbnail is.
        """
        if self.params.get('writethumbnail', False):
            thumbnails = info_dict.get('thumbnails')
            if thumbnails:
                # thumbnails appear to be sorted worst-first here, so the
                # last entry is taken as the preferred one - TODO confirm
                thumbnails = [thumbnails[-1]]
        elif self.params.get('write_all_thumbnails', False):
            thumbnails = info_dict.get('thumbnails')
        else:
            return

        if not thumbnails:
            # No thumbnails present, so return immediately
            return

        for t in thumbnails:
            thumb_ext = determine_ext(t['url'], 'jpg')
            # Only disambiguate filenames and messages when several are written
            suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
            thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
            t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext

            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                self.to_screen('[%s] %s: Thumbnail %sis already present' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
            else:
                self.to_screen('[%s] %s: Downloading thumbnail %s...' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
                try:
                    uf = self.urlopen(t['url'])
                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                    self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
                                   (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    # Best effort: a failed thumbnail never aborts the download
                    self.report_warning('Unable to download thumbnail "%s": %s' %
                                        (t['url'], error_to_compat_str(err)))