[YoutubeDL] Ignore duplicates in --playlist-items
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from string import ascii_letters
30
31 from .compat import (
32     compat_basestring,
33     compat_cookiejar,
34     compat_get_terminal_size,
35     compat_http_client,
36     compat_kwargs,
37     compat_numeric_types,
38     compat_os_name,
39     compat_str,
40     compat_tokenize_tokenize,
41     compat_urllib_error,
42     compat_urllib_request,
43     compat_urllib_request_DataHandler,
44 )
45 from .utils import (
46     age_restricted,
47     args_to_str,
48     ContentTooShortError,
49     date_from_str,
50     DateRange,
51     DEFAULT_OUTTMPL,
52     determine_ext,
53     determine_protocol,
54     DownloadError,
55     encode_compat_str,
56     encodeFilename,
57     error_to_compat_str,
58     expand_path,
59     ExtractorError,
60     format_bytes,
61     formatSeconds,
62     GeoRestrictedError,
63     int_or_none,
64     ISO3166Utils,
65     locked_file,
66     make_HTTPS_handler,
67     MaxDownloadsReached,
68     orderedSet,
69     PagedList,
70     parse_filesize,
71     PerRequestProxyHandler,
72     platform_name,
73     PostProcessingError,
74     preferredencoding,
75     prepend_extension,
76     register_socks_protocols,
77     render_table,
78     replace_extension,
79     SameFileError,
80     sanitize_filename,
81     sanitize_path,
82     sanitize_url,
83     sanitized_Request,
84     std_headers,
85     subtitles_filename,
86     UnavailableVideoError,
87     url_basename,
88     version_tuple,
89     write_json_file,
90     write_string,
91     YoutubeDLCookieProcessor,
92     YoutubeDLHandler,
93 )
94 from .cache import Cache
95 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
96 from .extractor.openload import PhantomJSwrapper
97 from .downloader import get_suitable_downloader
98 from .downloader.rtmp import rtmpdump_version
99 from .postprocessor import (
100     FFmpegFixupM3u8PP,
101     FFmpegFixupM4aPP,
102     FFmpegFixupStretchedPP,
103     FFmpegMergerPP,
104     FFmpegPostProcessor,
105     get_postprocessor,
106 )
107 from .version import __version__
108
109 if compat_os_name == 'nt':
110     import ctypes
111
112
113 class YoutubeDL(object):
114     """YoutubeDL class.
115
116     YoutubeDL objects are the ones responsible of downloading the
117     actual video file and writing it to disk if the user has requested
118     it, among some other tasks. In most cases there should be one per
119     program. As, given a video URL, the downloader doesn't know how to
120     extract all the needed information, task that InfoExtractors do, it
121     has to pass the URL to one of them.
122
123     For this, YoutubeDL objects have a method that allows
124     InfoExtractors to be registered in a given order. When it is passed
125     a URL, the YoutubeDL object handles it to the first InfoExtractor it
126     finds that reports being able to handle it. The InfoExtractor extracts
127     all the information about the video or videos the URL refers to, and
128     YoutubeDL process the extracted information, possibly using a File
129     Downloader to download the video.
130
131     YoutubeDL objects accept a lot of parameters. In order not to saturate
132     the object constructor with arguments, it receives a dictionary of
133     options instead. These options are available through the params
134     attribute for the InfoExtractors to use. The YoutubeDL also
135     registers itself as the downloader in charge for the InfoExtractors
136     that are added to it, so this is a "mutual registration".
137
138     Available options:
139
140     username:          Username for authentication purposes.
141     password:          Password for authentication purposes.
142     videopassword:     Password for accessing a video.
143     ap_mso:            Adobe Pass multiple-system operator identifier.
144     ap_username:       Multiple-system operator account username.
145     ap_password:       Multiple-system operator account password.
146     usenetrc:          Use netrc for authentication instead.
147     verbose:           Print additional info to stdout.
148     quiet:             Do not print messages to stdout.
149     no_warnings:       Do not print out anything for warnings.
150     forceurl:          Force printing final URL.
151     forcetitle:        Force printing title.
152     forceid:           Force printing ID.
153     forcethumbnail:    Force printing thumbnail URL.
154     forcedescription:  Force printing description.
155     forcefilename:     Force printing final filename.
156     forceduration:     Force printing duration.
157     forcejson:         Force printing info_dict as JSON.
158     dump_single_json:  Force printing the info_dict of the whole playlist
159                        (or video) as a single JSON line.
160     simulate:          Do not download the video files.
161     format:            Video format code. See options.py for more information.
162     outtmpl:           Template for output names.
163     restrictfilenames: Do not allow "&" and spaces in file names
164     ignoreerrors:      Do not stop on download errors.
165     force_generic_extractor: Force downloader to use the generic extractor
166     nooverwrites:      Prevent overwriting files.
167     playliststart:     Playlist item to start at.
168     playlistend:       Playlist item to end at.
169     playlist_items:    Specific indices of playlist to download.
170     playlistreverse:   Download playlist items in reverse order.
171     playlistrandom:    Download playlist items in random order.
172     matchtitle:        Download only matching titles.
173     rejecttitle:       Reject downloads for matching titles.
174     logger:            Log messages to a logging.Logger instance.
175     logtostderr:       Log messages to stderr instead of stdout.
176     writedescription:  Write the video description to a .description file
177     writeinfojson:     Write the video description to a .info.json file
178     writeannotations:  Write the video annotations to a .annotations.xml file
179     writethumbnail:    Write the thumbnail image to a file
180     write_all_thumbnails:  Write all thumbnail formats to files
181     writesubtitles:    Write the video subtitles to a file
182     writeautomaticsub: Write the automatically generated subtitles to a file
183     allsubtitles:      Downloads all the subtitles of the video
184                        (requires writesubtitles or writeautomaticsub)
185     listsubtitles:     Lists all available subtitles for the video
186     subtitlesformat:   The format code for subtitles
187     subtitleslangs:    List of languages of the subtitles to download
188     keepvideo:         Keep the video file after post-processing
189     daterange:         A DateRange object, download only if the upload_date is in the range.
190     skip_download:     Skip the actual download of the video file
191     cachedir:          Location of the cache files in the filesystem.
192                        False to disable filesystem cache.
193     noplaylist:        Download single video instead of a playlist if in doubt.
194     age_limit:         An integer representing the user's age in years.
195                        Unsuitable videos for the given age are skipped.
196     min_views:         An integer representing the minimum view count the video
197                        must have in order to not be skipped.
198                        Videos without view count information are always
199                        downloaded. None for no limit.
200     max_views:         An integer representing the maximum view count.
201                        Videos that are more popular than that are not
202                        downloaded.
203                        Videos without view count information are always
204                        downloaded. None for no limit.
205     download_archive:  File name of a file where all downloads are recorded.
206                        Videos already present in the file are not downloaded
207                        again.
208     cookiefile:        File name where cookies should be read from and dumped to.
209     nocheckcertificate:Do not verify SSL certificates
210     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
211                        At the moment, this is only supported by YouTube.
212     proxy:             URL of the proxy server to use
213     geo_verification_proxy:  URL of the proxy to use for IP address verification
214                        on geo-restricted sites. (Experimental)
215     socket_timeout:    Time to wait for unresponsive hosts, in seconds
216     bidi_workaround:   Work around buggy terminals without bidirectional text
217                        support, using fribidi
218     debug_printtraffic:Print out sent and received HTTP traffic
219     include_ads:       Download ads as well
220     default_search:    Prepend this string if an input url is not valid.
221                        'auto' for elaborate guessing
222     encoding:          Use this encoding instead of the system-specified.
223     extract_flat:      Do not resolve URLs, return the immediate result.
224                        Pass in 'in_playlist' to only show this behavior for
225                        playlist items.
226     postprocessors:    A list of dictionaries, each with an entry
227                        * key:  The name of the postprocessor. See
228                                youtube_dl/postprocessor/__init__.py for a list.
229                        as well as any further keyword arguments for the
230                        postprocessor.
231     progress_hooks:    A list of functions that get called on download
232                        progress, with a dictionary with the entries
233                        * status: One of "downloading", "error", or "finished".
234                                  Check this first and ignore unknown values.
235
236                        If status is one of "downloading", or "finished", the
237                        following properties may also be present:
238                        * filename: The final filename (always present)
239                        * tmpfilename: The filename we're currently writing to
240                        * downloaded_bytes: Bytes on disk
241                        * total_bytes: Size of the whole file, None if unknown
242                        * total_bytes_estimate: Guess of the eventual file size,
243                                                None if unavailable.
244                        * elapsed: The number of seconds since download started.
245                        * eta: The estimated time in seconds, None if unknown
246                        * speed: The download speed in bytes/second, None if
247                                 unknown
248                        * fragment_index: The counter of the currently
249                                          downloaded video fragment.
250                        * fragment_count: The number of fragments (= individual
251                                          files that will be merged)
252
253                        Progress hooks are guaranteed to be called at least once
254                        (with status "finished") if the download is successful.
255     merge_output_format: Extension to use when merging formats.
256     fixup:             Automatically correct known faults of the file.
257                        One of:
258                        - "never": do nothing
259                        - "warn": only emit a warning
260                        - "detect_or_warn": check whether we can do anything
261                                            about it, warn otherwise (default)
262     source_address:    (Experimental) Client-side IP address to bind to.
263     call_home:         Boolean, true iff we are allowed to contact the
264                        youtube-dl servers for debugging.
265     sleep_interval:    Number of seconds to sleep before each download when
266                        used alone or a lower bound of a range for randomized
267                        sleep before each download (minimum possible number
268                        of seconds to sleep) when used along with
269                        max_sleep_interval.
270     max_sleep_interval:Upper bound of a range for randomized sleep before each
271                        download (maximum possible number of seconds to sleep).
272                        Must only be used along with sleep_interval.
273                        Actual sleep time will be a random float from range
274                        [sleep_interval; max_sleep_interval].
275     listformats:       Print an overview of available video formats and exit.
276     list_thumbnails:   Print a table of all thumbnails and exit.
277     match_filter:      A function that gets called with the info_dict of
278                        every video.
279                        If it returns a message, the video is ignored.
280                        If it returns None, the video is downloaded.
281                        match_filter_func in utils.py is one example for this.
282     no_color:          Do not emit color codes in output.
283     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
284                        HTTP header (experimental)
285     geo_bypass_country:
286                        Two-letter ISO 3166-2 country code that will be used for
287                        explicit geographic restriction bypassing via faking
288                        X-Forwarded-For HTTP header (experimental)
289
290     The following options determine which downloader is picked:
291     external_downloader: Executable of the external downloader to call.
292                        None or unset for standard (built-in) downloader.
293     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
294                        if True, otherwise use ffmpeg/avconv if False, otherwise
295                        use downloader suggested by extractor if None.
296
297     The following parameters are not used by YoutubeDL itself, they are used by
298     the downloader (see youtube_dl/downloader/common.py):
299     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
300     noresizebuffer, retries, continuedl, noprogress, consoletitle,
301     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
302
303     The following options are used by the post processors:
304     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
305                        otherwise prefer avconv.
306     postprocessor_args: A list of additional command-line arguments for the
307                         postprocessor.
308
309     The following options are used by the Youtube extractor:
310     youtube_include_dash_manifest: If True (default), DASH manifests and related
311                         data will be downloaded and processed by extractor.
312                         You can reduce network I/O by disabling it if you don't
313                         care about DASH.
314     """
315
    # info_dict fields that hold numeric values; used by prepare_filename to
    # patch the output template when such a field is missing (the 'NA'
    # placeholder string would otherwise break %d-style format specifiers).
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level defaults; the real values are assigned per instance in
    # __init__.
    params = None  # options dictionary (see class docstring)
    _ies = []  # registered InfoExtractor classes/instances
    _pps = []  # registered PostProcessor chain
    _download_retcode = None  # 0 on success, set to 1 by trouble() when errors are ignored
    _num_downloads = None  # running download counter (feeds %(autonumber)s)
    _screen_file = None  # stream for screen output (stdout, or stderr with logtostderr)
333
334     def __init__(self, params=None, auto_init=True):
335         """Create a FileDownloader object with the given options."""
336         if params is None:
337             params = {}
338         self._ies = []
339         self._ies_instances = {}
340         self._pps = []
341         self._progress_hooks = []
342         self._download_retcode = 0
343         self._num_downloads = 0
344         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
345         self._err_file = sys.stderr
346         self.params = {
347             # Default parameters
348             'nocheckcertificate': False,
349         }
350         self.params.update(params)
351         self.cache = Cache(self)
352
353         def check_deprecated(param, option, suggestion):
354             if self.params.get(param) is not None:
355                 self.report_warning(
356                     '%s is deprecated. Use %s instead.' % (option, suggestion))
357                 return True
358             return False
359
360         if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
361             if self.params.get('geo_verification_proxy') is None:
362                 self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
363
364         check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
365         check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
366         check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
367
368         if params.get('bidi_workaround', False):
369             try:
370                 import pty
371                 master, slave = pty.openpty()
372                 width = compat_get_terminal_size().columns
373                 if width is None:
374                     width_args = []
375                 else:
376                     width_args = ['-w', str(width)]
377                 sp_kwargs = dict(
378                     stdin=subprocess.PIPE,
379                     stdout=slave,
380                     stderr=self._err_file)
381                 try:
382                     self._output_process = subprocess.Popen(
383                         ['bidiv'] + width_args, **sp_kwargs
384                     )
385                 except OSError:
386                     self._output_process = subprocess.Popen(
387                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
388                 self._output_channel = os.fdopen(master, 'rb')
389             except OSError as ose:
390                 if ose.errno == errno.ENOENT:
391                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
392                 else:
393                     raise
394
395         if (sys.platform != 'win32' and
396                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
397                 not params.get('restrictfilenames', False)):
398             # Unicode filesystem API will throw errors (#1474, #13027)
399             self.report_warning(
400                 'Assuming --restrict-filenames since file system encoding '
401                 'cannot encode all characters. '
402                 'Set the LC_ALL environment variable to fix this.')
403             self.params['restrictfilenames'] = True
404
405         if isinstance(params.get('outtmpl'), bytes):
406             self.report_warning(
407                 'Parameter outtmpl is bytes, but should be a unicode string. '
408                 'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
409
410         self._setup_opener()
411
412         if auto_init:
413             self.print_debug_header()
414             self.add_default_info_extractors()
415
416         for pp_def_raw in self.params.get('postprocessors', []):
417             pp_class = get_postprocessor(pp_def_raw['key'])
418             pp_def = dict(pp_def_raw)
419             del pp_def['key']
420             pp = pp_class(self, **compat_kwargs(pp_def))
421             self.add_post_processor(pp)
422
423         for ph in self.params.get('progress_hooks', []):
424             self.add_progress_hook(ph)
425
426         register_socks_protocols()
427
428     def warn_if_short_id(self, argv):
429         # short YouTube ID starting with dash?
430         idxs = [
431             i for i, a in enumerate(argv)
432             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
433         if idxs:
434             correct_argv = (
435                 ['youtube-dl'] +
436                 [a for i, a in enumerate(argv) if i not in idxs] +
437                 ['--'] + [argv[i] for i in idxs]
438             )
439             self.report_warning(
440                 'Long argument string detected. '
441                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
442                 args_to_str(correct_argv))
443
444     def add_info_extractor(self, ie):
445         """Add an InfoExtractor object to the end of the list."""
446         self._ies.append(ie)
447         if not isinstance(ie, type):
448             self._ies_instances[ie.ie_key()] = ie
449             ie.set_downloader(self)
450
451     def get_info_extractor(self, ie_key):
452         """
453         Get an instance of an IE with name ie_key, it will try to get one from
454         the _ies list, if there's no instance it will create a new one and add
455         it to the extractor list.
456         """
457         ie = self._ies_instances.get(ie_key)
458         if ie is None:
459             ie = get_info_extractor(ie_key)()
460             self.add_info_extractor(ie)
461         return ie
462
463     def add_default_info_extractors(self):
464         """
465         Add the InfoExtractors returned by gen_extractors to the end of the list
466         """
467         for ie in gen_extractor_classes():
468             self.add_info_extractor(ie)
469
470     def add_post_processor(self, pp):
471         """Add a PostProcessor object to the end of the chain."""
472         self._pps.append(pp)
473         pp.set_downloader(self)
474
475     def add_progress_hook(self, ph):
476         """Add the progress hook (currently only for the file downloader)"""
477         self._progress_hooks.append(ph)
478
479     def _bidi_workaround(self, message):
480         if not hasattr(self, '_output_channel'):
481             return message
482
483         assert hasattr(self, '_output_process')
484         assert isinstance(message, compat_str)
485         line_count = message.count('\n') + 1
486         self._output_process.stdin.write((message + '\n').encode('utf-8'))
487         self._output_process.stdin.flush()
488         res = ''.join(self._output_channel.readline().decode('utf-8')
489                       for _ in range(line_count))
490         return res[:-len('\n')]
491
492     def to_screen(self, message, skip_eol=False):
493         """Print message to stdout if not in quiet mode."""
494         return self.to_stdout(message, skip_eol, check_quiet=True)
495
496     def _write_string(self, s, out=None):
497         write_string(s, out=out, encoding=self.params.get('encoding'))
498
499     def to_stdout(self, message, skip_eol=False, check_quiet=False):
500         """Print message to stdout if not in quiet mode."""
501         if self.params.get('logger'):
502             self.params['logger'].debug(message)
503         elif not check_quiet or not self.params.get('quiet', False):
504             message = self._bidi_workaround(message)
505             terminator = ['\n', ''][skip_eol]
506             output = message + terminator
507
508             self._write_string(output, self._screen_file)
509
510     def to_stderr(self, message):
511         """Print message to stderr."""
512         assert isinstance(message, compat_str)
513         if self.params.get('logger'):
514             self.params['logger'].error(message)
515         else:
516             message = self._bidi_workaround(message)
517             output = message + '\n'
518             self._write_string(output, self._err_file)
519
520     def to_console_title(self, message):
521         if not self.params.get('consoletitle', False):
522             return
523         if compat_os_name == 'nt':
524             if ctypes.windll.kernel32.GetConsoleWindow():
525                 # c_wchar_p() might not be necessary if `message` is
526                 # already of type unicode()
527                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
528         elif 'TERM' in os.environ:
529             self._write_string('\033]0;%s\007' % message, self._screen_file)
530
531     def save_console_title(self):
532         if not self.params.get('consoletitle', False):
533             return
534         if compat_os_name != 'nt' and 'TERM' in os.environ:
535             # Save the title on stack
536             self._write_string('\033[22;0t', self._screen_file)
537
538     def restore_console_title(self):
539         if not self.params.get('consoletitle', False):
540             return
541         if compat_os_name != 'nt' and 'TERM' in os.environ:
542             # Restore the title from stack
543             self._write_string('\033[23;0t', self._screen_file)
544
545     def __enter__(self):
546         self.save_console_title()
547         return self
548
549     def __exit__(self, *args):
550         self.restore_console_title()
551
552         if self.params.get('cookiefile') is not None:
553             self.cookiejar.save()
554
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            # In verbose mode also print a traceback for the problem.
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Prefer the exc_info carried inside the active exception
                    # (see the DownloadError raised below), falling back to
                    # the current traceback.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show the current call stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, preserving the most specific
            # exc_info available.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # Errors are being ignored: record the failure for the return code.
        self._download_retcode = 1
584
585     def report_warning(self, message):
586         '''
587         Print the message to stderr, it will be prefixed with 'WARNING:'
588         If stderr is a tty file the 'WARNING:' will be colored
589         '''
590         if self.params.get('logger') is not None:
591             self.params['logger'].warning(message)
592         else:
593             if self.params.get('no_warnings'):
594                 return
595             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
596                 _msg_header = '\033[0;33mWARNING:\033[0m'
597             else:
598                 _msg_header = 'WARNING:'
599             warning_message = '%s %s' % (_msg_header, message)
600             self.to_stderr(warning_message)
601
602     def report_error(self, message, tb=None):
603         '''
604         Do the same as trouble, but prefixes the message with 'ERROR:', colored
605         in red if stderr is a tty file.
606         '''
607         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
608             _msg_header = '\033[0;31mERROR:\033[0m'
609         else:
610             _msg_header = 'ERROR:'
611         error_message = '%s %s' % (_msg_header, message)
612         self.trouble(error_message, tb)
613
614     def report_file_already_downloaded(self, file_name):
615         """Report file has already been fully downloaded."""
616         try:
617             self.to_screen('[download] %s has already been downloaded' % file_name)
618         except UnicodeEncodeError:
619             self.to_screen('[download] The file has already been downloaded')
620
621     def prepare_filename(self, info_dict):
622         """Generate the output filename."""
623         try:
624             template_dict = dict(info_dict)
625
626             template_dict['epoch'] = int(time.time())
627             autonumber_size = self.params.get('autonumber_size')
628             if autonumber_size is None:
629                 autonumber_size = 5
630             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
631             if template_dict.get('resolution') is None:
632                 if template_dict.get('width') and template_dict.get('height'):
633                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
634                 elif template_dict.get('height'):
635                     template_dict['resolution'] = '%sp' % template_dict['height']
636                 elif template_dict.get('width'):
637                     template_dict['resolution'] = '%dx?' % template_dict['width']
638
639             sanitize = lambda k, v: sanitize_filename(
640                 compat_str(v),
641                 restricted=self.params.get('restrictfilenames'),
642                 is_id=(k == 'id' or k.endswith('_id')))
643             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
644                                  for k, v in template_dict.items()
645                                  if v is not None and not isinstance(v, (list, tuple, dict)))
646             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
647
648             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
649
650             # For fields playlist_index and autonumber convert all occurrences
651             # of %(field)s to %(field)0Nd for backward compatibility
652             field_size_compat_map = {
653                 'playlist_index': len(str(template_dict['n_entries'])),
654                 'autonumber': autonumber_size,
655             }
656             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
657             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
658             if mobj:
659                 outtmpl = re.sub(
660                     FIELD_SIZE_COMPAT_RE,
661                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
662                     outtmpl)
663
664             # Missing numeric fields used together with integer presentation types
665             # in format specification will break the argument substitution since
666             # string 'NA' is returned for missing fields. We will patch output
667             # template for missing fields to meet string presentation type.
668             for numeric_field in self._NUMERIC_FIELDS:
669                 if numeric_field not in template_dict:
670                     # As of [1] format syntax is:
671                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
672                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
673                     FORMAT_RE = r'''(?x)
674                         (?<!%)
675                         %
676                         \({0}\)  # mapping key
677                         (?:[#0\-+ ]+)?  # conversion flags (optional)
678                         (?:\d+)?  # minimum field width (optional)
679                         (?:\.\d+)?  # precision (optional)
680                         [hlL]?  # length modifier (optional)
681                         [diouxXeEfFgGcrs%]  # conversion type
682                     '''
683                     outtmpl = re.sub(
684                         FORMAT_RE.format(numeric_field),
685                         r'%({0})s'.format(numeric_field), outtmpl)
686
687             # expand_path translates '%%' into '%' and '$$' into '$'
688             # correspondingly that is not what we want since we need to keep
689             # '%%' intact for template dict substitution step. Working around
690             # with boundary-alike separator hack.
691             sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
692             outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
693
694             # outtmpl should be expand_path'ed before template dict substitution
695             # because meta fields may contain env variables we don't want to
696             # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
697             # title "Hello $PATH", we don't want `$PATH` to be expanded.
698             filename = expand_path(outtmpl).replace(sep, '') % template_dict
699
700             # Temporary fix for #4787
701             # 'Treat' all problem characters by passing filename through preferredencoding
702             # to workaround encoding issues with subprocess on python2 @ Windows
703             if sys.version_info < (3, 0) and sys.platform == 'win32':
704                 filename = encodeFilename(filename, True).decode(preferredencoding())
705             return sanitize_path(filename)
706         except ValueError as err:
707             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
708             return None
709
710     def _match_entry(self, info_dict, incomplete):
711         """ Returns None iff the file should be downloaded """
712
713         video_title = info_dict.get('title', info_dict.get('id', 'video'))
714         if 'title' in info_dict:
715             # This can happen when we're just evaluating the playlist
716             title = info_dict['title']
717             matchtitle = self.params.get('matchtitle', False)
718             if matchtitle:
719                 if not re.search(matchtitle, title, re.IGNORECASE):
720                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
721             rejecttitle = self.params.get('rejecttitle', False)
722             if rejecttitle:
723                 if re.search(rejecttitle, title, re.IGNORECASE):
724                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
725         date = info_dict.get('upload_date')
726         if date is not None:
727             dateRange = self.params.get('daterange', DateRange())
728             if date not in dateRange:
729                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
730         view_count = info_dict.get('view_count')
731         if view_count is not None:
732             min_views = self.params.get('min_views')
733             if min_views is not None and view_count < min_views:
734                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
735             max_views = self.params.get('max_views')
736             if max_views is not None and view_count > max_views:
737                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
738         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
739             return 'Skipping "%s" because it is age restricted' % video_title
740         if self.in_download_archive(info_dict):
741             return '%s has already been recorded in archive' % video_title
742
743         if not incomplete:
744             match_filter = self.params.get('match_filter')
745             if match_filter is not None:
746                 ret = match_filter(info_dict)
747                 if ret is not None:
748                     return ret
749
750         return None
751
752     @staticmethod
753     def add_extra_info(info_dict, extra_info):
754         '''Set the keys from extra_info in info dict if they are missing'''
755         for key, value in extra_info.items():
756             info_dict.setdefault(key, value)
757
758     def extract_info(self, url, download=True, ie_key=None, extra_info={},
759                      process=True, force_generic_extractor=False):
760         '''
761         Returns a list with a dictionary for each video we find.
762         If 'download', also downloads the videos.
763         extra_info is a dict containing the extra values to add to each result
764         '''
765
766         if not ie_key and force_generic_extractor:
767             ie_key = 'Generic'
768
769         if ie_key:
770             ies = [self.get_info_extractor(ie_key)]
771         else:
772             ies = self._ies
773
774         for ie in ies:
775             if not ie.suitable(url):
776                 continue
777
778             ie = self.get_info_extractor(ie.ie_key())
779             if not ie.working():
780                 self.report_warning('The program functionality for this site has been marked as broken, '
781                                     'and will probably not work.')
782
783             try:
784                 ie_result = ie.extract(url)
785                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
786                     break
787                 if isinstance(ie_result, list):
788                     # Backwards compatibility: old IE result format
789                     ie_result = {
790                         '_type': 'compat_list',
791                         'entries': ie_result,
792                     }
793                 self.add_default_extra_info(ie_result, ie, url)
794                 if process:
795                     return self.process_ie_result(ie_result, download, extra_info)
796                 else:
797                     return ie_result
798             except GeoRestrictedError as e:
799                 msg = e.msg
800                 if e.countries:
801                     msg += '\nThis video is available in %s.' % ', '.join(
802                         map(ISO3166Utils.short2full, e.countries))
803                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
804                 self.report_error(msg)
805                 break
806             except ExtractorError as e:  # An error we somewhat expected
807                 self.report_error(compat_str(e), e.format_traceback())
808                 break
809             except MaxDownloadsReached:
810                 raise
811             except Exception as e:
812                 if self.params.get('ignoreerrors', False):
813                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
814                     break
815                 else:
816                     raise
817         else:
818             self.report_error('no suitable InfoExtractor for URL %s' % url)
819
820     def add_default_extra_info(self, ie_result, ie, url):
821         self.add_extra_info(ie_result, {
822             'extractor': ie.IE_NAME,
823             'webpage_url': url,
824             'webpage_url_basename': url_basename(url),
825             'extractor_key': ie.ie_key(),
826         })
827
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.

        ie_result's '_type' key ('video', 'url', 'url_transparent',
        'playlist', 'multi_video' or 'compat_list'; default 'video')
        selects how the result is resolved; extra_info is merged into
        every resolved result.

        NOTE(review): the mutable default extra_info={} is shared across
        calls; it is only read here, but confirm before mutating it.
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            extract_flat = self.params.get('extract_flat', False)
            # With --flat-playlist (extract_flat is True), or 'in_playlist'
            # while already inside a playlist, return the unresolved result
            # as-is instead of following the URL.
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # extract_info may return None when ignoreerrors is enabled and
            # extraction failed with an error, don't crash and return early
            # in this case
            if not info:
                return info

            # Non-None values from the embedding page override the freshly
            # extracted info, except for the identity keys below which must
            # come from the inner extraction.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Extracted info may not be a video result (i.e.
            # info.get('_type', 'video') != video) but rather an url or
            # url_transparent. In such cases outer metadata (from ie_result)
            # should be propagated to inner one (info). For this to happen
            # _type of info should be overridden with url_transparent. This
            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
            if new_result.get('_type') == 'url':
                new_result['_type'] = 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type in ('playlist', 'multi_video'):
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # --playlist-start is 1-based on the CLI; convert to 0-based
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                # Parse a spec like "1-3,7,10-13" into individual 1-based
                # item indices.
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                # orderedSet drops duplicate indices while keeping the
                # first-seen order.
                playlistitems = orderedSet(iter_playlistitems(playlistitems_str))

            ie_entries = ie_result['entries']

            # Select the requested 1-based items, silently skipping indices
            # that fall outside the playlist (negative indices count from
            # the end, list-style).
            def make_playlistitems_entries(list_ie_entries):
                num_entries = len(list_ie_entries)
                return [
                    list_ie_entries[i - 1] for i in playlistitems
                    if -num_entries <= i - 1 < num_entries]

            def report_download(num_entries):
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, num_entries))

            # Entries may be a plain list, a lazily-fetched PagedList, or
            # any other iterable; each needs its own slicing strategy.
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = make_playlistitems_entries(ie_entries)
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    # Fetch each requested item as its own one-element slice
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                report_download(n_entries)
            else:  # iterable
                if playlistitems:
                    entries = make_playlistitems_entries(list(ie_entries))
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                report_download(n_entries)

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            if self.params.get('playlistrandom', False):
                random.shuffle(entries)

            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # This __x_forwarded_for_ip thing is a bit ugly but requires
                # minimal changes
                if x_forwarded_for:
                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    # NOTE(review): with --playlist-items this is the
                    # position within the selected entries offset by
                    # playliststart, not the item's original playlist
                    # position — confirm this is intended.
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: entry metadata may still be partial here,
                # so only filters that tolerate missing fields are applied.
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            # Upgrade each legacy entry with the standard metadata before
            # resolving it recursively.
            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
1019
1020     def _build_format_filter(self, filter_spec):
1021         " Returns a function to filter the formats according to the filter_spec "
1022
1023         OPERATORS = {
1024             '<': operator.lt,
1025             '<=': operator.le,
1026             '>': operator.gt,
1027             '>=': operator.ge,
1028             '=': operator.eq,
1029             '!=': operator.ne,
1030         }
1031         operator_rex = re.compile(r'''(?x)\s*
1032             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
1033             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1034             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1035             $
1036             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1037         m = operator_rex.search(filter_spec)
1038         if m:
1039             try:
1040                 comparison_value = int(m.group('value'))
1041             except ValueError:
1042                 comparison_value = parse_filesize(m.group('value'))
1043                 if comparison_value is None:
1044                     comparison_value = parse_filesize(m.group('value') + 'B')
1045                 if comparison_value is None:
1046                     raise ValueError(
1047                         'Invalid value %r in format specification %r' % (
1048                             m.group('value'), filter_spec))
1049             op = OPERATORS[m.group('op')]
1050
1051         if not m:
1052             STR_OPERATORS = {
1053                 '=': operator.eq,
1054                 '!=': operator.ne,
1055                 '^=': lambda attr, value: attr.startswith(value),
1056                 '$=': lambda attr, value: attr.endswith(value),
1057                 '*=': lambda attr, value: value in attr,
1058             }
1059             str_operator_rex = re.compile(r'''(?x)
1060                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1061                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1062                 \s*(?P<value>[a-zA-Z0-9._-]+)
1063                 \s*$
1064                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1065             m = str_operator_rex.search(filter_spec)
1066             if m:
1067                 comparison_value = m.group('value')
1068                 op = STR_OPERATORS[m.group('op')]
1069
1070         if not m:
1071             raise ValueError('Invalid filter specification %r' % filter_spec)
1072
1073         def _filter(f):
1074             actual_value = f.get(m.group('key'))
1075             if actual_value is None:
1076                 return m.group('none_inclusive')
1077             return op(actual_value, comparison_value)
1078         return _filter
1079
1080     def _default_format_spec(self, info_dict, download=True):
1081         req_format_list = []
1082
1083         def can_have_partial_formats():
1084             if self.params.get('simulate', False):
1085                 return True
1086             if not download:
1087                 return True
1088             if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
1089                 return False
1090             if info_dict.get('is_live'):
1091                 return False
1092             merger = FFmpegMergerPP(self)
1093             return merger.available and merger.can_merge()
1094         if can_have_partial_formats():
1095             req_format_list.append('bestvideo+bestaudio')
1096         req_format_list.append('best')
1097         return '/'.join(req_format_list)
1098
1099     def build_format_selector(self, format_spec):
1100         def syntax_error(note, start):
1101             message = (
1102                 'Invalid format specification: '
1103                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1104             return SyntaxError(message)
1105
1106         PICKFIRST = 'PICKFIRST'
1107         MERGE = 'MERGE'
1108         SINGLE = 'SINGLE'
1109         GROUP = 'GROUP'
1110         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1111
1112         def _parse_filter(tokens):
1113             filter_parts = []
1114             for type, string, start, _, _ in tokens:
1115                 if type == tokenize.OP and string == ']':
1116                     return ''.join(filter_parts)
1117                 else:
1118                     filter_parts.append(string)
1119
1120         def _remove_unused_ops(tokens):
1121             # Remove operators that we don't use and join them with the surrounding strings
1122             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1123             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1124             last_string, last_start, last_end, last_line = None, None, None, None
1125             for type, string, start, end, line in tokens:
1126                 if type == tokenize.OP and string == '[':
1127                     if last_string:
1128                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1129                         last_string = None
1130                     yield type, string, start, end, line
1131                     # everything inside brackets will be handled by _parse_filter
1132                     for type, string, start, end, line in tokens:
1133                         yield type, string, start, end, line
1134                         if type == tokenize.OP and string == ']':
1135                             break
1136                 elif type == tokenize.OP and string in ALLOWED_OPS:
1137                     if last_string:
1138                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1139                         last_string = None
1140                     yield type, string, start, end, line
1141                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1142                     if not last_string:
1143                         last_string = string
1144                         last_start = start
1145                         last_end = end
1146                     else:
1147                         last_string += string
1148             if last_string:
1149                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1150
1151         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1152             selectors = []
1153             current_selector = None
1154             for type, string, start, _, _ in tokens:
1155                 # ENCODING is only defined in python 3.x
1156                 if type == getattr(tokenize, 'ENCODING', None):
1157                     continue
1158                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1159                     current_selector = FormatSelector(SINGLE, string, [])
1160                 elif type == tokenize.OP:
1161                     if string == ')':
1162                         if not inside_group:
1163                             # ')' will be handled by the parentheses group
1164                             tokens.restore_last_token()
1165                         break
1166                     elif inside_merge and string in ['/', ',']:
1167                         tokens.restore_last_token()
1168                         break
1169                     elif inside_choice and string == ',':
1170                         tokens.restore_last_token()
1171                         break
1172                     elif string == ',':
1173                         if not current_selector:
1174                             raise syntax_error('"," must follow a format selector', start)
1175                         selectors.append(current_selector)
1176                         current_selector = None
1177                     elif string == '/':
1178                         if not current_selector:
1179                             raise syntax_error('"/" must follow a format selector', start)
1180                         first_choice = current_selector
1181                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1182                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1183                     elif string == '[':
1184                         if not current_selector:
1185                             current_selector = FormatSelector(SINGLE, 'best', [])
1186                         format_filter = _parse_filter(tokens)
1187                         current_selector.filters.append(format_filter)
1188                     elif string == '(':
1189                         if current_selector:
1190                             raise syntax_error('Unexpected "("', start)
1191                         group = _parse_format_selection(tokens, inside_group=True)
1192                         current_selector = FormatSelector(GROUP, group, [])
1193                     elif string == '+':
1194                         video_selector = current_selector
1195                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1196                         if not video_selector or not audio_selector:
1197                             raise syntax_error('"+" must be between two format selectors', start)
1198                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1199                     else:
1200                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1201                 elif type == tokenize.ENDMARKER:
1202                     break
1203             if current_selector:
1204                 selectors.append(current_selector)
1205             return selectors
1206
1207         def _build_selector_function(selector):
1208             if isinstance(selector, list):
1209                 fs = [_build_selector_function(s) for s in selector]
1210
1211                 def selector_function(ctx):
1212                     for f in fs:
1213                         for format in f(ctx):
1214                             yield format
1215                 return selector_function
1216             elif selector.type == GROUP:
1217                 selector_function = _build_selector_function(selector.selector)
1218             elif selector.type == PICKFIRST:
1219                 fs = [_build_selector_function(s) for s in selector.selector]
1220
1221                 def selector_function(ctx):
1222                     for f in fs:
1223                         picked_formats = list(f(ctx))
1224                         if picked_formats:
1225                             return picked_formats
1226                     return []
1227             elif selector.type == SINGLE:
1228                 format_spec = selector.selector
1229
1230                 def selector_function(ctx):
1231                     formats = list(ctx['formats'])
1232                     if not formats:
1233                         return
1234                     if format_spec == 'all':
1235                         for f in formats:
1236                             yield f
1237                     elif format_spec in ['best', 'worst', None]:
1238                         format_idx = 0 if format_spec == 'worst' else -1
1239                         audiovideo_formats = [
1240                             f for f in formats
1241                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1242                         if audiovideo_formats:
1243                             yield audiovideo_formats[format_idx]
1244                         # for extractors with incomplete formats (audio only (soundcloud)
1245                         # or video only (imgur)) we will fallback to best/worst
1246                         # {video,audio}-only format
1247                         elif ctx['incomplete_formats']:
1248                             yield formats[format_idx]
1249                     elif format_spec == 'bestaudio':
1250                         audio_formats = [
1251                             f for f in formats
1252                             if f.get('vcodec') == 'none']
1253                         if audio_formats:
1254                             yield audio_formats[-1]
1255                     elif format_spec == 'worstaudio':
1256                         audio_formats = [
1257                             f for f in formats
1258                             if f.get('vcodec') == 'none']
1259                         if audio_formats:
1260                             yield audio_formats[0]
1261                     elif format_spec == 'bestvideo':
1262                         video_formats = [
1263                             f for f in formats
1264                             if f.get('acodec') == 'none']
1265                         if video_formats:
1266                             yield video_formats[-1]
1267                     elif format_spec == 'worstvideo':
1268                         video_formats = [
1269                             f for f in formats
1270                             if f.get('acodec') == 'none']
1271                         if video_formats:
1272                             yield video_formats[0]
1273                     else:
1274                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1275                         if format_spec in extensions:
1276                             filter_f = lambda f: f['ext'] == format_spec
1277                         else:
1278                             filter_f = lambda f: f['format_id'] == format_spec
1279                         matches = list(filter(filter_f, formats))
1280                         if matches:
1281                             yield matches[-1]
1282             elif selector.type == MERGE:
1283                 def _merge(formats_info):
1284                     format_1, format_2 = [f['format_id'] for f in formats_info]
1285                     # The first format must contain the video and the
1286                     # second the audio
1287                     if formats_info[0].get('vcodec') == 'none':
1288                         self.report_error('The first format must '
1289                                           'contain the video, try using '
1290                                           '"-f %s+%s"' % (format_2, format_1))
1291                         return
1292                     # Formats must be opposite (video+audio)
1293                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1294                         self.report_error(
1295                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1296                             % (format_1, format_2))
1297                         return
1298                     output_ext = (
1299                         formats_info[0]['ext']
1300                         if self.params.get('merge_output_format') is None
1301                         else self.params['merge_output_format'])
1302                     return {
1303                         'requested_formats': formats_info,
1304                         'format': '%s+%s' % (formats_info[0].get('format'),
1305                                              formats_info[1].get('format')),
1306                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1307                                                 formats_info[1].get('format_id')),
1308                         'width': formats_info[0].get('width'),
1309                         'height': formats_info[0].get('height'),
1310                         'resolution': formats_info[0].get('resolution'),
1311                         'fps': formats_info[0].get('fps'),
1312                         'vcodec': formats_info[0].get('vcodec'),
1313                         'vbr': formats_info[0].get('vbr'),
1314                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1315                         'acodec': formats_info[1].get('acodec'),
1316                         'abr': formats_info[1].get('abr'),
1317                         'ext': output_ext,
1318                     }
1319                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1320
1321                 def selector_function(ctx):
1322                     for pair in itertools.product(
1323                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1324                         yield _merge(pair)
1325
1326             filters = [self._build_format_filter(f) for f in selector.filters]
1327
1328             def final_selector(ctx):
1329                 ctx_copy = copy.deepcopy(ctx)
1330                 for _filter in filters:
1331                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1332                 return selector_function(ctx_copy)
1333             return final_selector
1334
1335         stream = io.BytesIO(format_spec.encode('utf-8'))
1336         try:
1337             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1338         except tokenize.TokenError:
1339             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1340
1341         class TokenIterator(object):
1342             def __init__(self, tokens):
1343                 self.tokens = tokens
1344                 self.counter = 0
1345
1346             def __iter__(self):
1347                 return self
1348
1349             def __next__(self):
1350                 if self.counter >= len(self.tokens):
1351                     raise StopIteration()
1352                 value = self.tokens[self.counter]
1353                 self.counter += 1
1354                 return value
1355
1356             next = __next__
1357
1358             def restore_last_token(self):
1359                 self.counter -= 1
1360
1361         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1362         return _build_selector_function(parsed_selector)
1363
1364     def _calc_headers(self, info_dict):
1365         res = std_headers.copy()
1366
1367         add_headers = info_dict.get('http_headers')
1368         if add_headers:
1369             res.update(add_headers)
1370
1371         cookies = self._calc_cookies(info_dict)
1372         if cookies:
1373             res['Cookie'] = cookies
1374
1375         if 'X-Forwarded-For' not in res:
1376             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1377             if x_forwarded_for_ip:
1378                 res['X-Forwarded-For'] = x_forwarded_for_ip
1379
1380         return res
1381
1382     def _calc_cookies(self, info_dict):
1383         pr = sanitized_Request(info_dict['url'])
1384         self.cookiejar.add_cookie_header(pr)
1385         return pr.get_header('Cookie')
1386
    def process_video_result(self, info_dict, download=True):
        """Complete, sanitize and act on a single video result.

        Fills in missing metadata (thumbnails, upload_date, display_id,
        chapter/season/episode titles, subtitles), normalizes the formats
        list, runs format selection against the user's format spec and,
        when download is True, hands every selected format to
        process_info().

        Returns info_dict, updated in place with the last selected format
        (kept for backwards compatibility).  Returns None early when a
        pure listing option (list_thumbnails, listsubtitles, listformats)
        was requested.  Raises ExtractorError on missing mandatory fields,
        on an empty formats list, or when no format matches the spec.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        def report_force_conversion(field, field_not, conversion):
            # Warn that an extractor returned a wrongly-typed field.
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a non-string field to compat_str, with a warning.
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce every known numeric field to int, with a warning.
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            # Promote a lone 'thumbnail' entry into the 'thumbnails' list
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Sort worst-to-best: by preference, then dimensions, then id/url
            # (missing values sort first via the -1 / '' sentinels)
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '', t.get('url')))
            for i, t in enumerate(thumbnails):
                t['url'] = sanitize_url(t['url'])
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # The list is sorted worst-to-best, so the last one is the best
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
            # see http://bugs.python.org/issue1646728)
            try:
                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
            except (ValueError, OverflowError, OSError):
                pass

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        subtitles = info_dict.get('subtitles')
        if subtitles:
            # Sanitize subtitle URLs and derive missing extensions from them
            for _, subtitle in subtitles.items():
                for subtitle_format in subtitle:
                    if subtitle_format.get('url'):
                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                    if subtitle_format.get('ext') is None:
                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles,
            info_dict.get('automatic_captions'))

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        def is_wellformed(f):
            # A format without a usable 'url' cannot be downloaded
            url = f.get('url')
            if not url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
                return False
            if isinstance(url, bytes):
                sanitize_string_field(f, 'url')
            return True

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats))

        # Maps each format_id to the list of formats claiming it, so that
        # duplicates can be disambiguated below
        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if not format.get('format_id'):
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            req_format = self._default_format_spec(info_dict, download=download)
            if self.params.get('verbose'):
                self.to_stdout('[debug] Default format spec: %s' % req_format)

        format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/rg3/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/rg3/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
            # all formats are audio-only
            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1618
1619     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1620         """Select the requested subtitles and their format"""
1621         available_subs = {}
1622         if normal_subtitles and self.params.get('writesubtitles'):
1623             available_subs.update(normal_subtitles)
1624         if automatic_captions and self.params.get('writeautomaticsub'):
1625             for lang, cap_info in automatic_captions.items():
1626                 if lang not in available_subs:
1627                     available_subs[lang] = cap_info
1628
1629         if (not self.params.get('writesubtitles') and not
1630                 self.params.get('writeautomaticsub') or not
1631                 available_subs):
1632             return None
1633
1634         if self.params.get('allsubtitles', False):
1635             requested_langs = available_subs.keys()
1636         else:
1637             if self.params.get('subtitleslangs', False):
1638                 requested_langs = self.params.get('subtitleslangs')
1639             elif 'en' in available_subs:
1640                 requested_langs = ['en']
1641             else:
1642                 requested_langs = [list(available_subs.keys())[0]]
1643
1644         formats_query = self.params.get('subtitlesformat', 'best')
1645         formats_preference = formats_query.split('/') if formats_query else []
1646         subs = {}
1647         for lang in requested_langs:
1648             formats = available_subs.get(lang)
1649             if formats is None:
1650                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1651                 continue
1652             for ext in formats_preference:
1653                 if ext == 'best':
1654                     f = formats[-1]
1655                     break
1656                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1657                 if matches:
1658                     f = matches[-1]
1659                     break
1660             else:
1661                 f = formats[-1]
1662                 self.report_warning(
1663                     'No subtitle format found matching "%s" for language %s, '
1664                     'using %s' % (formats_query, lang, f['ext']))
1665             subs[lang] = f
1666         return subs
1667
1668     def process_info(self, info_dict):
1669         """Process a single resolved IE result."""
1670
1671         assert info_dict.get('_type', 'video') == 'video'
1672
1673         max_downloads = self.params.get('max_downloads')
1674         if max_downloads is not None:
1675             if self._num_downloads >= int(max_downloads):
1676                 raise MaxDownloadsReached()
1677
1678         info_dict['fulltitle'] = info_dict['title']
1679         if len(info_dict['title']) > 200:
1680             info_dict['title'] = info_dict['title'][:197] + '...'
1681
1682         if 'format' not in info_dict:
1683             info_dict['format'] = info_dict['ext']
1684
1685         reason = self._match_entry(info_dict, incomplete=False)
1686         if reason is not None:
1687             self.to_screen('[download] ' + reason)
1688             return
1689
1690         self._num_downloads += 1
1691
1692         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1693
1694         # Forced printings
1695         if self.params.get('forcetitle', False):
1696             self.to_stdout(info_dict['fulltitle'])
1697         if self.params.get('forceid', False):
1698             self.to_stdout(info_dict['id'])
1699         if self.params.get('forceurl', False):
1700             if info_dict.get('requested_formats') is not None:
1701                 for f in info_dict['requested_formats']:
1702                     self.to_stdout(f['url'] + f.get('play_path', ''))
1703             else:
1704                 # For RTMP URLs, also include the playpath
1705                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1706         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1707             self.to_stdout(info_dict['thumbnail'])
1708         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1709             self.to_stdout(info_dict['description'])
1710         if self.params.get('forcefilename', False) and filename is not None:
1711             self.to_stdout(filename)
1712         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1713             self.to_stdout(formatSeconds(info_dict['duration']))
1714         if self.params.get('forceformat', False):
1715             self.to_stdout(info_dict['format'])
1716         if self.params.get('forcejson', False):
1717             self.to_stdout(json.dumps(info_dict))
1718
1719         # Do nothing else if in simulate mode
1720         if self.params.get('simulate', False):
1721             return
1722
1723         if filename is None:
1724             return
1725
1726         def ensure_dir_exists(path):
1727             try:
1728                 dn = os.path.dirname(path)
1729                 if dn and not os.path.exists(dn):
1730                     os.makedirs(dn)
1731                 return True
1732             except (OSError, IOError) as err:
1733                 self.report_error('unable to create directory ' + error_to_compat_str(err))
1734                 return False
1735
1736         if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
1737             return
1738
1739         if self.params.get('writedescription', False):
1740             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1741             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1742                 self.to_screen('[info] Video description is already present')
1743             elif info_dict.get('description') is None:
1744                 self.report_warning('There\'s no description to write.')
1745             else:
1746                 try:
1747                     self.to_screen('[info] Writing video description to: ' + descfn)
1748                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1749                         descfile.write(info_dict['description'])
1750                 except (OSError, IOError):
1751                     self.report_error('Cannot write description file ' + descfn)
1752                     return
1753
1754         if self.params.get('writeannotations', False):
1755             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1756             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1757                 self.to_screen('[info] Video annotations are already present')
1758             else:
1759                 try:
1760                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1761                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1762                         annofile.write(info_dict['annotations'])
1763                 except (KeyError, TypeError):
1764                     self.report_warning('There are no annotations to write.')
1765                 except (OSError, IOError):
1766                     self.report_error('Cannot write annotations file: ' + annofn)
1767                     return
1768
1769         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1770                                        self.params.get('writeautomaticsub')])
1771
1772         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1773             # subtitles download errors are already managed as troubles in relevant IE
1774             # that way it will silently go on when used with unsupporting IE
1775             subtitles = info_dict['requested_subtitles']
1776             ie = self.get_info_extractor(info_dict['extractor_key'])
1777             for sub_lang, sub_info in subtitles.items():
1778                 sub_format = sub_info['ext']
1779                 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1780                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1781                     self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
1782                 else:
1783                     self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1784                     if sub_info.get('data') is not None:
1785                         try:
1786                             # Use newline='' to prevent conversion of newline characters
1787                             # See https://github.com/rg3/youtube-dl/issues/10268
1788                             with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1789                                 subfile.write(sub_info['data'])
1790                         except (OSError, IOError):
1791                             self.report_error('Cannot write subtitles file ' + sub_filename)
1792                             return
1793                     else:
1794                         try:
1795                             sub_data = ie._request_webpage(
1796                                 sub_info['url'], info_dict['id'], note=False).read()
1797                             with io.open(encodeFilename(sub_filename), 'wb') as subfile:
1798                                 subfile.write(sub_data)
1799                         except (ExtractorError, IOError, OSError, ValueError) as err:
1800                             self.report_warning('Unable to download subtitle for "%s": %s' %
1801                                                 (sub_lang, error_to_compat_str(err)))
1802                             continue
1803
1804         if self.params.get('writeinfojson', False):
1805             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1806             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1807                 self.to_screen('[info] Video description metadata is already present')
1808             else:
1809                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1810                 try:
1811                     write_json_file(self.filter_requested_info(info_dict), infofn)
1812                 except (OSError, IOError):
1813                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1814                     return
1815
1816         self._write_thumbnails(info_dict, filename)
1817
1818         if not self.params.get('skip_download', False):
1819             try:
1820                 def dl(name, info):
1821                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1822                     for ph in self._progress_hooks:
1823                         fd.add_progress_hook(ph)
1824                     if self.params.get('verbose'):
1825                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1826                     return fd.download(name, info)
1827
1828                 if info_dict.get('requested_formats') is not None:
1829                     downloaded = []
1830                     success = True
1831                     merger = FFmpegMergerPP(self)
1832                     if not merger.available:
1833                         postprocessors = []
1834                         self.report_warning('You have requested multiple '
1835                                             'formats but ffmpeg or avconv are not installed.'
1836                                             ' The formats won\'t be merged.')
1837                     else:
1838                         postprocessors = [merger]
1839
1840                     def compatible_formats(formats):
1841                         video, audio = formats
1842                         # Check extension
1843                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1844                         if video_ext and audio_ext:
1845                             COMPATIBLE_EXTS = (
1846                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1847                                 ('webm')
1848                             )
1849                             for exts in COMPATIBLE_EXTS:
1850                                 if video_ext in exts and audio_ext in exts:
1851                                     return True
1852                         # TODO: Check acodec/vcodec
1853                         return False
1854
1855                     filename_real_ext = os.path.splitext(filename)[1][1:]
1856                     filename_wo_ext = (
1857                         os.path.splitext(filename)[0]
1858                         if filename_real_ext == info_dict['ext']
1859                         else filename)
1860                     requested_formats = info_dict['requested_formats']
1861                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1862                         info_dict['ext'] = 'mkv'
1863                         self.report_warning(
1864                             'Requested formats are incompatible for merge and will be merged into mkv.')
1865                     # Ensure filename always has a correct extension for successful merge
1866                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1867                     if os.path.exists(encodeFilename(filename)):
1868                         self.to_screen(
1869                             '[download] %s has already been downloaded and '
1870                             'merged' % filename)
1871                     else:
1872                         for f in requested_formats:
1873                             new_info = dict(info_dict)
1874                             new_info.update(f)
1875                             fname = prepend_extension(
1876                                 self.prepare_filename(new_info),
1877                                 'f%s' % f['format_id'], new_info['ext'])
1878                             if not ensure_dir_exists(fname):
1879                                 return
1880                             downloaded.append(fname)
1881                             partial_success = dl(fname, new_info)
1882                             success = success and partial_success
1883                         info_dict['__postprocessors'] = postprocessors
1884                         info_dict['__files_to_merge'] = downloaded
1885                 else:
1886                     # Just a single file
1887                     success = dl(filename, info_dict)
1888             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1889                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1890                 return
1891             except (OSError, IOError) as err:
1892                 raise UnavailableVideoError(err)
1893             except (ContentTooShortError, ) as err:
1894                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1895                 return
1896
1897             if success and filename != '-':
1898                 # Fixup content
1899                 fixup_policy = self.params.get('fixup')
1900                 if fixup_policy is None:
1901                     fixup_policy = 'detect_or_warn'
1902
1903                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1904
1905                 stretched_ratio = info_dict.get('stretched_ratio')
1906                 if stretched_ratio is not None and stretched_ratio != 1:
1907                     if fixup_policy == 'warn':
1908                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1909                             info_dict['id'], stretched_ratio))
1910                     elif fixup_policy == 'detect_or_warn':
1911                         stretched_pp = FFmpegFixupStretchedPP(self)
1912                         if stretched_pp.available:
1913                             info_dict.setdefault('__postprocessors', [])
1914                             info_dict['__postprocessors'].append(stretched_pp)
1915                         else:
1916                             self.report_warning(
1917                                 '%s: Non-uniform pixel ratio (%s). %s'
1918                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1919                     else:
1920                         assert fixup_policy in ('ignore', 'never')
1921
1922                 if (info_dict.get('requested_formats') is None and
1923                         info_dict.get('container') == 'm4a_dash'):
1924                     if fixup_policy == 'warn':
1925                         self.report_warning(
1926                             '%s: writing DASH m4a. '
1927                             'Only some players support this container.'
1928                             % info_dict['id'])
1929                     elif fixup_policy == 'detect_or_warn':
1930                         fixup_pp = FFmpegFixupM4aPP(self)
1931                         if fixup_pp.available:
1932                             info_dict.setdefault('__postprocessors', [])
1933                             info_dict['__postprocessors'].append(fixup_pp)
1934                         else:
1935                             self.report_warning(
1936                                 '%s: writing DASH m4a. '
1937                                 'Only some players support this container. %s'
1938                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1939                     else:
1940                         assert fixup_policy in ('ignore', 'never')
1941
1942                 if (info_dict.get('protocol') == 'm3u8_native' or
1943                         info_dict.get('protocol') == 'm3u8' and
1944                         self.params.get('hls_prefer_native')):
1945                     if fixup_policy == 'warn':
1946                         self.report_warning('%s: malformed AAC bitstream detected.' % (
1947                             info_dict['id']))
1948                     elif fixup_policy == 'detect_or_warn':
1949                         fixup_pp = FFmpegFixupM3u8PP(self)
1950                         if fixup_pp.available:
1951                             info_dict.setdefault('__postprocessors', [])
1952                             info_dict['__postprocessors'].append(fixup_pp)
1953                         else:
1954                             self.report_warning(
1955                                 '%s: malformed AAC bitstream detected. %s'
1956                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1957                     else:
1958                         assert fixup_policy in ('ignore', 'never')
1959
1960                 try:
1961                     self.post_process(filename, info_dict)
1962                 except (PostProcessingError) as err:
1963                     self.report_error('postprocessing: %s' % str(err))
1964                     return
1965                 self.record_download_archive(info_dict)
1966
1967     def download(self, url_list):
1968         """Download a given list of URLs."""
1969         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1970         if (len(url_list) > 1 and
1971                 outtmpl != '-' and
1972                 '%' not in outtmpl and
1973                 self.params.get('max_downloads') != 1):
1974             raise SameFileError(outtmpl)
1975
1976         for url in url_list:
1977             try:
1978                 # It also downloads the videos
1979                 res = self.extract_info(
1980                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1981             except UnavailableVideoError:
1982                 self.report_error('unable to download video')
1983             except MaxDownloadsReached:
1984                 self.to_screen('[info] Maximum number of downloaded files reached.')
1985                 raise
1986             else:
1987                 if self.params.get('dump_single_json', False):
1988                     self.to_stdout(json.dumps(res))
1989
1990         return self._download_retcode
1991
1992     def download_with_info_file(self, info_filename):
1993         with contextlib.closing(fileinput.FileInput(
1994                 [info_filename], mode='r',
1995                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1996             # FileInput doesn't have a read method, we can't call json.load
1997             info = self.filter_requested_info(json.loads('\n'.join(f)))
1998         try:
1999             self.process_ie_result(info, download=True)
2000         except DownloadError:
2001             webpage_url = info.get('webpage_url')
2002             if webpage_url is not None:
2003                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2004                 return self.download([webpage_url])
2005             else:
2006                 raise
2007         return self._download_retcode
2008
2009     @staticmethod
2010     def filter_requested_info(info_dict):
2011         return dict(
2012             (k, v) for k, v in info_dict.items()
2013             if k not in ['requested_formats', 'requested_subtitles'])
2014
2015     def post_process(self, filename, ie_info):
2016         """Run all the postprocessors on the given file."""
2017         info = dict(ie_info)
2018         info['filepath'] = filename
2019         pps_chain = []
2020         if ie_info.get('__postprocessors') is not None:
2021             pps_chain.extend(ie_info['__postprocessors'])
2022         pps_chain.extend(self._pps)
2023         for pp in pps_chain:
2024             files_to_delete = []
2025             try:
2026                 files_to_delete, info = pp.run(info)
2027             except PostProcessingError as e:
2028                 self.report_error(e.msg)
2029             if files_to_delete and not self.params.get('keepvideo', False):
2030                 for old_filename in files_to_delete:
2031                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2032                     try:
2033                         os.remove(encodeFilename(old_filename))
2034                     except (IOError, OSError):
2035                         self.report_warning('Unable to remove downloaded original file')
2036
2037     def _make_archive_id(self, info_dict):
2038         # Future-proof against any change in case
2039         # and backwards compatibility with prior versions
2040         extractor = info_dict.get('extractor_key')
2041         if extractor is None:
2042             if 'id' in info_dict:
2043                 extractor = info_dict.get('ie_key')  # key in a playlist
2044         if extractor is None:
2045             return None  # Incomplete video information
2046         return extractor.lower() + ' ' + info_dict['id']
2047
2048     def in_download_archive(self, info_dict):
2049         fn = self.params.get('download_archive')
2050         if fn is None:
2051             return False
2052
2053         vid_id = self._make_archive_id(info_dict)
2054         if vid_id is None:
2055             return False  # Incomplete video information
2056
2057         try:
2058             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
2059                 for line in archive_file:
2060                     if line.strip() == vid_id:
2061                         return True
2062         except IOError as ioe:
2063             if ioe.errno != errno.ENOENT:
2064                 raise
2065         return False
2066
2067     def record_download_archive(self, info_dict):
2068         fn = self.params.get('download_archive')
2069         if fn is None:
2070             return
2071         vid_id = self._make_archive_id(info_dict)
2072         assert vid_id
2073         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2074             archive_file.write(vid_id + '\n')
2075
2076     @staticmethod
2077     def format_resolution(format, default='unknown'):
2078         if format.get('vcodec') == 'none':
2079             return 'audio only'
2080         if format.get('resolution') is not None:
2081             return format['resolution']
2082         if format.get('height') is not None:
2083             if format.get('width') is not None:
2084                 res = '%sx%s' % (format['width'], format['height'])
2085             else:
2086                 res = '%sp' % format['height']
2087         elif format.get('width') is not None:
2088             res = '%dx?' % format['width']
2089         else:
2090             res = default
2091         return res
2092
    def _format_note(self, fdict):
        """Build the human-readable 'note' column for one format dict.

        Pieces are appended left to right and separators are only inserted
        when something has already been written, so the order of the checks
        below is significant.
        """
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None and
                fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            if fdict.get('vbr') is not None:
                # Bitrate follows the codec, e.g. 'avc1@1000k'
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # Video bitrate known but codec unknown
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            # '~' marks the size as an estimate
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
2148
2149     def list_formats(self, info_dict):
2150         formats = info_dict.get('formats', [info_dict])
2151         table = [
2152             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2153             for f in formats
2154             if f.get('preference') is None or f['preference'] >= -1000]
2155         if len(formats) > 1:
2156             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2157
2158         header_line = ['format code', 'extension', 'resolution', 'note']
2159         self.to_screen(
2160             '[info] Available formats for %s:\n%s' %
2161             (info_dict['id'], render_table(header_line, table)))
2162
2163     def list_thumbnails(self, info_dict):
2164         thumbnails = info_dict.get('thumbnails')
2165         if not thumbnails:
2166             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2167             return
2168
2169         self.to_screen(
2170             '[info] Thumbnails for %s:' % info_dict['id'])
2171         self.to_screen(render_table(
2172             ['ID', 'width', 'height', 'URL'],
2173             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2174
2175     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2176         if not subtitles:
2177             self.to_screen('%s has no %s' % (video_id, name))
2178             return
2179         self.to_screen(
2180             'Available %s for %s:' % (name, video_id))
2181         self.to_screen(render_table(
2182             ['Language', 'formats'],
2183             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2184                 for lang, formats in subtitles.items()]))
2185
2186     def urlopen(self, req):
2187         """ Start an HTTP download """
2188         if isinstance(req, compat_basestring):
2189             req = sanitized_Request(req)
2190         return self._opener.open(req, timeout=self._socket_timeout)
2191
    def print_debug_header(self):
        """Print the [debug] header (versions, encodings, proxies) when --verbose is set."""
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        # Best effort: report the git revision when running from a checkout
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            # sys.exc_clear only exists on Python 2; ignore if unavailable
            try:
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_versions['phantomjs'] = PhantomJSwrapper._version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Collect the effective proxy settings from every opener handler
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # Opt-in only: contacts yt-dl.org to report the public IP and
            # check for a newer release
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2259
    def _setup_opener(self):
        """Build the urllib opener (cookies, proxies, HTTPS, data:) used for all requests."""
        timeout_val = self.params.get('socket_timeout')
        # Default socket timeout: 600 seconds
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # In-memory cookies only
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            opts_cookiefile = expand_path(opts_cookiefile)
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Only load if the file already exists and is readable
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # --proxy "" explicitly disables any proxy
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2312
2313     def encode(self, s):
2314         if isinstance(s, bytes):
2315             return s  # Already encoded
2316
2317         try:
2318             return s.encode(self.get_encoding())
2319         except UnicodeEncodeError as err:
2320             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2321             raise
2322
2323     def get_encoding(self):
2324         encoding = self.params.get('encoding')
2325         if encoding is None:
2326             encoding = preferredencoding()
2327         return encoding
2328
2329     def _write_thumbnails(self, info_dict, filename):
2330         if self.params.get('writethumbnail', False):
2331             thumbnails = info_dict.get('thumbnails')
2332             if thumbnails:
2333                 thumbnails = [thumbnails[-1]]
2334         elif self.params.get('write_all_thumbnails', False):
2335             thumbnails = info_dict.get('thumbnails')
2336         else:
2337             return
2338
2339         if not thumbnails:
2340             # No thumbnails present, so return immediately
2341             return
2342
2343         for t in thumbnails:
2344             thumb_ext = determine_ext(t['url'], 'jpg')
2345             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2346             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2347             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2348
2349             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2350                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2351                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2352             else:
2353                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2354                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2355                 try:
2356                     uf = self.urlopen(t['url'])
2357                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2358                         shutil.copyfileobj(uf, thumbf)
2359                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2360                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2361                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2362                     self.report_warning('Unable to download thumbnail "%s": %s' %
2363                                         (t['url'], error_to_compat_str(err)))