68721e9ab81a3bbebac8846d52ed6bd7f92e05c9
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from string import ascii_letters
30
31 from .compat import (
32     compat_basestring,
33     compat_cookiejar,
34     compat_get_terminal_size,
35     compat_http_client,
36     compat_kwargs,
37     compat_numeric_types,
38     compat_os_name,
39     compat_str,
40     compat_tokenize_tokenize,
41     compat_urllib_error,
42     compat_urllib_request,
43     compat_urllib_request_DataHandler,
44 )
45 from .utils import (
46     age_restricted,
47     args_to_str,
48     ContentTooShortError,
49     date_from_str,
50     DateRange,
51     DEFAULT_OUTTMPL,
52     determine_ext,
53     determine_protocol,
54     DownloadError,
55     encode_compat_str,
56     encodeFilename,
57     error_to_compat_str,
58     expand_path,
59     ExtractorError,
60     format_bytes,
61     formatSeconds,
62     GeoRestrictedError,
63     int_or_none,
64     ISO3166Utils,
65     locked_file,
66     make_HTTPS_handler,
67     MaxDownloadsReached,
68     orderedSet,
69     PagedList,
70     parse_filesize,
71     PerRequestProxyHandler,
72     platform_name,
73     PostProcessingError,
74     preferredencoding,
75     prepend_extension,
76     register_socks_protocols,
77     render_table,
78     replace_extension,
79     SameFileError,
80     sanitize_filename,
81     sanitize_path,
82     sanitize_url,
83     sanitized_Request,
84     std_headers,
85     subtitles_filename,
86     UnavailableVideoError,
87     url_basename,
88     version_tuple,
89     write_json_file,
90     write_string,
91     YoutubeDLCookieProcessor,
92     YoutubeDLHandler,
93 )
94 from .cache import Cache
95 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
96 from .extractor.openload import PhantomJSwrapper
97 from .downloader import get_suitable_downloader
98 from .downloader.rtmp import rtmpdump_version
99 from .postprocessor import (
100     FFmpegFixupM3u8PP,
101     FFmpegFixupM4aPP,
102     FFmpegFixupStretchedPP,
103     FFmpegMergerPP,
104     FFmpegPostProcessor,
105     get_postprocessor,
106 )
107 from .version import __version__
108
109 if compat_os_name == 'nt':
110     import ctypes
111
112
113 class YoutubeDL(object):
114     """YoutubeDL class.
115
116     YoutubeDL objects are the ones responsible of downloading the
117     actual video file and writing it to disk if the user has requested
118     it, among some other tasks. In most cases there should be one per
119     program. As, given a video URL, the downloader doesn't know how to
120     extract all the needed information, task that InfoExtractors do, it
121     has to pass the URL to one of them.
122
123     For this, YoutubeDL objects have a method that allows
124     InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
126     finds that reports being able to handle it. The InfoExtractor extracts
127     all the information about the video or videos the URL refers to, and
128     YoutubeDL process the extracted information, possibly using a File
129     Downloader to download the video.
130
131     YoutubeDL objects accept a lot of parameters. In order not to saturate
132     the object constructor with arguments, it receives a dictionary of
133     options instead. These options are available through the params
134     attribute for the InfoExtractors to use. The YoutubeDL also
135     registers itself as the downloader in charge for the InfoExtractors
136     that are added to it, so this is a "mutual registration".
137
138     Available options:
139
140     username:          Username for authentication purposes.
141     password:          Password for authentication purposes.
142     videopassword:     Password for accessing a video.
143     ap_mso:            Adobe Pass multiple-system operator identifier.
144     ap_username:       Multiple-system operator account username.
145     ap_password:       Multiple-system operator account password.
146     usenetrc:          Use netrc for authentication instead.
147     verbose:           Print additional info to stdout.
148     quiet:             Do not print messages to stdout.
149     no_warnings:       Do not print out anything for warnings.
150     forceurl:          Force printing final URL.
151     forcetitle:        Force printing title.
152     forceid:           Force printing ID.
153     forcethumbnail:    Force printing thumbnail URL.
154     forcedescription:  Force printing description.
155     forcefilename:     Force printing final filename.
156     forceduration:     Force printing duration.
157     forcejson:         Force printing info_dict as JSON.
158     dump_single_json:  Force printing the info_dict of the whole playlist
159                        (or video) as a single JSON line.
160     simulate:          Do not download the video files.
161     format:            Video format code. See options.py for more information.
162     outtmpl:           Template for output names.
163     restrictfilenames: Do not allow "&" and spaces in file names
164     ignoreerrors:      Do not stop on download errors.
165     force_generic_extractor: Force downloader to use the generic extractor
166     nooverwrites:      Prevent overwriting files.
167     playliststart:     Playlist item to start at.
168     playlistend:       Playlist item to end at.
169     playlist_items:    Specific indices of playlist to download.
170     playlistreverse:   Download playlist items in reverse order.
171     playlistrandom:    Download playlist items in random order.
172     matchtitle:        Download only matching titles.
173     rejecttitle:       Reject downloads for matching titles.
174     logger:            Log messages to a logging.Logger instance.
175     logtostderr:       Log messages to stderr instead of stdout.
176     writedescription:  Write the video description to a .description file
177     writeinfojson:     Write the video description to a .info.json file
178     writeannotations:  Write the video annotations to a .annotations.xml file
179     writethumbnail:    Write the thumbnail image to a file
180     write_all_thumbnails:  Write all thumbnail formats to files
181     writesubtitles:    Write the video subtitles to a file
182     writeautomaticsub: Write the automatically generated subtitles to a file
183     allsubtitles:      Downloads all the subtitles of the video
184                        (requires writesubtitles or writeautomaticsub)
185     listsubtitles:     Lists all available subtitles for the video
186     subtitlesformat:   The format code for subtitles
187     subtitleslangs:    List of languages of the subtitles to download
188     keepvideo:         Keep the video file after post-processing
189     daterange:         A DateRange object, download only if the upload_date is in the range.
190     skip_download:     Skip the actual download of the video file
191     cachedir:          Location of the cache files in the filesystem.
192                        False to disable filesystem cache.
193     noplaylist:        Download single video instead of a playlist if in doubt.
194     age_limit:         An integer representing the user's age in years.
195                        Unsuitable videos for the given age are skipped.
196     min_views:         An integer representing the minimum view count the video
197                        must have in order to not be skipped.
198                        Videos without view count information are always
199                        downloaded. None for no limit.
200     max_views:         An integer representing the maximum view count.
201                        Videos that are more popular than that are not
202                        downloaded.
203                        Videos without view count information are always
204                        downloaded. None for no limit.
205     download_archive:  File name of a file where all downloads are recorded.
206                        Videos already present in the file are not downloaded
207                        again.
208     cookiefile:        File name where cookies should be read from and dumped to.
209     nocheckcertificate:Do not verify SSL certificates
210     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
211                        At the moment, this is only supported by YouTube.
212     proxy:             URL of the proxy server to use
213     geo_verification_proxy:  URL of the proxy to use for IP address verification
214                        on geo-restricted sites. (Experimental)
215     socket_timeout:    Time to wait for unresponsive hosts, in seconds
216     bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
218     debug_printtraffic:Print out sent and received HTTP traffic
219     include_ads:       Download ads as well
220     default_search:    Prepend this string if an input url is not valid.
221                        'auto' for elaborate guessing
222     encoding:          Use this encoding instead of the system-specified.
223     extract_flat:      Do not resolve URLs, return the immediate result.
224                        Pass in 'in_playlist' to only show this behavior for
225                        playlist items.
226     postprocessors:    A list of dictionaries, each with an entry
227                        * key:  The name of the postprocessor. See
228                                youtube_dl/postprocessor/__init__.py for a list.
229                        as well as any further keyword arguments for the
230                        postprocessor.
231     progress_hooks:    A list of functions that get called on download
232                        progress, with a dictionary with the entries
233                        * status: One of "downloading", "error", or "finished".
234                                  Check this first and ignore unknown values.
235
236                        If status is one of "downloading", or "finished", the
237                        following properties may also be present:
238                        * filename: The final filename (always present)
239                        * tmpfilename: The filename we're currently writing to
240                        * downloaded_bytes: Bytes on disk
241                        * total_bytes: Size of the whole file, None if unknown
242                        * total_bytes_estimate: Guess of the eventual file size,
243                                                None if unavailable.
244                        * elapsed: The number of seconds since download started.
245                        * eta: The estimated time in seconds, None if unknown
246                        * speed: The download speed in bytes/second, None if
247                                 unknown
248                        * fragment_index: The counter of the currently
249                                          downloaded video fragment.
250                        * fragment_count: The number of fragments (= individual
251                                          files that will be merged)
252
253                        Progress hooks are guaranteed to be called at least once
254                        (with status "finished") if the download is successful.
255     merge_output_format: Extension to use when merging formats.
256     fixup:             Automatically correct known faults of the file.
257                        One of:
258                        - "never": do nothing
259                        - "warn": only emit a warning
260                        - "detect_or_warn": check whether we can do anything
261                                            about it, warn otherwise (default)
262     source_address:    (Experimental) Client-side IP address to bind to.
263     call_home:         Boolean, true iff we are allowed to contact the
264                        youtube-dl servers for debugging.
265     sleep_interval:    Number of seconds to sleep before each download when
266                        used alone or a lower bound of a range for randomized
267                        sleep before each download (minimum possible number
268                        of seconds to sleep) when used along with
269                        max_sleep_interval.
270     max_sleep_interval:Upper bound of a range for randomized sleep before each
271                        download (maximum possible number of seconds to sleep).
272                        Must only be used along with sleep_interval.
273                        Actual sleep time will be a random float from range
274                        [sleep_interval; max_sleep_interval].
275     listformats:       Print an overview of available video formats and exit.
276     list_thumbnails:   Print a table of all thumbnails and exit.
277     match_filter:      A function that gets called with the info_dict of
278                        every video.
279                        If it returns a message, the video is ignored.
280                        If it returns None, the video is downloaded.
281                        match_filter_func in utils.py is one example for this.
282     no_color:          Do not emit color codes in output.
283     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
284                        HTTP header (experimental)
285     geo_bypass_country:
286                        Two-letter ISO 3166-2 country code that will be used for
287                        explicit geographic restriction bypassing via faking
288                        X-Forwarded-For HTTP header (experimental)
289
290     The following options determine which downloader is picked:
291     external_downloader: Executable of the external downloader to call.
292                        None or unset for standard (built-in) downloader.
293     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
294                        if True, otherwise use ffmpeg/avconv if False, otherwise
295                        use downloader suggested by extractor if None.
296
297     The following parameters are not used by YoutubeDL itself, they are used by
298     the downloader (see youtube_dl/downloader/common.py):
299     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
300     noresizebuffer, retries, continuedl, noprogress, consoletitle,
301     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
302
303     The following options are used by the post processors:
304     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
305                        otherwise prefer avconv.
306     postprocessor_args: A list of additional command-line arguments for the
307                         postprocessor.
308
309     The following options are used by the Youtube extractor:
310     youtube_include_dash_manifest: If True (default), DASH manifests and related
311                         data will be downloaded and processed by extractor.
312                         You can reduce network I/O by disabling it if you don't
313                         care about DASH.
314     """
315
    # Metadata fields that hold numeric values.  prepare_filename() uses this
    # set to rewrite integer/float conversions in the output template to '%s'
    # for fields that are missing, since the 'NA' placeholder is a string and
    # would otherwise break %-substitution.
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level defaults; __init__ assigns the real per-instance values.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
333
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params    -- dict of options (see the class docstring); None means {}.
        auto_init -- when True, print the debug header and register the
                     default info extractors immediately.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # 'logtostderr' selects stderr (index 1) instead of stdout (index 0)
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        def check_deprecated(param, option, suggestion):
            # Warn if a deprecated option is present; return True if it was.
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        # Fall back to the deprecated proxy option when the new one is unset.
        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            # Pipe screen output through an external bidi filter (bidiv, or
            # fribidi as a fallback) connected through a pseudo-terminal;
            # _bidi_workaround() reads the reordered text back from it.
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate and register the requested post-processors; every entry
        # except 'key' is forwarded to the post-processor constructor.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
427
428     def warn_if_short_id(self, argv):
429         # short YouTube ID starting with dash?
430         idxs = [
431             i for i, a in enumerate(argv)
432             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
433         if idxs:
434             correct_argv = (
435                 ['youtube-dl'] +
436                 [a for i, a in enumerate(argv) if i not in idxs] +
437                 ['--'] + [argv[i] for i in idxs]
438             )
439             self.report_warning(
440                 'Long argument string detected. '
441                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
442                 args_to_str(correct_argv))
443
444     def add_info_extractor(self, ie):
445         """Add an InfoExtractor object to the end of the list."""
446         self._ies.append(ie)
447         if not isinstance(ie, type):
448             self._ies_instances[ie.ie_key()] = ie
449             ie.set_downloader(self)
450
451     def get_info_extractor(self, ie_key):
452         """
453         Get an instance of an IE with name ie_key, it will try to get one from
454         the _ies list, if there's no instance it will create a new one and add
455         it to the extractor list.
456         """
457         ie = self._ies_instances.get(ie_key)
458         if ie is None:
459             ie = get_info_extractor(ie_key)()
460             self.add_info_extractor(ie)
461         return ie
462
463     def add_default_info_extractors(self):
464         """
465         Add the InfoExtractors returned by gen_extractors to the end of the list
466         """
467         for ie in gen_extractor_classes():
468             self.add_info_extractor(ie)
469
470     def add_post_processor(self, pp):
471         """Add a PostProcessor object to the end of the chain."""
472         self._pps.append(pp)
473         pp.set_downloader(self)
474
475     def add_progress_hook(self, ph):
476         """Add the progress hook (currently only for the file downloader)"""
477         self._progress_hooks.append(ph)
478
479     def _bidi_workaround(self, message):
480         if not hasattr(self, '_output_channel'):
481             return message
482
483         assert hasattr(self, '_output_process')
484         assert isinstance(message, compat_str)
485         line_count = message.count('\n') + 1
486         self._output_process.stdin.write((message + '\n').encode('utf-8'))
487         self._output_process.stdin.flush()
488         res = ''.join(self._output_channel.readline().decode('utf-8')
489                       for _ in range(line_count))
490         return res[:-len('\n')]
491
492     def to_screen(self, message, skip_eol=False):
493         """Print message to stdout if not in quiet mode."""
494         return self.to_stdout(message, skip_eol, check_quiet=True)
495
496     def _write_string(self, s, out=None):
497         write_string(s, out=out, encoding=self.params.get('encoding'))
498
499     def to_stdout(self, message, skip_eol=False, check_quiet=False):
500         """Print message to stdout if not in quiet mode."""
501         if self.params.get('logger'):
502             self.params['logger'].debug(message)
503         elif not check_quiet or not self.params.get('quiet', False):
504             message = self._bidi_workaround(message)
505             terminator = ['\n', ''][skip_eol]
506             output = message + terminator
507
508             self._write_string(output, self._screen_file)
509
510     def to_stderr(self, message):
511         """Print message to stderr."""
512         assert isinstance(message, compat_str)
513         if self.params.get('logger'):
514             self.params['logger'].error(message)
515         else:
516             message = self._bidi_workaround(message)
517             output = message + '\n'
518             self._write_string(output, self._err_file)
519
520     def to_console_title(self, message):
521         if not self.params.get('consoletitle', False):
522             return
523         if compat_os_name == 'nt':
524             if ctypes.windll.kernel32.GetConsoleWindow():
525                 # c_wchar_p() might not be necessary if `message` is
526                 # already of type unicode()
527                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
528         elif 'TERM' in os.environ:
529             self._write_string('\033]0;%s\007' % message, self._screen_file)
530
531     def save_console_title(self):
532         if not self.params.get('consoletitle', False):
533             return
534         if compat_os_name != 'nt' and 'TERM' in os.environ:
535             # Save the title on stack
536             self._write_string('\033[22;0t', self._screen_file)
537
538     def restore_console_title(self):
539         if not self.params.get('consoletitle', False):
540             return
541         if compat_os_name != 'nt' and 'TERM' in os.environ:
542             # Restore the title from stack
543             self._write_string('\033[23;0t', self._screen_file)
544
545     def __enter__(self):
546         self.save_console_title()
547         return self
548
549     def __exit__(self, *args):
550         self.restore_console_title()
551
552         if self.params.get('cookiefile') is not None:
553             self.cookiejar.save()
554
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # If the active exception carries its own exc_info tuple
                    # (as wrapped project errors may), include that original
                    # traceback first.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show the current call stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, preferring the wrapped exception info
            # when available so callers see the original cause.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # Errors are ignored: record failure in the process return code.
        self._download_retcode = 1
584
585     def report_warning(self, message):
586         '''
587         Print the message to stderr, it will be prefixed with 'WARNING:'
588         If stderr is a tty file the 'WARNING:' will be colored
589         '''
590         if self.params.get('logger') is not None:
591             self.params['logger'].warning(message)
592         else:
593             if self.params.get('no_warnings'):
594                 return
595             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
596                 _msg_header = '\033[0;33mWARNING:\033[0m'
597             else:
598                 _msg_header = 'WARNING:'
599             warning_message = '%s %s' % (_msg_header, message)
600             self.to_stderr(warning_message)
601
602     def report_error(self, message, tb=None):
603         '''
604         Do the same as trouble, but prefixes the message with 'ERROR:', colored
605         in red if stderr is a tty file.
606         '''
607         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
608             _msg_header = '\033[0;31mERROR:\033[0m'
609         else:
610             _msg_header = 'ERROR:'
611         error_message = '%s %s' % (_msg_header, message)
612         self.trouble(error_message, tb)
613
614     def report_file_already_downloaded(self, file_name):
615         """Report file has already been fully downloaded."""
616         try:
617             self.to_screen('[download] %s has already been downloaded' % file_name)
618         except UnicodeEncodeError:
619             self.to_screen('[download] The file has already been downloaded')
620
621     def prepare_filename(self, info_dict):
622         """Generate the output filename."""
623         try:
624             template_dict = dict(info_dict)
625
626             template_dict['epoch'] = int(time.time())
627             autonumber_size = self.params.get('autonumber_size')
628             if autonumber_size is None:
629                 autonumber_size = 5
630             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
631             if template_dict.get('resolution') is None:
632                 if template_dict.get('width') and template_dict.get('height'):
633                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
634                 elif template_dict.get('height'):
635                     template_dict['resolution'] = '%sp' % template_dict['height']
636                 elif template_dict.get('width'):
637                     template_dict['resolution'] = '%dx?' % template_dict['width']
638
639             sanitize = lambda k, v: sanitize_filename(
640                 compat_str(v),
641                 restricted=self.params.get('restrictfilenames'),
642                 is_id=(k == 'id' or k.endswith('_id')))
643             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
644                                  for k, v in template_dict.items()
645                                  if v is not None and not isinstance(v, (list, tuple, dict)))
646             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
647
648             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
649
650             # For fields playlist_index and autonumber convert all occurrences
651             # of %(field)s to %(field)0Nd for backward compatibility
652             field_size_compat_map = {
653                 'playlist_index': len(str(template_dict['n_entries'])),
654                 'autonumber': autonumber_size,
655             }
656             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
657             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
658             if mobj:
659                 outtmpl = re.sub(
660                     FIELD_SIZE_COMPAT_RE,
661                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
662                     outtmpl)
663
664             # Missing numeric fields used together with integer presentation types
665             # in format specification will break the argument substitution since
666             # string 'NA' is returned for missing fields. We will patch output
667             # template for missing fields to meet string presentation type.
668             for numeric_field in self._NUMERIC_FIELDS:
669                 if numeric_field not in template_dict:
670                     # As of [1] format syntax is:
671                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
672                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
673                     FORMAT_RE = r'''(?x)
674                         (?<!%)
675                         %
676                         \({0}\)  # mapping key
677                         (?:[#0\-+ ]+)?  # conversion flags (optional)
678                         (?:\d+)?  # minimum field width (optional)
679                         (?:\.\d+)?  # precision (optional)
680                         [hlL]?  # length modifier (optional)
681                         [diouxXeEfFgGcrs%]  # conversion type
682                     '''
683                     outtmpl = re.sub(
684                         FORMAT_RE.format(numeric_field),
685                         r'%({0})s'.format(numeric_field), outtmpl)
686
687             # expand_path translates '%%' into '%' and '$$' into '$'
688             # correspondingly that is not what we want since we need to keep
689             # '%%' intact for template dict substitution step. Working around
690             # with boundary-alike separator hack.
691             sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
692             outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
693
694             # outtmpl should be expand_path'ed before template dict substitution
695             # because meta fields may contain env variables we don't want to
696             # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
697             # title "Hello $PATH", we don't want `$PATH` to be expanded.
698             filename = expand_path(outtmpl).replace(sep, '') % template_dict
699
700             # Temporary fix for #4787
701             # 'Treat' all problem characters by passing filename through preferredencoding
702             # to workaround encoding issues with subprocess on python2 @ Windows
703             if sys.version_info < (3, 0) and sys.platform == 'win32':
704                 filename = encodeFilename(filename, True).decode(preferredencoding())
705             return sanitize_path(filename)
706         except ValueError as err:
707             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
708             return None
709
710     def _match_entry(self, info_dict, incomplete):
711         """ Returns None iff the file should be downloaded """
712
713         video_title = info_dict.get('title', info_dict.get('id', 'video'))
714         if 'title' in info_dict:
715             # This can happen when we're just evaluating the playlist
716             title = info_dict['title']
717             matchtitle = self.params.get('matchtitle', False)
718             if matchtitle:
719                 if not re.search(matchtitle, title, re.IGNORECASE):
720                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
721             rejecttitle = self.params.get('rejecttitle', False)
722             if rejecttitle:
723                 if re.search(rejecttitle, title, re.IGNORECASE):
724                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
725         date = info_dict.get('upload_date')
726         if date is not None:
727             dateRange = self.params.get('daterange', DateRange())
728             if date not in dateRange:
729                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
730         view_count = info_dict.get('view_count')
731         if view_count is not None:
732             min_views = self.params.get('min_views')
733             if min_views is not None and view_count < min_views:
734                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
735             max_views = self.params.get('max_views')
736             if max_views is not None and view_count > max_views:
737                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
738         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
739             return 'Skipping "%s" because it is age restricted' % video_title
740         if self.in_download_archive(info_dict):
741             return '%s has already been recorded in archive' % video_title
742
743         if not incomplete:
744             match_filter = self.params.get('match_filter')
745             if match_filter is not None:
746                 ret = match_filter(info_dict)
747                 if ret is not None:
748                     return ret
749
750         return None
751
752     @staticmethod
753     def add_extra_info(info_dict, extra_info):
754         '''Set the keys from extra_info in info dict if they are missing'''
755         for key, value in extra_info.items():
756             info_dict.setdefault(key, value)
757
758     def extract_info(self, url, download=True, ie_key=None, extra_info={},
759                      process=True, force_generic_extractor=False):
760         '''
761         Returns a list with a dictionary for each video we find.
762         If 'download', also downloads the videos.
763         extra_info is a dict containing the extra values to add to each result
764         '''
765
766         if not ie_key and force_generic_extractor:
767             ie_key = 'Generic'
768
769         if ie_key:
770             ies = [self.get_info_extractor(ie_key)]
771         else:
772             ies = self._ies
773
774         for ie in ies:
775             if not ie.suitable(url):
776                 continue
777
778             ie = self.get_info_extractor(ie.ie_key())
779             if not ie.working():
780                 self.report_warning('The program functionality for this site has been marked as broken, '
781                                     'and will probably not work.')
782
783             try:
784                 ie_result = ie.extract(url)
785                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
786                     break
787                 if isinstance(ie_result, list):
788                     # Backwards compatibility: old IE result format
789                     ie_result = {
790                         '_type': 'compat_list',
791                         'entries': ie_result,
792                     }
793                 self.add_default_extra_info(ie_result, ie, url)
794                 if process:
795                     return self.process_ie_result(ie_result, download, extra_info)
796                 else:
797                     return ie_result
798             except GeoRestrictedError as e:
799                 msg = e.msg
800                 if e.countries:
801                     msg += '\nThis video is available in %s.' % ', '.join(
802                         map(ISO3166Utils.short2full, e.countries))
803                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
804                 self.report_error(msg)
805                 break
806             except ExtractorError as e:  # An error we somewhat expected
807                 self.report_error(compat_str(e), e.format_traceback())
808                 break
809             except MaxDownloadsReached:
810                 raise
811             except Exception as e:
812                 if self.params.get('ignoreerrors', False):
813                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
814                     break
815                 else:
816                     raise
817         else:
818             self.report_error('no suitable InfoExtractor for URL %s' % url)
819
820     def add_default_extra_info(self, ie_result, ie, url):
821         self.add_extra_info(ie_result, {
822             'extractor': ie.IE_NAME,
823             'webpage_url': url,
824             'webpage_url_basename': url_basename(url),
825             'extractor_key': ie.ie_key(),
826         })
827
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        # NOTE(review): extra_info={} is a shared mutable default; in the
        # visible code it is only read, never mutated -- confirm before
        # relying on that.
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            # With --flat-playlist ('in_playlist') URL entries that belong to
            # a playlist are returned as-is without re-extraction;
            # extract_flat is True forces that for any URL result.
            extract_flat = self.params.get('extract_flat', False)
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # extract_info may return None when ignoreerrors is enabled and
            # extraction failed with an error, don't crash and return early
            # in this case
            if not info:
                return info

            # Non-None fields of the embedding result override the inner
            # extraction, except identity fields which must come from the
            # inner result.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Extracted info may not be a video result (i.e.
            # info.get('_type', 'video') != video) but rather an url or
            # url_transparent. In such cases outer metadata (from ie_result)
            # should be propagated to inner one (info). For this to happen
            # _type of info should be overridden with url_transparent. This
            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
            if new_result.get('_type') == 'url':
                new_result['_type'] = 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type in ('playlist', 'multi_video'):
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # --playlist-start is 1-based on the command line; 0-based here
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                def iter_playlistitems(format):
                    # Expand a spec such as '1,3,5-7' into individual indices
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = orderedSet(iter_playlistitems(playlistitems_str))

            ie_entries = ie_result['entries']

            def make_playlistitems_entries(list_ie_entries):
                # playlistitems indices are 1-based; out-of-range ones are
                # silently dropped
                num_entries = len(list_ie_entries)
                return [
                    list_ie_entries[i - 1] for i in playlistitems
                    if -num_entries <= i - 1 < num_entries]

            def report_download(num_entries):
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, num_entries))

            # Entries may come as a plain list, a lazily-evaluated PagedList,
            # or any other iterable (e.g. a generator); slice accordingly.
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = make_playlistitems_entries(ie_entries)
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                report_download(n_entries)
            else:  # iterable
                if playlistitems:
                    # Materialize only as much of the iterable as the highest
                    # requested item requires
                    entries = make_playlistitems_entries(list(itertools.islice(
                        ie_entries, 0, max(playlistitems))))
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                report_download(n_entries)

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            if self.params.get('playlistrandom', False):
                random.shuffle(entries)

            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # This __x_forwarded_for_ip thing is a bit ugly but requires
                # minimal changes
                if x_forwarded_for:
                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    # 1-based position in the original playlist when sliced
                    # via playliststart (with playlist_items this offset is 0)
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: only playlist-level metadata is available
                # here, so title-independent filters may be re-checked later
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                # Attach playlist-level provenance to each legacy entry
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
1020
1021     def _build_format_filter(self, filter_spec):
1022         " Returns a function to filter the formats according to the filter_spec "
1023
1024         OPERATORS = {
1025             '<': operator.lt,
1026             '<=': operator.le,
1027             '>': operator.gt,
1028             '>=': operator.ge,
1029             '=': operator.eq,
1030             '!=': operator.ne,
1031         }
1032         operator_rex = re.compile(r'''(?x)\s*
1033             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
1034             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1035             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1036             $
1037             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1038         m = operator_rex.search(filter_spec)
1039         if m:
1040             try:
1041                 comparison_value = int(m.group('value'))
1042             except ValueError:
1043                 comparison_value = parse_filesize(m.group('value'))
1044                 if comparison_value is None:
1045                     comparison_value = parse_filesize(m.group('value') + 'B')
1046                 if comparison_value is None:
1047                     raise ValueError(
1048                         'Invalid value %r in format specification %r' % (
1049                             m.group('value'), filter_spec))
1050             op = OPERATORS[m.group('op')]
1051
1052         if not m:
1053             STR_OPERATORS = {
1054                 '=': operator.eq,
1055                 '!=': operator.ne,
1056                 '^=': lambda attr, value: attr.startswith(value),
1057                 '$=': lambda attr, value: attr.endswith(value),
1058                 '*=': lambda attr, value: value in attr,
1059             }
1060             str_operator_rex = re.compile(r'''(?x)
1061                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1062                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1063                 \s*(?P<value>[a-zA-Z0-9._-]+)
1064                 \s*$
1065                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1066             m = str_operator_rex.search(filter_spec)
1067             if m:
1068                 comparison_value = m.group('value')
1069                 op = STR_OPERATORS[m.group('op')]
1070
1071         if not m:
1072             raise ValueError('Invalid filter specification %r' % filter_spec)
1073
1074         def _filter(f):
1075             actual_value = f.get(m.group('key'))
1076             if actual_value is None:
1077                 return m.group('none_inclusive')
1078             return op(actual_value, comparison_value)
1079         return _filter
1080
1081     def _default_format_spec(self, info_dict, download=True):
1082
1083         def can_merge():
1084             merger = FFmpegMergerPP(self)
1085             return merger.available and merger.can_merge()
1086
1087         def prefer_best():
1088             if self.params.get('simulate', False):
1089                 return False
1090             if not download:
1091                 return False
1092             if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
1093                 return True
1094             if info_dict.get('is_live'):
1095                 return True
1096             if not can_merge():
1097                 return True
1098             return False
1099
1100         req_format_list = ['bestvideo+bestaudio', 'best']
1101         if prefer_best():
1102             req_format_list.reverse()
1103         return '/'.join(req_format_list)
1104
1105     def build_format_selector(self, format_spec):
1106         def syntax_error(note, start):
1107             message = (
1108                 'Invalid format specification: '
1109                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1110             return SyntaxError(message)
1111
1112         PICKFIRST = 'PICKFIRST'
1113         MERGE = 'MERGE'
1114         SINGLE = 'SINGLE'
1115         GROUP = 'GROUP'
1116         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1117
1118         def _parse_filter(tokens):
1119             filter_parts = []
1120             for type, string, start, _, _ in tokens:
1121                 if type == tokenize.OP and string == ']':
1122                     return ''.join(filter_parts)
1123                 else:
1124                     filter_parts.append(string)
1125
1126         def _remove_unused_ops(tokens):
1127             # Remove operators that we don't use and join them with the surrounding strings
1128             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1129             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1130             last_string, last_start, last_end, last_line = None, None, None, None
1131             for type, string, start, end, line in tokens:
1132                 if type == tokenize.OP and string == '[':
1133                     if last_string:
1134                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1135                         last_string = None
1136                     yield type, string, start, end, line
1137                     # everything inside brackets will be handled by _parse_filter
1138                     for type, string, start, end, line in tokens:
1139                         yield type, string, start, end, line
1140                         if type == tokenize.OP and string == ']':
1141                             break
1142                 elif type == tokenize.OP and string in ALLOWED_OPS:
1143                     if last_string:
1144                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1145                         last_string = None
1146                     yield type, string, start, end, line
1147                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1148                     if not last_string:
1149                         last_string = string
1150                         last_start = start
1151                         last_end = end
1152                     else:
1153                         last_string += string
1154             if last_string:
1155                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1156
1157         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1158             selectors = []
1159             current_selector = None
1160             for type, string, start, _, _ in tokens:
1161                 # ENCODING is only defined in python 3.x
1162                 if type == getattr(tokenize, 'ENCODING', None):
1163                     continue
1164                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1165                     current_selector = FormatSelector(SINGLE, string, [])
1166                 elif type == tokenize.OP:
1167                     if string == ')':
1168                         if not inside_group:
1169                             # ')' will be handled by the parentheses group
1170                             tokens.restore_last_token()
1171                         break
1172                     elif inside_merge and string in ['/', ',']:
1173                         tokens.restore_last_token()
1174                         break
1175                     elif inside_choice and string == ',':
1176                         tokens.restore_last_token()
1177                         break
1178                     elif string == ',':
1179                         if not current_selector:
1180                             raise syntax_error('"," must follow a format selector', start)
1181                         selectors.append(current_selector)
1182                         current_selector = None
1183                     elif string == '/':
1184                         if not current_selector:
1185                             raise syntax_error('"/" must follow a format selector', start)
1186                         first_choice = current_selector
1187                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1188                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1189                     elif string == '[':
1190                         if not current_selector:
1191                             current_selector = FormatSelector(SINGLE, 'best', [])
1192                         format_filter = _parse_filter(tokens)
1193                         current_selector.filters.append(format_filter)
1194                     elif string == '(':
1195                         if current_selector:
1196                             raise syntax_error('Unexpected "("', start)
1197                         group = _parse_format_selection(tokens, inside_group=True)
1198                         current_selector = FormatSelector(GROUP, group, [])
1199                     elif string == '+':
1200                         video_selector = current_selector
1201                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1202                         if not video_selector or not audio_selector:
1203                             raise syntax_error('"+" must be between two format selectors', start)
1204                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1205                     else:
1206                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1207                 elif type == tokenize.ENDMARKER:
1208                     break
1209             if current_selector:
1210                 selectors.append(current_selector)
1211             return selectors
1212
1213         def _build_selector_function(selector):
1214             if isinstance(selector, list):
1215                 fs = [_build_selector_function(s) for s in selector]
1216
1217                 def selector_function(ctx):
1218                     for f in fs:
1219                         for format in f(ctx):
1220                             yield format
1221                 return selector_function
1222             elif selector.type == GROUP:
1223                 selector_function = _build_selector_function(selector.selector)
1224             elif selector.type == PICKFIRST:
1225                 fs = [_build_selector_function(s) for s in selector.selector]
1226
1227                 def selector_function(ctx):
1228                     for f in fs:
1229                         picked_formats = list(f(ctx))
1230                         if picked_formats:
1231                             return picked_formats
1232                     return []
1233             elif selector.type == SINGLE:
1234                 format_spec = selector.selector
1235
1236                 def selector_function(ctx):
1237                     formats = list(ctx['formats'])
1238                     if not formats:
1239                         return
1240                     if format_spec == 'all':
1241                         for f in formats:
1242                             yield f
1243                     elif format_spec in ['best', 'worst', None]:
1244                         format_idx = 0 if format_spec == 'worst' else -1
1245                         audiovideo_formats = [
1246                             f for f in formats
1247                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1248                         if audiovideo_formats:
1249                             yield audiovideo_formats[format_idx]
1250                         # for extractors with incomplete formats (audio only (soundcloud)
1251                         # or video only (imgur)) we will fallback to best/worst
1252                         # {video,audio}-only format
1253                         elif ctx['incomplete_formats']:
1254                             yield formats[format_idx]
1255                     elif format_spec == 'bestaudio':
1256                         audio_formats = [
1257                             f for f in formats
1258                             if f.get('vcodec') == 'none']
1259                         if audio_formats:
1260                             yield audio_formats[-1]
1261                     elif format_spec == 'worstaudio':
1262                         audio_formats = [
1263                             f for f in formats
1264                             if f.get('vcodec') == 'none']
1265                         if audio_formats:
1266                             yield audio_formats[0]
1267                     elif format_spec == 'bestvideo':
1268                         video_formats = [
1269                             f for f in formats
1270                             if f.get('acodec') == 'none']
1271                         if video_formats:
1272                             yield video_formats[-1]
1273                     elif format_spec == 'worstvideo':
1274                         video_formats = [
1275                             f for f in formats
1276                             if f.get('acodec') == 'none']
1277                         if video_formats:
1278                             yield video_formats[0]
1279                     else:
1280                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1281                         if format_spec in extensions:
1282                             filter_f = lambda f: f['ext'] == format_spec
1283                         else:
1284                             filter_f = lambda f: f['format_id'] == format_spec
1285                         matches = list(filter(filter_f, formats))
1286                         if matches:
1287                             yield matches[-1]
1288             elif selector.type == MERGE:
1289                 def _merge(formats_info):
1290                     format_1, format_2 = [f['format_id'] for f in formats_info]
1291                     # The first format must contain the video and the
1292                     # second the audio
1293                     if formats_info[0].get('vcodec') == 'none':
1294                         self.report_error('The first format must '
1295                                           'contain the video, try using '
1296                                           '"-f %s+%s"' % (format_2, format_1))
1297                         return
1298                     # Formats must be opposite (video+audio)
1299                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1300                         self.report_error(
1301                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1302                             % (format_1, format_2))
1303                         return
1304                     output_ext = (
1305                         formats_info[0]['ext']
1306                         if self.params.get('merge_output_format') is None
1307                         else self.params['merge_output_format'])
1308                     return {
1309                         'requested_formats': formats_info,
1310                         'format': '%s+%s' % (formats_info[0].get('format'),
1311                                              formats_info[1].get('format')),
1312                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1313                                                 formats_info[1].get('format_id')),
1314                         'width': formats_info[0].get('width'),
1315                         'height': formats_info[0].get('height'),
1316                         'resolution': formats_info[0].get('resolution'),
1317                         'fps': formats_info[0].get('fps'),
1318                         'vcodec': formats_info[0].get('vcodec'),
1319                         'vbr': formats_info[0].get('vbr'),
1320                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1321                         'acodec': formats_info[1].get('acodec'),
1322                         'abr': formats_info[1].get('abr'),
1323                         'ext': output_ext,
1324                     }
1325                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1326
1327                 def selector_function(ctx):
1328                     for pair in itertools.product(
1329                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1330                         yield _merge(pair)
1331
1332             filters = [self._build_format_filter(f) for f in selector.filters]
1333
1334             def final_selector(ctx):
1335                 ctx_copy = copy.deepcopy(ctx)
1336                 for _filter in filters:
1337                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1338                 return selector_function(ctx_copy)
1339             return final_selector
1340
1341         stream = io.BytesIO(format_spec.encode('utf-8'))
1342         try:
1343             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1344         except tokenize.TokenError:
1345             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1346
1347         class TokenIterator(object):
1348             def __init__(self, tokens):
1349                 self.tokens = tokens
1350                 self.counter = 0
1351
1352             def __iter__(self):
1353                 return self
1354
1355             def __next__(self):
1356                 if self.counter >= len(self.tokens):
1357                     raise StopIteration()
1358                 value = self.tokens[self.counter]
1359                 self.counter += 1
1360                 return value
1361
1362             next = __next__
1363
1364             def restore_last_token(self):
1365                 self.counter -= 1
1366
1367         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1368         return _build_selector_function(parsed_selector)
1369
1370     def _calc_headers(self, info_dict):
1371         res = std_headers.copy()
1372
1373         add_headers = info_dict.get('http_headers')
1374         if add_headers:
1375             res.update(add_headers)
1376
1377         cookies = self._calc_cookies(info_dict)
1378         if cookies:
1379             res['Cookie'] = cookies
1380
1381         if 'X-Forwarded-For' not in res:
1382             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1383             if x_forwarded_for_ip:
1384                 res['X-Forwarded-For'] = x_forwarded_for_ip
1385
1386         return res
1387
1388     def _calc_cookies(self, info_dict):
1389         pr = sanitized_Request(info_dict['url'])
1390         self.cookiejar.add_cookie_header(pr)
1391         return pr.get_header('Cookie')
1392
    def process_video_result(self, info_dict, download=True):
        """Sanitize a single extracted video result and select its format(s).

        Validates and normalizes extractor output (ids, numeric fields,
        thumbnails, subtitles, formats), resolves the requested format spec,
        and — when download is True — hands each selected format to
        process_info. Returns the updated info_dict, or None when a pure
        listing mode (list_thumbnails / listsubtitles / listformats) was
        requested. Raises ExtractorError on missing mandatory fields or when
        no format matches.
        """
        assert info_dict.get('_type', 'video') == 'video'

        # 'id' and 'title' are mandatory in every extractor result
        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        def report_force_conversion(field, field_not, conversion):
            # Warn that an extractor emitted a mistyped field which we coerce
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a non-string field to compat_str (no-op for None/str)
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce every declared numeric field to int (None if unparsable)
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Promote a lone 'thumbnail' into the 'thumbnails' list form
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Sort worst-to-best: explicit preference first, then width/height,
            # then id/url as deterministic tiebreakers (missing values sort first)
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '', t.get('url')))
            for i, t in enumerate(thumbnails):
                t['url'] = sanitize_url(t['url'])
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # thumbnails are sorted worst-to-best, so the last one is the best
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
            # see http://bugs.python.org/issue1646728)
            try:
                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
            except (ValueError, OverflowError, OSError):
                pass

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Normalize subtitle entries: sanitize URLs and fill in missing exts
        subtitles = info_dict.get('subtitles')
        if subtitles:
            for _, subtitle in subtitles.items():
                for subtitle_format in subtitle:
                    if subtitle_format.get('url'):
                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                    if subtitle_format.get('ext') is None:
                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles,
            info_dict.get('automatic_captions'))

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        def is_wellformed(f):
            # A format without a usable 'url' cannot be downloaded; drop it
            url = f.get('url')
            if not url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
                return False
            if isinstance(url, bytes):
                sanitize_string_field(f, 'url')
            return True

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats))

        # Maps format_id -> list of formats sharing it, to detect duplicates
        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if not format.get('format_id'):
                # Fall back to the format's index as its id
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                # Human-readable format description, e.g. "137 - 1920x1080 (DASH video)"
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' field if the original info_dict lists
            # formats; otherwise we end up with a circular reference, the first
            # (and unique) element in the 'formats' field in info_dict is
            # info_dict itself, which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            req_format = self._default_format_spec(info_dict, download=download)
            if self.params.get('verbose'):
                self.to_stdout('[debug] Default format spec: %s' % req_format)

        format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/rg3/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/rg3/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
            # all formats are audio-only
            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1624
1625     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1626         """Select the requested subtitles and their format"""
1627         available_subs = {}
1628         if normal_subtitles and self.params.get('writesubtitles'):
1629             available_subs.update(normal_subtitles)
1630         if automatic_captions and self.params.get('writeautomaticsub'):
1631             for lang, cap_info in automatic_captions.items():
1632                 if lang not in available_subs:
1633                     available_subs[lang] = cap_info
1634
1635         if (not self.params.get('writesubtitles') and not
1636                 self.params.get('writeautomaticsub') or not
1637                 available_subs):
1638             return None
1639
1640         if self.params.get('allsubtitles', False):
1641             requested_langs = available_subs.keys()
1642         else:
1643             if self.params.get('subtitleslangs', False):
1644                 requested_langs = self.params.get('subtitleslangs')
1645             elif 'en' in available_subs:
1646                 requested_langs = ['en']
1647             else:
1648                 requested_langs = [list(available_subs.keys())[0]]
1649
1650         formats_query = self.params.get('subtitlesformat', 'best')
1651         formats_preference = formats_query.split('/') if formats_query else []
1652         subs = {}
1653         for lang in requested_langs:
1654             formats = available_subs.get(lang)
1655             if formats is None:
1656                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1657                 continue
1658             for ext in formats_preference:
1659                 if ext == 'best':
1660                     f = formats[-1]
1661                     break
1662                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1663                 if matches:
1664                     f = matches[-1]
1665                     break
1666             else:
1667                 f = formats[-1]
1668                 self.report_warning(
1669                     'No subtitle format found matching "%s" for language %s, '
1670                     'using %s' % (formats_query, lang, f['ext']))
1671             subs[lang] = f
1672         return subs
1673
1674     def process_info(self, info_dict):
1675         """Process a single resolved IE result."""
1676
1677         assert info_dict.get('_type', 'video') == 'video'
1678
1679         max_downloads = self.params.get('max_downloads')
1680         if max_downloads is not None:
1681             if self._num_downloads >= int(max_downloads):
1682                 raise MaxDownloadsReached()
1683
1684         info_dict['fulltitle'] = info_dict['title']
1685         if len(info_dict['title']) > 200:
1686             info_dict['title'] = info_dict['title'][:197] + '...'
1687
1688         if 'format' not in info_dict:
1689             info_dict['format'] = info_dict['ext']
1690
1691         reason = self._match_entry(info_dict, incomplete=False)
1692         if reason is not None:
1693             self.to_screen('[download] ' + reason)
1694             return
1695
1696         self._num_downloads += 1
1697
1698         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1699
1700         # Forced printings
1701         if self.params.get('forcetitle', False):
1702             self.to_stdout(info_dict['fulltitle'])
1703         if self.params.get('forceid', False):
1704             self.to_stdout(info_dict['id'])
1705         if self.params.get('forceurl', False):
1706             if info_dict.get('requested_formats') is not None:
1707                 for f in info_dict['requested_formats']:
1708                     self.to_stdout(f['url'] + f.get('play_path', ''))
1709             else:
1710                 # For RTMP URLs, also include the playpath
1711                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1712         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1713             self.to_stdout(info_dict['thumbnail'])
1714         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1715             self.to_stdout(info_dict['description'])
1716         if self.params.get('forcefilename', False) and filename is not None:
1717             self.to_stdout(filename)
1718         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1719             self.to_stdout(formatSeconds(info_dict['duration']))
1720         if self.params.get('forceformat', False):
1721             self.to_stdout(info_dict['format'])
1722         if self.params.get('forcejson', False):
1723             self.to_stdout(json.dumps(info_dict))
1724
1725         # Do nothing else if in simulate mode
1726         if self.params.get('simulate', False):
1727             return
1728
1729         if filename is None:
1730             return
1731
1732         def ensure_dir_exists(path):
1733             try:
1734                 dn = os.path.dirname(path)
1735                 if dn and not os.path.exists(dn):
1736                     os.makedirs(dn)
1737                 return True
1738             except (OSError, IOError) as err:
1739                 self.report_error('unable to create directory ' + error_to_compat_str(err))
1740                 return False
1741
1742         if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
1743             return
1744
1745         if self.params.get('writedescription', False):
1746             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1747             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1748                 self.to_screen('[info] Video description is already present')
1749             elif info_dict.get('description') is None:
1750                 self.report_warning('There\'s no description to write.')
1751             else:
1752                 try:
1753                     self.to_screen('[info] Writing video description to: ' + descfn)
1754                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1755                         descfile.write(info_dict['description'])
1756                 except (OSError, IOError):
1757                     self.report_error('Cannot write description file ' + descfn)
1758                     return
1759
1760         if self.params.get('writeannotations', False):
1761             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1762             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1763                 self.to_screen('[info] Video annotations are already present')
1764             else:
1765                 try:
1766                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1767                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1768                         annofile.write(info_dict['annotations'])
1769                 except (KeyError, TypeError):
1770                     self.report_warning('There are no annotations to write.')
1771                 except (OSError, IOError):
1772                     self.report_error('Cannot write annotations file: ' + annofn)
1773                     return
1774
1775         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1776                                        self.params.get('writeautomaticsub')])
1777
1778         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1779             # subtitles download errors are already managed as troubles in relevant IE
1780             # that way it will silently go on when used with unsupporting IE
1781             subtitles = info_dict['requested_subtitles']
1782             ie = self.get_info_extractor(info_dict['extractor_key'])
1783             for sub_lang, sub_info in subtitles.items():
1784                 sub_format = sub_info['ext']
1785                 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1786                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1787                     self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
1788                 else:
1789                     self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1790                     if sub_info.get('data') is not None:
1791                         try:
1792                             # Use newline='' to prevent conversion of newline characters
1793                             # See https://github.com/rg3/youtube-dl/issues/10268
1794                             with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1795                                 subfile.write(sub_info['data'])
1796                         except (OSError, IOError):
1797                             self.report_error('Cannot write subtitles file ' + sub_filename)
1798                             return
1799                     else:
1800                         try:
1801                             sub_data = ie._request_webpage(
1802                                 sub_info['url'], info_dict['id'], note=False).read()
1803                             with io.open(encodeFilename(sub_filename), 'wb') as subfile:
1804                                 subfile.write(sub_data)
1805                         except (ExtractorError, IOError, OSError, ValueError) as err:
1806                             self.report_warning('Unable to download subtitle for "%s": %s' %
1807                                                 (sub_lang, error_to_compat_str(err)))
1808                             continue
1809
1810         if self.params.get('writeinfojson', False):
1811             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1812             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1813                 self.to_screen('[info] Video description metadata is already present')
1814             else:
1815                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1816                 try:
1817                     write_json_file(self.filter_requested_info(info_dict), infofn)
1818                 except (OSError, IOError):
1819                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1820                     return
1821
1822         self._write_thumbnails(info_dict, filename)
1823
1824         if not self.params.get('skip_download', False):
1825             try:
1826                 def dl(name, info):
1827                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1828                     for ph in self._progress_hooks:
1829                         fd.add_progress_hook(ph)
1830                     if self.params.get('verbose'):
1831                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1832                     return fd.download(name, info)
1833
1834                 if info_dict.get('requested_formats') is not None:
1835                     downloaded = []
1836                     success = True
1837                     merger = FFmpegMergerPP(self)
1838                     if not merger.available:
1839                         postprocessors = []
1840                         self.report_warning('You have requested multiple '
1841                                             'formats but ffmpeg or avconv are not installed.'
1842                                             ' The formats won\'t be merged.')
1843                     else:
1844                         postprocessors = [merger]
1845
1846                     def compatible_formats(formats):
1847                         video, audio = formats
1848                         # Check extension
1849                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1850                         if video_ext and audio_ext:
1851                             COMPATIBLE_EXTS = (
1852                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1853                                 ('webm')
1854                             )
1855                             for exts in COMPATIBLE_EXTS:
1856                                 if video_ext in exts and audio_ext in exts:
1857                                     return True
1858                         # TODO: Check acodec/vcodec
1859                         return False
1860
1861                     filename_real_ext = os.path.splitext(filename)[1][1:]
1862                     filename_wo_ext = (
1863                         os.path.splitext(filename)[0]
1864                         if filename_real_ext == info_dict['ext']
1865                         else filename)
1866                     requested_formats = info_dict['requested_formats']
1867                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1868                         info_dict['ext'] = 'mkv'
1869                         self.report_warning(
1870                             'Requested formats are incompatible for merge and will be merged into mkv.')
1871                     # Ensure filename always has a correct extension for successful merge
1872                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1873                     if os.path.exists(encodeFilename(filename)):
1874                         self.to_screen(
1875                             '[download] %s has already been downloaded and '
1876                             'merged' % filename)
1877                     else:
1878                         for f in requested_formats:
1879                             new_info = dict(info_dict)
1880                             new_info.update(f)
1881                             fname = prepend_extension(
1882                                 self.prepare_filename(new_info),
1883                                 'f%s' % f['format_id'], new_info['ext'])
1884                             if not ensure_dir_exists(fname):
1885                                 return
1886                             downloaded.append(fname)
1887                             partial_success = dl(fname, new_info)
1888                             success = success and partial_success
1889                         info_dict['__postprocessors'] = postprocessors
1890                         info_dict['__files_to_merge'] = downloaded
1891                 else:
1892                     # Just a single file
1893                     success = dl(filename, info_dict)
1894             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1895                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1896                 return
1897             except (OSError, IOError) as err:
1898                 raise UnavailableVideoError(err)
1899             except (ContentTooShortError, ) as err:
1900                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1901                 return
1902
1903             if success and filename != '-':
1904                 # Fixup content
1905                 fixup_policy = self.params.get('fixup')
1906                 if fixup_policy is None:
1907                     fixup_policy = 'detect_or_warn'
1908
1909                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1910
1911                 stretched_ratio = info_dict.get('stretched_ratio')
1912                 if stretched_ratio is not None and stretched_ratio != 1:
1913                     if fixup_policy == 'warn':
1914                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1915                             info_dict['id'], stretched_ratio))
1916                     elif fixup_policy == 'detect_or_warn':
1917                         stretched_pp = FFmpegFixupStretchedPP(self)
1918                         if stretched_pp.available:
1919                             info_dict.setdefault('__postprocessors', [])
1920                             info_dict['__postprocessors'].append(stretched_pp)
1921                         else:
1922                             self.report_warning(
1923                                 '%s: Non-uniform pixel ratio (%s). %s'
1924                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1925                     else:
1926                         assert fixup_policy in ('ignore', 'never')
1927
1928                 if (info_dict.get('requested_formats') is None and
1929                         info_dict.get('container') == 'm4a_dash'):
1930                     if fixup_policy == 'warn':
1931                         self.report_warning(
1932                             '%s: writing DASH m4a. '
1933                             'Only some players support this container.'
1934                             % info_dict['id'])
1935                     elif fixup_policy == 'detect_or_warn':
1936                         fixup_pp = FFmpegFixupM4aPP(self)
1937                         if fixup_pp.available:
1938                             info_dict.setdefault('__postprocessors', [])
1939                             info_dict['__postprocessors'].append(fixup_pp)
1940                         else:
1941                             self.report_warning(
1942                                 '%s: writing DASH m4a. '
1943                                 'Only some players support this container. %s'
1944                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1945                     else:
1946                         assert fixup_policy in ('ignore', 'never')
1947
1948                 if (info_dict.get('protocol') == 'm3u8_native' or
1949                         info_dict.get('protocol') == 'm3u8' and
1950                         self.params.get('hls_prefer_native')):
1951                     if fixup_policy == 'warn':
1952                         self.report_warning('%s: malformed AAC bitstream detected.' % (
1953                             info_dict['id']))
1954                     elif fixup_policy == 'detect_or_warn':
1955                         fixup_pp = FFmpegFixupM3u8PP(self)
1956                         if fixup_pp.available:
1957                             info_dict.setdefault('__postprocessors', [])
1958                             info_dict['__postprocessors'].append(fixup_pp)
1959                         else:
1960                             self.report_warning(
1961                                 '%s: malformed AAC bitstream detected. %s'
1962                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1963                     else:
1964                         assert fixup_policy in ('ignore', 'never')
1965
1966                 try:
1967                     self.post_process(filename, info_dict)
1968                 except (PostProcessingError) as err:
1969                     self.report_error('postprocessing: %s' % str(err))
1970                     return
1971                 self.record_download_archive(info_dict)
1972
1973     def download(self, url_list):
1974         """Download a given list of URLs."""
1975         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1976         if (len(url_list) > 1 and
1977                 outtmpl != '-' and
1978                 '%' not in outtmpl and
1979                 self.params.get('max_downloads') != 1):
1980             raise SameFileError(outtmpl)
1981
1982         for url in url_list:
1983             try:
1984                 # It also downloads the videos
1985                 res = self.extract_info(
1986                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1987             except UnavailableVideoError:
1988                 self.report_error('unable to download video')
1989             except MaxDownloadsReached:
1990                 self.to_screen('[info] Maximum number of downloaded files reached.')
1991                 raise
1992             else:
1993                 if self.params.get('dump_single_json', False):
1994                     self.to_stdout(json.dumps(res))
1995
1996         return self._download_retcode
1997
1998     def download_with_info_file(self, info_filename):
1999         with contextlib.closing(fileinput.FileInput(
2000                 [info_filename], mode='r',
2001                 openhook=fileinput.hook_encoded('utf-8'))) as f:
2002             # FileInput doesn't have a read method, we can't call json.load
2003             info = self.filter_requested_info(json.loads('\n'.join(f)))
2004         try:
2005             self.process_ie_result(info, download=True)
2006         except DownloadError:
2007             webpage_url = info.get('webpage_url')
2008             if webpage_url is not None:
2009                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2010                 return self.download([webpage_url])
2011             else:
2012                 raise
2013         return self._download_retcode
2014
2015     @staticmethod
2016     def filter_requested_info(info_dict):
2017         return dict(
2018             (k, v) for k, v in info_dict.items()
2019             if k not in ['requested_formats', 'requested_subtitles'])
2020
2021     def post_process(self, filename, ie_info):
2022         """Run all the postprocessors on the given file."""
2023         info = dict(ie_info)
2024         info['filepath'] = filename
2025         pps_chain = []
2026         if ie_info.get('__postprocessors') is not None:
2027             pps_chain.extend(ie_info['__postprocessors'])
2028         pps_chain.extend(self._pps)
2029         for pp in pps_chain:
2030             files_to_delete = []
2031             try:
2032                 files_to_delete, info = pp.run(info)
2033             except PostProcessingError as e:
2034                 self.report_error(e.msg)
2035             if files_to_delete and not self.params.get('keepvideo', False):
2036                 for old_filename in files_to_delete:
2037                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2038                     try:
2039                         os.remove(encodeFilename(old_filename))
2040                     except (IOError, OSError):
2041                         self.report_warning('Unable to remove downloaded original file')
2042
2043     def _make_archive_id(self, info_dict):
2044         # Future-proof against any change in case
2045         # and backwards compatibility with prior versions
2046         extractor = info_dict.get('extractor_key')
2047         if extractor is None:
2048             if 'id' in info_dict:
2049                 extractor = info_dict.get('ie_key')  # key in a playlist
2050         if extractor is None:
2051             return None  # Incomplete video information
2052         return extractor.lower() + ' ' + info_dict['id']
2053
2054     def in_download_archive(self, info_dict):
2055         fn = self.params.get('download_archive')
2056         if fn is None:
2057             return False
2058
2059         vid_id = self._make_archive_id(info_dict)
2060         if vid_id is None:
2061             return False  # Incomplete video information
2062
2063         try:
2064             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
2065                 for line in archive_file:
2066                     if line.strip() == vid_id:
2067                         return True
2068         except IOError as ioe:
2069             if ioe.errno != errno.ENOENT:
2070                 raise
2071         return False
2072
2073     def record_download_archive(self, info_dict):
2074         fn = self.params.get('download_archive')
2075         if fn is None:
2076             return
2077         vid_id = self._make_archive_id(info_dict)
2078         assert vid_id
2079         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2080             archive_file.write(vid_id + '\n')
2081
2082     @staticmethod
2083     def format_resolution(format, default='unknown'):
2084         if format.get('vcodec') == 'none':
2085             return 'audio only'
2086         if format.get('resolution') is not None:
2087             return format['resolution']
2088         if format.get('height') is not None:
2089             if format.get('width') is not None:
2090                 res = '%sx%s' % (format['width'], format['height'])
2091             else:
2092                 res = '%sp' % format['height']
2093         elif format.get('width') is not None:
2094             res = '%dx?' % format['width']
2095         else:
2096             res = default
2097         return res
2098
2099     def _format_note(self, fdict):
2100         res = ''
2101         if fdict.get('ext') in ['f4f', 'f4m']:
2102             res += '(unsupported) '
2103         if fdict.get('language'):
2104             if res:
2105                 res += ' '
2106             res += '[%s] ' % fdict['language']
2107         if fdict.get('format_note') is not None:
2108             res += fdict['format_note'] + ' '
2109         if fdict.get('tbr') is not None:
2110             res += '%4dk ' % fdict['tbr']
2111         if fdict.get('container') is not None:
2112             if res:
2113                 res += ', '
2114             res += '%s container' % fdict['container']
2115         if (fdict.get('vcodec') is not None and
2116                 fdict.get('vcodec') != 'none'):
2117             if res:
2118                 res += ', '
2119             res += fdict['vcodec']
2120             if fdict.get('vbr') is not None:
2121                 res += '@'
2122         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
2123             res += 'video@'
2124         if fdict.get('vbr') is not None:
2125             res += '%4dk' % fdict['vbr']
2126         if fdict.get('fps') is not None:
2127             if res:
2128                 res += ', '
2129             res += '%sfps' % fdict['fps']
2130         if fdict.get('acodec') is not None:
2131             if res:
2132                 res += ', '
2133             if fdict['acodec'] == 'none':
2134                 res += 'video only'
2135             else:
2136                 res += '%-5s' % fdict['acodec']
2137         elif fdict.get('abr') is not None:
2138             if res:
2139                 res += ', '
2140             res += 'audio'
2141         if fdict.get('abr') is not None:
2142             res += '@%3dk' % fdict['abr']
2143         if fdict.get('asr') is not None:
2144             res += ' (%5dHz)' % fdict['asr']
2145         if fdict.get('filesize') is not None:
2146             if res:
2147                 res += ', '
2148             res += format_bytes(fdict['filesize'])
2149         elif fdict.get('filesize_approx') is not None:
2150             if res:
2151                 res += ', '
2152             res += '~' + format_bytes(fdict['filesize_approx'])
2153         return res
2154
2155     def list_formats(self, info_dict):
2156         formats = info_dict.get('formats', [info_dict])
2157         table = [
2158             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2159             for f in formats
2160             if f.get('preference') is None or f['preference'] >= -1000]
2161         if len(formats) > 1:
2162             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2163
2164         header_line = ['format code', 'extension', 'resolution', 'note']
2165         self.to_screen(
2166             '[info] Available formats for %s:\n%s' %
2167             (info_dict['id'], render_table(header_line, table)))
2168
2169     def list_thumbnails(self, info_dict):
2170         thumbnails = info_dict.get('thumbnails')
2171         if not thumbnails:
2172             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2173             return
2174
2175         self.to_screen(
2176             '[info] Thumbnails for %s:' % info_dict['id'])
2177         self.to_screen(render_table(
2178             ['ID', 'width', 'height', 'URL'],
2179             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2180
2181     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2182         if not subtitles:
2183             self.to_screen('%s has no %s' % (video_id, name))
2184             return
2185         self.to_screen(
2186             'Available %s for %s:' % (name, video_id))
2187         self.to_screen(render_table(
2188             ['Language', 'formats'],
2189             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2190                 for lang, formats in subtitles.items()]))
2191
2192     def urlopen(self, req):
2193         """ Start an HTTP download """
2194         if isinstance(req, compat_basestring):
2195             req = sanitized_Request(req)
2196         return self._opener.open(req, timeout=self._socket_timeout)
2197
2198     def print_debug_header(self):
2199         if not self.params.get('verbose'):
2200             return
2201
2202         if type('') is not compat_str:
2203             # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
2204             self.report_warning(
2205                 'Your Python is broken! Update to a newer and supported version')
2206
2207         stdout_encoding = getattr(
2208             sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
2209         encoding_str = (
2210             '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
2211                 locale.getpreferredencoding(),
2212                 sys.getfilesystemencoding(),
2213                 stdout_encoding,
2214                 self.get_encoding()))
2215         write_string(encoding_str, encoding=None)
2216
2217         self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
2218         if _LAZY_LOADER:
2219             self._write_string('[debug] Lazy loading extractors enabled' + '\n')
2220         try:
2221             sp = subprocess.Popen(
2222                 ['git', 'rev-parse', '--short', 'HEAD'],
2223                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
2224                 cwd=os.path.dirname(os.path.abspath(__file__)))
2225             out, err = sp.communicate()
2226             out = out.decode().strip()
2227             if re.match('[0-9a-f]+', out):
2228                 self._write_string('[debug] Git HEAD: ' + out + '\n')
2229         except Exception:
2230             try:
2231                 sys.exc_clear()
2232             except Exception:
2233                 pass
2234         self._write_string('[debug] Python version %s - %s\n' % (
2235             platform.python_version(), platform_name()))
2236
2237         exe_versions = FFmpegPostProcessor.get_versions(self)
2238         exe_versions['rtmpdump'] = rtmpdump_version()
2239         exe_versions['phantomjs'] = PhantomJSwrapper._version()
2240         exe_str = ', '.join(
2241             '%s %s' % (exe, v)
2242             for exe, v in sorted(exe_versions.items())
2243             if v
2244         )
2245         if not exe_str:
2246             exe_str = 'none'
2247         self._write_string('[debug] exe versions: %s\n' % exe_str)
2248
2249         proxy_map = {}
2250         for handler in self._opener.handlers:
2251             if hasattr(handler, 'proxies'):
2252                 proxy_map.update(handler.proxies)
2253         self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
2254
2255         if self.params.get('call_home', False):
2256             ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
2257             self._write_string('[debug] Public IP address: %s\n' % ipaddr)
2258             latest_version = self.urlopen(
2259                 'https://yt-dl.org/latest/version').read().decode('utf-8')
2260             if version_tuple(latest_version) > version_tuple(__version__):
2261                 self.report_warning(
2262                     'You are using an outdated version (newest version: %s)! '
2263                     'See https://yt-dl.org/update if you need help updating.' %
2264                     latest_version)
2265
2266     def _setup_opener(self):
2267         timeout_val = self.params.get('socket_timeout')
2268         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2269
2270         opts_cookiefile = self.params.get('cookiefile')
2271         opts_proxy = self.params.get('proxy')
2272
2273         if opts_cookiefile is None:
2274             self.cookiejar = compat_cookiejar.CookieJar()
2275         else:
2276             opts_cookiefile = expand_path(opts_cookiefile)
2277             self.cookiejar = compat_cookiejar.MozillaCookieJar(
2278                 opts_cookiefile)
2279             if os.access(opts_cookiefile, os.R_OK):
2280                 self.cookiejar.load()
2281
2282         cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2283         if opts_proxy is not None:
2284             if opts_proxy == '':
2285                 proxies = {}
2286             else:
2287                 proxies = {'http': opts_proxy, 'https': opts_proxy}
2288         else:
2289             proxies = compat_urllib_request.getproxies()
2290             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
2291             if 'http' in proxies and 'https' not in proxies:
2292                 proxies['https'] = proxies['http']
2293         proxy_handler = PerRequestProxyHandler(proxies)
2294
2295         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2296         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2297         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2298         data_handler = compat_urllib_request_DataHandler()
2299
2300         # When passing our own FileHandler instance, build_opener won't add the
2301         # default FileHandler and allows us to disable the file protocol, which
2302         # can be used for malicious purposes (see
2303         # https://github.com/rg3/youtube-dl/issues/8227)
2304         file_handler = compat_urllib_request.FileHandler()
2305
2306         def file_open(*args, **kwargs):
2307             raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
2308         file_handler.file_open = file_open
2309
2310         opener = compat_urllib_request.build_opener(
2311             proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
2312
2313         # Delete the default user-agent header, which would otherwise apply in
2314         # cases where our custom HTTP handler doesn't come into play
2315         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
2316         opener.addheaders = []
2317         self._opener = opener
2318
2319     def encode(self, s):
2320         if isinstance(s, bytes):
2321             return s  # Already encoded
2322
2323         try:
2324             return s.encode(self.get_encoding())
2325         except UnicodeEncodeError as err:
2326             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2327             raise
2328
2329     def get_encoding(self):
2330         encoding = self.params.get('encoding')
2331         if encoding is None:
2332             encoding = preferredencoding()
2333         return encoding
2334
    def _write_thumbnails(self, info_dict, filename):
        """Download the video's thumbnail(s) next to the output file.

        With 'writethumbnail' only the last listed thumbnail is fetched;
        with 'write_all_thumbnails' every known one is.  Each written
        thumbnail's path is recorded back into its dict under 'filename'.
        """
        if self.params.get('writethumbnail', False):
            thumbnails = info_dict.get('thumbnails')
            if thumbnails:
                # Last entry only — presumably the best quality; TODO confirm ordering
                thumbnails = [thumbnails[-1]]
        elif self.params.get('write_all_thumbnails', False):
            thumbnails = info_dict.get('thumbnails')
        else:
            return

        if not thumbnails:
            # No thumbnails present, so return immediately
            return

        for t in thumbnails:
            thumb_ext = determine_ext(t['url'], 'jpg')
            # Suffix the thumbnail id only when several files are written,
            # so their names don't collide
            suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
            thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
            t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext

            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                self.to_screen('[%s] %s: Thumbnail %sis already present' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
            else:
                self.to_screen('[%s] %s: Downloading thumbnail %s...' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
                try:
                    uf = self.urlopen(t['url'])
                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                    self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
                                   (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    # Thumbnails are best-effort: network failures only warn
                    self.report_warning('Unable to download thumbnail "%s": %s' %
                                        (t['url'], error_to_compat_str(err)))