[YoutubeDL] write raw subtitle files
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from .compat import (
30     compat_basestring,
31     compat_cookiejar,
32     compat_get_terminal_size,
33     compat_http_client,
34     compat_kwargs,
35     compat_numeric_types,
36     compat_os_name,
37     compat_str,
38     compat_tokenize_tokenize,
39     compat_urllib_error,
40     compat_urllib_request,
41     compat_urllib_request_DataHandler,
42 )
43 from .utils import (
44     age_restricted,
45     args_to_str,
46     ContentTooShortError,
47     date_from_str,
48     DateRange,
49     DEFAULT_OUTTMPL,
50     determine_ext,
51     determine_protocol,
52     DownloadError,
53     encode_compat_str,
54     encodeFilename,
55     error_to_compat_str,
56     expand_path,
57     ExtractorError,
58     format_bytes,
59     formatSeconds,
60     GeoRestrictedError,
61     ISO3166Utils,
62     locked_file,
63     make_HTTPS_handler,
64     MaxDownloadsReached,
65     PagedList,
66     parse_filesize,
67     PerRequestProxyHandler,
68     platform_name,
69     PostProcessingError,
70     preferredencoding,
71     prepend_extension,
72     register_socks_protocols,
73     render_table,
74     replace_extension,
75     SameFileError,
76     sanitize_filename,
77     sanitize_path,
78     sanitize_url,
79     sanitized_Request,
80     std_headers,
81     subtitles_filename,
82     UnavailableVideoError,
83     url_basename,
84     version_tuple,
85     write_json_file,
86     write_string,
87     YoutubeDLCookieProcessor,
88     YoutubeDLHandler,
89 )
90 from .cache import Cache
91 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
92 from .downloader import get_suitable_downloader
93 from .downloader.rtmp import rtmpdump_version
94 from .postprocessor import (
95     FFmpegFixupM3u8PP,
96     FFmpegFixupM4aPP,
97     FFmpegFixupStretchedPP,
98     FFmpegMergerPP,
99     FFmpegPostProcessor,
100     get_postprocessor,
101 )
102 from .version import __version__
103
104 if compat_os_name == 'nt':
105     import ctypes
106
107
108 class YoutubeDL(object):
109     """YoutubeDL class.
110
111     YoutubeDL objects are the ones responsible of downloading the
112     actual video file and writing it to disk if the user has requested
113     it, among some other tasks. In most cases there should be one per
114     program. As, given a video URL, the downloader doesn't know how to
115     extract all the needed information, task that InfoExtractors do, it
116     has to pass the URL to one of them.
117
118     For this, YoutubeDL objects have a method that allows
119     InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it over to the first InfoExtractor it
121     finds that reports being able to handle it. The InfoExtractor extracts
122     all the information about the video or videos the URL refers to, and
123     YoutubeDL process the extracted information, possibly using a File
124     Downloader to download the video.
125
126     YoutubeDL objects accept a lot of parameters. In order not to saturate
127     the object constructor with arguments, it receives a dictionary of
128     options instead. These options are available through the params
129     attribute for the InfoExtractors to use. The YoutubeDL also
130     registers itself as the downloader in charge for the InfoExtractors
131     that are added to it, so this is a "mutual registration".
132
133     Available options:
134
135     username:          Username for authentication purposes.
136     password:          Password for authentication purposes.
137     videopassword:     Password for accessing a video.
138     ap_mso:            Adobe Pass multiple-system operator identifier.
139     ap_username:       Multiple-system operator account username.
140     ap_password:       Multiple-system operator account password.
141     usenetrc:          Use netrc for authentication instead.
142     verbose:           Print additional info to stdout.
143     quiet:             Do not print messages to stdout.
144     no_warnings:       Do not print out anything for warnings.
145     forceurl:          Force printing final URL.
146     forcetitle:        Force printing title.
147     forceid:           Force printing ID.
148     forcethumbnail:    Force printing thumbnail URL.
149     forcedescription:  Force printing description.
150     forcefilename:     Force printing final filename.
151     forceduration:     Force printing duration.
152     forcejson:         Force printing info_dict as JSON.
153     dump_single_json:  Force printing the info_dict of the whole playlist
154                        (or video) as a single JSON line.
155     simulate:          Do not download the video files.
156     format:            Video format code. See options.py for more information.
157     outtmpl:           Template for output names.
158     restrictfilenames: Do not allow "&" and spaces in file names
159     ignoreerrors:      Do not stop on download errors.
160     force_generic_extractor: Force downloader to use the generic extractor
161     nooverwrites:      Prevent overwriting files.
162     playliststart:     Playlist item to start at.
163     playlistend:       Playlist item to end at.
164     playlist_items:    Specific indices of playlist to download.
165     playlistreverse:   Download playlist items in reverse order.
166     playlistrandom:    Download playlist items in random order.
167     matchtitle:        Download only matching titles.
168     rejecttitle:       Reject downloads for matching titles.
169     logger:            Log messages to a logging.Logger instance.
170     logtostderr:       Log messages to stderr instead of stdout.
171     writedescription:  Write the video description to a .description file
172     writeinfojson:     Write the video description to a .info.json file
173     writeannotations:  Write the video annotations to a .annotations.xml file
174     writethumbnail:    Write the thumbnail image to a file
175     write_all_thumbnails:  Write all thumbnail formats to files
176     writesubtitles:    Write the video subtitles to a file
177     writeautomaticsub: Write the automatically generated subtitles to a file
178     allsubtitles:      Downloads all the subtitles of the video
179                        (requires writesubtitles or writeautomaticsub)
180     listsubtitles:     Lists all available subtitles for the video
181     subtitlesformat:   The format code for subtitles
182     subtitleslangs:    List of languages of the subtitles to download
183     keepvideo:         Keep the video file after post-processing
184     daterange:         A DateRange object, download only if the upload_date is in the range.
185     skip_download:     Skip the actual download of the video file
186     cachedir:          Location of the cache files in the filesystem.
187                        False to disable filesystem cache.
188     noplaylist:        Download single video instead of a playlist if in doubt.
189     age_limit:         An integer representing the user's age in years.
190                        Unsuitable videos for the given age are skipped.
191     min_views:         An integer representing the minimum view count the video
192                        must have in order to not be skipped.
193                        Videos without view count information are always
194                        downloaded. None for no limit.
195     max_views:         An integer representing the maximum view count.
196                        Videos that are more popular than that are not
197                        downloaded.
198                        Videos without view count information are always
199                        downloaded. None for no limit.
200     download_archive:  File name of a file where all downloads are recorded.
201                        Videos already present in the file are not downloaded
202                        again.
203     cookiefile:        File name where cookies should be read from and dumped to.
204     nocheckcertificate:Do not verify SSL certificates
205     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
206                        At the moment, this is only supported by YouTube.
207     proxy:             URL of the proxy server to use
208     geo_verification_proxy:  URL of the proxy to use for IP address verification
209                        on geo-restricted sites. (Experimental)
210     socket_timeout:    Time to wait for unresponsive hosts, in seconds
211     bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
213     debug_printtraffic:Print out sent and received HTTP traffic
214     include_ads:       Download ads as well
215     default_search:    Prepend this string if an input url is not valid.
216                        'auto' for elaborate guessing
217     encoding:          Use this encoding instead of the system-specified.
218     extract_flat:      Do not resolve URLs, return the immediate result.
219                        Pass in 'in_playlist' to only show this behavior for
220                        playlist items.
221     postprocessors:    A list of dictionaries, each with an entry
222                        * key:  The name of the postprocessor. See
223                                youtube_dl/postprocessor/__init__.py for a list.
224                        as well as any further keyword arguments for the
225                        postprocessor.
226     progress_hooks:    A list of functions that get called on download
227                        progress, with a dictionary with the entries
228                        * status: One of "downloading", "error", or "finished".
229                                  Check this first and ignore unknown values.
230
231                        If status is one of "downloading", or "finished", the
232                        following properties may also be present:
233                        * filename: The final filename (always present)
234                        * tmpfilename: The filename we're currently writing to
235                        * downloaded_bytes: Bytes on disk
236                        * total_bytes: Size of the whole file, None if unknown
237                        * total_bytes_estimate: Guess of the eventual file size,
238                                                None if unavailable.
239                        * elapsed: The number of seconds since download started.
240                        * eta: The estimated time in seconds, None if unknown
241                        * speed: The download speed in bytes/second, None if
242                                 unknown
243                        * fragment_index: The counter of the currently
244                                          downloaded video fragment.
245                        * fragment_count: The number of fragments (= individual
246                                          files that will be merged)
247
248                        Progress hooks are guaranteed to be called at least once
249                        (with status "finished") if the download is successful.
250     merge_output_format: Extension to use when merging formats.
251     fixup:             Automatically correct known faults of the file.
252                        One of:
253                        - "never": do nothing
254                        - "warn": only emit a warning
255                        - "detect_or_warn": check whether we can do anything
256                                            about it, warn otherwise (default)
257     source_address:    (Experimental) Client-side IP address to bind to.
258     call_home:         Boolean, true iff we are allowed to contact the
259                        youtube-dl servers for debugging.
260     sleep_interval:    Number of seconds to sleep before each download when
261                        used alone or a lower bound of a range for randomized
262                        sleep before each download (minimum possible number
263                        of seconds to sleep) when used along with
264                        max_sleep_interval.
265     max_sleep_interval:Upper bound of a range for randomized sleep before each
266                        download (maximum possible number of seconds to sleep).
267                        Must only be used along with sleep_interval.
268                        Actual sleep time will be a random float from range
269                        [sleep_interval; max_sleep_interval].
270     listformats:       Print an overview of available video formats and exit.
271     list_thumbnails:   Print a table of all thumbnails and exit.
272     match_filter:      A function that gets called with the info_dict of
273                        every video.
274                        If it returns a message, the video is ignored.
275                        If it returns None, the video is downloaded.
276                        match_filter_func in utils.py is one example for this.
277     no_color:          Do not emit color codes in output.
278     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
279                        HTTP header (experimental)
280     geo_bypass_country:
281                        Two-letter ISO 3166-2 country code that will be used for
282                        explicit geographic restriction bypassing via faking
283                        X-Forwarded-For HTTP header (experimental)
284
285     The following options determine which downloader is picked:
286     external_downloader: Executable of the external downloader to call.
287                        None or unset for standard (built-in) downloader.
288     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
289                        if True, otherwise use ffmpeg/avconv if False, otherwise
290                        use downloader suggested by extractor if None.
291
292     The following parameters are not used by YoutubeDL itself, they are used by
293     the downloader (see youtube_dl/downloader/common.py):
294     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
295     noresizebuffer, retries, continuedl, noprogress, consoletitle,
296     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
297
298     The following options are used by the post processors:
299     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
300                        otherwise prefer avconv.
301     postprocessor_args: A list of additional command-line arguments for the
302                         postprocessor.
303     """
304
    # Option dictionary; the real value is built in __init__ (class-level
    # default kept for safety when methods run on a bare instance).
    params = None
    # Registered InfoExtractors (classes or instances), in lookup order.
    _ies = []
    # Post-processor chain, applied in order after each download.
    _pps = []
    # Exit code accumulated across downloads (0 = success, 1 = some error).
    _download_retcode = None
    # Count of files downloaded so far; feeds %(autonumber)s in templates.
    _num_downloads = None
    # File-like object receiving screen output (stdout, or stderr with logtostderr).
    _screen_file = None
311
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params:    Option dictionary (see the class docstring); merged over
                   the built-in defaults. None means "all defaults".
        auto_init: When True, print the debug header and register all of the
                   standard info extractors immediately.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Screen output goes to stderr instead of stdout when logtostderr is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        def check_deprecated(param, option, suggestion):
            # Warn when a deprecated option is present; returns whether it was set.
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            # Keep the old cn_verification_proxy option working by mapping it
            # onto geo_verification_proxy unless the new one is set explicitly.
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            # Pipe screen output through bidiv/fribidi via a pty so that
            # right-to-left text renders correctly on terminals without
            # bidirectional-text support.
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    # bidiv is not installed; fall back to fribidi.
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.version_info >= (3,) and sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # On Python 3, the Unicode filesystem API will throw errors (#1474)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        # Build the urllib opener (proxies, cookies, custom handlers).
        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate configured postprocessors: 'key' selects the PP class,
        # the remaining entries become its keyword arguments.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
405
406     def warn_if_short_id(self, argv):
407         # short YouTube ID starting with dash?
408         idxs = [
409             i for i, a in enumerate(argv)
410             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
411         if idxs:
412             correct_argv = (
413                 ['youtube-dl'] +
414                 [a for i, a in enumerate(argv) if i not in idxs] +
415                 ['--'] + [argv[i] for i in idxs]
416             )
417             self.report_warning(
418                 'Long argument string detected. '
419                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
420                 args_to_str(correct_argv))
421
422     def add_info_extractor(self, ie):
423         """Add an InfoExtractor object to the end of the list."""
424         self._ies.append(ie)
425         if not isinstance(ie, type):
426             self._ies_instances[ie.ie_key()] = ie
427             ie.set_downloader(self)
428
429     def get_info_extractor(self, ie_key):
430         """
431         Get an instance of an IE with name ie_key, it will try to get one from
432         the _ies list, if there's no instance it will create a new one and add
433         it to the extractor list.
434         """
435         ie = self._ies_instances.get(ie_key)
436         if ie is None:
437             ie = get_info_extractor(ie_key)()
438             self.add_info_extractor(ie)
439         return ie
440
441     def add_default_info_extractors(self):
442         """
443         Add the InfoExtractors returned by gen_extractors to the end of the list
444         """
445         for ie in gen_extractor_classes():
446             self.add_info_extractor(ie)
447
448     def add_post_processor(self, pp):
449         """Add a PostProcessor object to the end of the chain."""
450         self._pps.append(pp)
451         pp.set_downloader(self)
452
453     def add_progress_hook(self, ph):
454         """Add the progress hook (currently only for the file downloader)"""
455         self._progress_hooks.append(ph)
456
457     def _bidi_workaround(self, message):
458         if not hasattr(self, '_output_channel'):
459             return message
460
461         assert hasattr(self, '_output_process')
462         assert isinstance(message, compat_str)
463         line_count = message.count('\n') + 1
464         self._output_process.stdin.write((message + '\n').encode('utf-8'))
465         self._output_process.stdin.flush()
466         res = ''.join(self._output_channel.readline().decode('utf-8')
467                       for _ in range(line_count))
468         return res[:-len('\n')]
469
470     def to_screen(self, message, skip_eol=False):
471         """Print message to stdout if not in quiet mode."""
472         return self.to_stdout(message, skip_eol, check_quiet=True)
473
474     def _write_string(self, s, out=None):
475         write_string(s, out=out, encoding=self.params.get('encoding'))
476
477     def to_stdout(self, message, skip_eol=False, check_quiet=False):
478         """Print message to stdout if not in quiet mode."""
479         if self.params.get('logger'):
480             self.params['logger'].debug(message)
481         elif not check_quiet or not self.params.get('quiet', False):
482             message = self._bidi_workaround(message)
483             terminator = ['\n', ''][skip_eol]
484             output = message + terminator
485
486             self._write_string(output, self._screen_file)
487
488     def to_stderr(self, message):
489         """Print message to stderr."""
490         assert isinstance(message, compat_str)
491         if self.params.get('logger'):
492             self.params['logger'].error(message)
493         else:
494             message = self._bidi_workaround(message)
495             output = message + '\n'
496             self._write_string(output, self._err_file)
497
    def to_console_title(self, message):
        # Setting the terminal title is opt-in via the consoletitle option.
        if not self.params.get('consoletitle', False):
            return
        if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # OSC 0 escape sequence: set the terminal window/icon title.
            # NOTE(review): assumes any TERM-bearing terminal understands it.
            self._write_string('\033]0;%s\007' % message, self._screen_file)
507
508     def save_console_title(self):
509         if not self.params.get('consoletitle', False):
510             return
511         if 'TERM' in os.environ:
512             # Save the title on stack
513             self._write_string('\033[22;0t', self._screen_file)
514
515     def restore_console_title(self):
516         if not self.params.get('consoletitle', False):
517             return
518         if 'TERM' in os.environ:
519             # Restore the title from stack
520             self._write_string('\033[23;0t', self._screen_file)
521
522     def __enter__(self):
523         self.save_console_title()
524         return self
525
526     def __exit__(self, *args):
527         self.restore_console_title()
528
529         if self.params.get('cookiefile') is not None:
530             self.cookiejar.save()
531
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Prefer the wrapped traceback when the active exception
                    # carries one in an exc_info attribute (e.g. extractor errors).
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # No active exception: show the call stack leading here instead.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, preserving the wrapped exc_info when
            # the active exception provides one.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1
561
562     def report_warning(self, message):
563         '''
564         Print the message to stderr, it will be prefixed with 'WARNING:'
565         If stderr is a tty file the 'WARNING:' will be colored
566         '''
567         if self.params.get('logger') is not None:
568             self.params['logger'].warning(message)
569         else:
570             if self.params.get('no_warnings'):
571                 return
572             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
573                 _msg_header = '\033[0;33mWARNING:\033[0m'
574             else:
575                 _msg_header = 'WARNING:'
576             warning_message = '%s %s' % (_msg_header, message)
577             self.to_stderr(warning_message)
578
579     def report_error(self, message, tb=None):
580         '''
581         Do the same as trouble, but prefixes the message with 'ERROR:', colored
582         in red if stderr is a tty file.
583         '''
584         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
585             _msg_header = '\033[0;31mERROR:\033[0m'
586         else:
587             _msg_header = 'ERROR:'
588         error_message = '%s %s' % (_msg_header, message)
589         self.trouble(error_message, tb)
590
591     def report_file_already_downloaded(self, file_name):
592         """Report file has already been fully downloaded."""
593         try:
594             self.to_screen('[download] %s has already been downloaded' % file_name)
595         except UnicodeEncodeError:
596             self.to_screen('[download] The file has already been downloaded')
597
    def prepare_filename(self, info_dict):
        """Generate the output filename.

        Expands the 'outtmpl' option (a %-style format string) against a
        sanitized copy of info_dict and returns the resulting path, or None
        when the template itself is invalid (ValueError during expansion).
        """
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
            # Synthesize 'resolution' from width/height when the extractor
            # did not provide one.
            if template_dict.get('resolution') is None:
                if template_dict.get('width') and template_dict.get('height'):
                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
                elif template_dict.get('height'):
                    template_dict['resolution'] = '%sp' % template_dict['height']
                elif template_dict.get('width'):
                    template_dict['resolution'] = '%dx?' % template_dict['width']

            # Sanitize string values for filesystem use; numeric values are
            # kept untouched so numeric format specifiers keep working.
            # Lists/tuples/dicts and None values are dropped entirely.
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id' or k.endswith('_id')))
            template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None and not isinstance(v, (list, tuple, dict)))
            # Any field missing from the dict renders as the literal 'NA'.
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)

            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)

            # For fields playlist_index and autonumber convert all occurrences
            # of %(field)s to %(field)0Nd for backward compatibility
            field_size_compat_map = {
                'playlist_index': len(str(template_dict['n_entries'])),
                'autonumber': autonumber_size,
            }
            FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
            mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
            if mobj:
                outtmpl = re.sub(
                    FIELD_SIZE_COMPAT_RE,
                    r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
                    outtmpl)

            NUMERIC_FIELDS = set((
                'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
                'timestamp', 'upload_year', 'upload_month', 'upload_day',
                'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
                'average_rating', 'comment_count', 'age_limit',
                'start_time', 'end_time',
                'chapter_number', 'season_number', 'episode_number',
                'track_number', 'disc_number', 'release_year',
                'playlist_index',
            ))

            # Missing numeric fields used together with integer presentation types
            # in format specification will break the argument substitution since
            # string 'NA' is returned for missing fields. We will patch output
            # template for missing fields to meet string presentation type.
            for numeric_field in NUMERIC_FIELDS:
                if numeric_field not in template_dict:
                    # As of [1] format syntax is:
                    #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
                    # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
                    FORMAT_RE = r'''(?x)
                        (?<!%)
                        %
                        \({0}\)  # mapping key
                        (?:[#0\-+ ]+)?  # conversion flags (optional)
                        (?:\d+)?  # minimum field width (optional)
                        (?:\.\d+)?  # precision (optional)
                        [hlL]?  # length modifier (optional)
                        [diouxXeEfFgGcrs%]  # conversion type
                    '''
                    outtmpl = re.sub(
                        FORMAT_RE.format(numeric_field),
                        r'%({0})s'.format(numeric_field), outtmpl)

            filename = expand_path(outtmpl % template_dict)
            # Temporary fix for #4787
            # 'Treat' all problem characters by passing filename through preferredencoding
            # to workaround encoding issues with subprocess on python2 @ Windows
            if sys.version_info < (3, 0) and sys.platform == 'win32':
                filename = encodeFilename(filename, True).decode(preferredencoding())
            return sanitize_path(filename)
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
685
686     def _match_entry(self, info_dict, incomplete):
687         """ Returns None iff the file should be downloaded """
688
689         video_title = info_dict.get('title', info_dict.get('id', 'video'))
690         if 'title' in info_dict:
691             # This can happen when we're just evaluating the playlist
692             title = info_dict['title']
693             matchtitle = self.params.get('matchtitle', False)
694             if matchtitle:
695                 if not re.search(matchtitle, title, re.IGNORECASE):
696                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
697             rejecttitle = self.params.get('rejecttitle', False)
698             if rejecttitle:
699                 if re.search(rejecttitle, title, re.IGNORECASE):
700                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
701         date = info_dict.get('upload_date')
702         if date is not None:
703             dateRange = self.params.get('daterange', DateRange())
704             if date not in dateRange:
705                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
706         view_count = info_dict.get('view_count')
707         if view_count is not None:
708             min_views = self.params.get('min_views')
709             if min_views is not None and view_count < min_views:
710                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
711             max_views = self.params.get('max_views')
712             if max_views is not None and view_count > max_views:
713                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
714         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
715             return 'Skipping "%s" because it is age restricted' % video_title
716         if self.in_download_archive(info_dict):
717             return '%s has already been recorded in archive' % video_title
718
719         if not incomplete:
720             match_filter = self.params.get('match_filter')
721             if match_filter is not None:
722                 ret = match_filter(info_dict)
723                 if ret is not None:
724                     return ret
725
726         return None
727
728     @staticmethod
729     def add_extra_info(info_dict, extra_info):
730         '''Set the keys from extra_info in info dict if they are missing'''
731         for key, value in extra_info.items():
732             info_dict.setdefault(key, value)
733
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True, force_generic_extractor=False):
        '''
        Extract information for url with the first suitable info extractor.

        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result
        (it is forwarded to process_ie_result when 'process' is true).
        ie_key, when given, restricts extraction to that single extractor;
        force_generic_extractor implies ie_key='Generic' when no ie_key is set.
        If 'process' is false, the raw extractor result is returned without
        resolving nested references.

        Returns None when extraction failed and the error was reported
        instead of raised (e.g. with ignoreerrors), or when no extractor
        was suitable.

        NOTE(review): extra_info has a shared mutable default ({}); it is
        only passed along here, but callers should not mutate it.
        '''

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        # Either use the single requested extractor or consider all
        # registered ones.
        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        # Try extractors in order; the first one suitable for this URL wins.
        for ie in ies:
            if not ie.suitable(url):
                continue

            # Re-fetch the extractor so we get the instance bound to this
            # YoutubeDL object.
            ie = self.get_info_extractor(ie.ie_key())
            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                self.add_default_extra_info(ie_result, ie, url)
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except GeoRestrictedError as e:
                # Expected failure: report (with country hint) instead of
                # crashing, and stop trying further extractors.
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
                self.report_error(msg)
                break
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
                break
            except MaxDownloadsReached:
                # Always propagate: this is the signal to stop everything.
                raise
            except Exception as e:
                # Unexpected error: swallow only when the user asked for it.
                if self.params.get('ignoreerrors', False):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        # for/else: runs only when no extractor was suitable (loop never
        # hit break or return).
        else:
            self.report_error('no suitable InfoExtractor for URL %s' % url)
795
796     def add_default_extra_info(self, ie_result, ie, url):
797         self.add_extra_info(ie_result, {
798             'extractor': ie.IE_NAME,
799             'webpage_url': url,
800             'webpage_url_basename': url_basename(url),
801             'extractor_key': ie.ie_key(),
802         })
803
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.

        Dispatches on ie_result['_type']:
          * 'video' (the default): process a single video result
          * 'url': re-run extraction on the contained URL
          * 'url_transparent': like 'url', but metadata present in this
            result overrides what the embedded URL extracts
          * 'playlist' / 'multi_video': process every selected entry
          * 'compat_list': legacy list result from old extractors

        NOTE(review): extra_info has a shared mutable default ({}); it is
        only read here, but callers should not mutate it.
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            extract_flat = self.params.get('extract_flat', False)
            # With --flat-playlist ('in_playlist') or extract_flat=True,
            # URL results are returned unresolved instead of re-extracted.
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # extract_info may return None when ignoreerrors is enabled and
            # extraction failed with an error, don't crash and return early
            # in this case
            if not info:
                return info

            # Non-None values from the outer (embedding) result win over
            # the inner extraction, except the dispatch/bookkeeping keys.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Extracted info may not be a video result (i.e.
            # info.get('_type', 'video') != video) but rather an url or
            # url_transparent. In such cases outer metadata (from ie_result)
            # should be propagated to inner one (info). For this to happen
            # _type of info should be overridden with url_transparent. This
            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
            if new_result.get('_type') == 'url':
                new_result['_type'] = 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type in ('playlist', 'multi_video'):
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # --playlist-start is 1-based on the command line; convert to a
            # 0-based slice offset here.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            # --playlist-items: comma-separated 1-based indices and
            # 'start-end' ranges, yielded lazily (the generator is consumed
            # exactly once below).
            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            # Entries may be a concrete list, a lazily-paged PagedList, or
            # any other iterable; each case selects the requested slice.
            ie_entries = ie_result['entries']
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = [
                        ie_entries[i - 1] for i in playlistitems
                        if -n_all_entries <= i - 1 < n_all_entries]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                if playlistitems:
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            if self.params.get('playlistrandom', False):
                random.shuffle(entries)

            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # This __x_forwarded_for_ip thing is a bit ugly but requires
                # minimal changes
                if x_forwarded_for:
                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                # Per-entry playlist context, inherited by every nested
                # result through extra_info.
                # NOTE(review): with --playlist-items, 'playlist_index' is
                # the position within the *selected* entries plus the
                # playliststart offset, not the index in the original
                # playlist — confirm whether that is intended.
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: the entry may lack fields that only full
                # extraction provides, so only pre-extraction filters apply.
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            # Stamp each legacy entry with the parent's bookkeeping keys
            # before processing it like a normal result.
            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
990
991     def _build_format_filter(self, filter_spec):
992         " Returns a function to filter the formats according to the filter_spec "
993
994         OPERATORS = {
995             '<': operator.lt,
996             '<=': operator.le,
997             '>': operator.gt,
998             '>=': operator.ge,
999             '=': operator.eq,
1000             '!=': operator.ne,
1001         }
1002         operator_rex = re.compile(r'''(?x)\s*
1003             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
1004             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1005             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1006             $
1007             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1008         m = operator_rex.search(filter_spec)
1009         if m:
1010             try:
1011                 comparison_value = int(m.group('value'))
1012             except ValueError:
1013                 comparison_value = parse_filesize(m.group('value'))
1014                 if comparison_value is None:
1015                     comparison_value = parse_filesize(m.group('value') + 'B')
1016                 if comparison_value is None:
1017                     raise ValueError(
1018                         'Invalid value %r in format specification %r' % (
1019                             m.group('value'), filter_spec))
1020             op = OPERATORS[m.group('op')]
1021
1022         if not m:
1023             STR_OPERATORS = {
1024                 '=': operator.eq,
1025                 '!=': operator.ne,
1026                 '^=': lambda attr, value: attr.startswith(value),
1027                 '$=': lambda attr, value: attr.endswith(value),
1028                 '*=': lambda attr, value: value in attr,
1029             }
1030             str_operator_rex = re.compile(r'''(?x)
1031                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1032                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1033                 \s*(?P<value>[a-zA-Z0-9._-]+)
1034                 \s*$
1035                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1036             m = str_operator_rex.search(filter_spec)
1037             if m:
1038                 comparison_value = m.group('value')
1039                 op = STR_OPERATORS[m.group('op')]
1040
1041         if not m:
1042             raise ValueError('Invalid filter specification %r' % filter_spec)
1043
1044         def _filter(f):
1045             actual_value = f.get(m.group('key'))
1046             if actual_value is None:
1047                 return m.group('none_inclusive')
1048             return op(actual_value, comparison_value)
1049         return _filter
1050
1051     def build_format_selector(self, format_spec):
1052         def syntax_error(note, start):
1053             message = (
1054                 'Invalid format specification: '
1055                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1056             return SyntaxError(message)
1057
1058         PICKFIRST = 'PICKFIRST'
1059         MERGE = 'MERGE'
1060         SINGLE = 'SINGLE'
1061         GROUP = 'GROUP'
1062         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1063
1064         def _parse_filter(tokens):
1065             filter_parts = []
1066             for type, string, start, _, _ in tokens:
1067                 if type == tokenize.OP and string == ']':
1068                     return ''.join(filter_parts)
1069                 else:
1070                     filter_parts.append(string)
1071
1072         def _remove_unused_ops(tokens):
1073             # Remove operators that we don't use and join them with the surrounding strings
1074             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1075             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1076             last_string, last_start, last_end, last_line = None, None, None, None
1077             for type, string, start, end, line in tokens:
1078                 if type == tokenize.OP and string == '[':
1079                     if last_string:
1080                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1081                         last_string = None
1082                     yield type, string, start, end, line
1083                     # everything inside brackets will be handled by _parse_filter
1084                     for type, string, start, end, line in tokens:
1085                         yield type, string, start, end, line
1086                         if type == tokenize.OP and string == ']':
1087                             break
1088                 elif type == tokenize.OP and string in ALLOWED_OPS:
1089                     if last_string:
1090                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1091                         last_string = None
1092                     yield type, string, start, end, line
1093                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1094                     if not last_string:
1095                         last_string = string
1096                         last_start = start
1097                         last_end = end
1098                     else:
1099                         last_string += string
1100             if last_string:
1101                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1102
1103         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1104             selectors = []
1105             current_selector = None
1106             for type, string, start, _, _ in tokens:
1107                 # ENCODING is only defined in python 3.x
1108                 if type == getattr(tokenize, 'ENCODING', None):
1109                     continue
1110                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1111                     current_selector = FormatSelector(SINGLE, string, [])
1112                 elif type == tokenize.OP:
1113                     if string == ')':
1114                         if not inside_group:
1115                             # ')' will be handled by the parentheses group
1116                             tokens.restore_last_token()
1117                         break
1118                     elif inside_merge and string in ['/', ',']:
1119                         tokens.restore_last_token()
1120                         break
1121                     elif inside_choice and string == ',':
1122                         tokens.restore_last_token()
1123                         break
1124                     elif string == ',':
1125                         if not current_selector:
1126                             raise syntax_error('"," must follow a format selector', start)
1127                         selectors.append(current_selector)
1128                         current_selector = None
1129                     elif string == '/':
1130                         if not current_selector:
1131                             raise syntax_error('"/" must follow a format selector', start)
1132                         first_choice = current_selector
1133                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1134                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1135                     elif string == '[':
1136                         if not current_selector:
1137                             current_selector = FormatSelector(SINGLE, 'best', [])
1138                         format_filter = _parse_filter(tokens)
1139                         current_selector.filters.append(format_filter)
1140                     elif string == '(':
1141                         if current_selector:
1142                             raise syntax_error('Unexpected "("', start)
1143                         group = _parse_format_selection(tokens, inside_group=True)
1144                         current_selector = FormatSelector(GROUP, group, [])
1145                     elif string == '+':
1146                         video_selector = current_selector
1147                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1148                         if not video_selector or not audio_selector:
1149                             raise syntax_error('"+" must be between two format selectors', start)
1150                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1151                     else:
1152                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1153                 elif type == tokenize.ENDMARKER:
1154                     break
1155             if current_selector:
1156                 selectors.append(current_selector)
1157             return selectors
1158
1159         def _build_selector_function(selector):
1160             if isinstance(selector, list):
1161                 fs = [_build_selector_function(s) for s in selector]
1162
1163                 def selector_function(ctx):
1164                     for f in fs:
1165                         for format in f(ctx):
1166                             yield format
1167                 return selector_function
1168             elif selector.type == GROUP:
1169                 selector_function = _build_selector_function(selector.selector)
1170             elif selector.type == PICKFIRST:
1171                 fs = [_build_selector_function(s) for s in selector.selector]
1172
1173                 def selector_function(ctx):
1174                     for f in fs:
1175                         picked_formats = list(f(ctx))
1176                         if picked_formats:
1177                             return picked_formats
1178                     return []
1179             elif selector.type == SINGLE:
1180                 format_spec = selector.selector
1181
1182                 def selector_function(ctx):
1183                     formats = list(ctx['formats'])
1184                     if not formats:
1185                         return
1186                     if format_spec == 'all':
1187                         for f in formats:
1188                             yield f
1189                     elif format_spec in ['best', 'worst', None]:
1190                         format_idx = 0 if format_spec == 'worst' else -1
1191                         audiovideo_formats = [
1192                             f for f in formats
1193                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1194                         if audiovideo_formats:
1195                             yield audiovideo_formats[format_idx]
1196                         # for extractors with incomplete formats (audio only (soundcloud)
1197                         # or video only (imgur)) we will fallback to best/worst
1198                         # {video,audio}-only format
1199                         elif ctx['incomplete_formats']:
1200                             yield formats[format_idx]
1201                     elif format_spec == 'bestaudio':
1202                         audio_formats = [
1203                             f for f in formats
1204                             if f.get('vcodec') == 'none']
1205                         if audio_formats:
1206                             yield audio_formats[-1]
1207                     elif format_spec == 'worstaudio':
1208                         audio_formats = [
1209                             f for f in formats
1210                             if f.get('vcodec') == 'none']
1211                         if audio_formats:
1212                             yield audio_formats[0]
1213                     elif format_spec == 'bestvideo':
1214                         video_formats = [
1215                             f for f in formats
1216                             if f.get('acodec') == 'none']
1217                         if video_formats:
1218                             yield video_formats[-1]
1219                     elif format_spec == 'worstvideo':
1220                         video_formats = [
1221                             f for f in formats
1222                             if f.get('acodec') == 'none']
1223                         if video_formats:
1224                             yield video_formats[0]
1225                     else:
1226                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1227                         if format_spec in extensions:
1228                             filter_f = lambda f: f['ext'] == format_spec
1229                         else:
1230                             filter_f = lambda f: f['format_id'] == format_spec
1231                         matches = list(filter(filter_f, formats))
1232                         if matches:
1233                             yield matches[-1]
1234             elif selector.type == MERGE:
1235                 def _merge(formats_info):
1236                     format_1, format_2 = [f['format_id'] for f in formats_info]
1237                     # The first format must contain the video and the
1238                     # second the audio
1239                     if formats_info[0].get('vcodec') == 'none':
1240                         self.report_error('The first format must '
1241                                           'contain the video, try using '
1242                                           '"-f %s+%s"' % (format_2, format_1))
1243                         return
1244                     # Formats must be opposite (video+audio)
1245                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1246                         self.report_error(
1247                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1248                             % (format_1, format_2))
1249                         return
1250                     output_ext = (
1251                         formats_info[0]['ext']
1252                         if self.params.get('merge_output_format') is None
1253                         else self.params['merge_output_format'])
1254                     return {
1255                         'requested_formats': formats_info,
1256                         'format': '%s+%s' % (formats_info[0].get('format'),
1257                                              formats_info[1].get('format')),
1258                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1259                                                 formats_info[1].get('format_id')),
1260                         'width': formats_info[0].get('width'),
1261                         'height': formats_info[0].get('height'),
1262                         'resolution': formats_info[0].get('resolution'),
1263                         'fps': formats_info[0].get('fps'),
1264                         'vcodec': formats_info[0].get('vcodec'),
1265                         'vbr': formats_info[0].get('vbr'),
1266                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1267                         'acodec': formats_info[1].get('acodec'),
1268                         'abr': formats_info[1].get('abr'),
1269                         'ext': output_ext,
1270                     }
1271                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1272
1273                 def selector_function(ctx):
1274                     for pair in itertools.product(
1275                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1276                         yield _merge(pair)
1277
1278             filters = [self._build_format_filter(f) for f in selector.filters]
1279
1280             def final_selector(ctx):
1281                 ctx_copy = copy.deepcopy(ctx)
1282                 for _filter in filters:
1283                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1284                 return selector_function(ctx_copy)
1285             return final_selector
1286
1287         stream = io.BytesIO(format_spec.encode('utf-8'))
1288         try:
1289             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1290         except tokenize.TokenError:
1291             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1292
1293         class TokenIterator(object):
1294             def __init__(self, tokens):
1295                 self.tokens = tokens
1296                 self.counter = 0
1297
1298             def __iter__(self):
1299                 return self
1300
1301             def __next__(self):
1302                 if self.counter >= len(self.tokens):
1303                     raise StopIteration()
1304                 value = self.tokens[self.counter]
1305                 self.counter += 1
1306                 return value
1307
1308             next = __next__
1309
1310             def restore_last_token(self):
1311                 self.counter -= 1
1312
1313         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1314         return _build_selector_function(parsed_selector)
1315
1316     def _calc_headers(self, info_dict):
1317         res = std_headers.copy()
1318
1319         add_headers = info_dict.get('http_headers')
1320         if add_headers:
1321             res.update(add_headers)
1322
1323         cookies = self._calc_cookies(info_dict)
1324         if cookies:
1325             res['Cookie'] = cookies
1326
1327         if 'X-Forwarded-For' not in res:
1328             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1329             if x_forwarded_for_ip:
1330                 res['X-Forwarded-For'] = x_forwarded_for_ip
1331
1332         return res
1333
1334     def _calc_cookies(self, info_dict):
1335         pr = sanitized_Request(info_dict['url'])
1336         self.cookiejar.add_cookie_header(pr)
1337         return pr.get_header('Cookie')
1338
    def process_video_result(self, info_dict, download=True):
        """Normalize a single extractor result and select formats.

        Validates mandatory fields, fills in derived metadata (thumbnails,
        upload_date, display_id, chapter/season/episode titles), resolves the
        requested subtitles, sanitizes and de-duplicates the format list, then
        runs the format selector.  If download is True, each selected format
        is passed to process_info().  Returns info_dict updated with the last
        selected format (kept for backwards compatibility).  May return None
        early when only listing (thumbnails/subtitles/formats) was requested.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        if not isinstance(info_dict['id'], compat_str):
            self.report_warning('"id" field is not a string - forcing string conversion')
            info_dict['id'] = compat_str(info_dict['id'])

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Promote a lone 'thumbnail' entry to the 'thumbnails' list so the
        # rest of the code only has to deal with one representation.
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Sort worst-to-best: preference first, then dimensions, with
            # id/url as deterministic tie-breakers (missing values sort first).
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '', t.get('url')))
            for i, t in enumerate(thumbnails):
                t['url'] = sanitize_url(t['url'])
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # After the sort above, the last entry is the best thumbnail.
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
            # see http://bugs.python.org/issue1646728)
            try:
                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
            except (ValueError, OverflowError, OSError):
                pass

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle URLs and infer a missing 'ext' from each URL.
        subtitles = info_dict.get('subtitles')
        if subtitles:
            for _, subtitle in subtitles.items():
                for subtitle_format in subtitle:
                    if subtitle_format.get('url'):
                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                    if subtitle_format.get('ext') is None:
                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles,
            info_dict.get('automatic_captions'))

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        # format_id -> list of formats sharing that id (for de-duplication below)
        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            if 'url' not in format:
                raise ExtractorError('Missing "url" key in result (index %d)' % i)

            format['url'] = sanitize_url(format['url'])

            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' field if the original info_dict lists it;
            # otherwise we end up with a circular reference: the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            # Default format: prefer bestvideo+bestaudio when we can merge
            # (not streaming to stdout, not live), otherwise plain 'best'.
            req_format_list = []
            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
                    not info_dict.get('is_live')):
                merger = FFmpegMergerPP(self)
                if merger.available and merger.can_merge():
                    req_format_list.append('bestvideo+bestaudio')
            req_format_list.append('best')
            req_format = '/'.join(req_format_list)
        format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/rg3/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/rg3/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
            # all formats are audio-only
            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1543
1544     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1545         """Select the requested subtitles and their format"""
1546         available_subs = {}
1547         if normal_subtitles and self.params.get('writesubtitles'):
1548             available_subs.update(normal_subtitles)
1549         if automatic_captions and self.params.get('writeautomaticsub'):
1550             for lang, cap_info in automatic_captions.items():
1551                 if lang not in available_subs:
1552                     available_subs[lang] = cap_info
1553
1554         if (not self.params.get('writesubtitles') and not
1555                 self.params.get('writeautomaticsub') or not
1556                 available_subs):
1557             return None
1558
1559         if self.params.get('allsubtitles', False):
1560             requested_langs = available_subs.keys()
1561         else:
1562             if self.params.get('subtitleslangs', False):
1563                 requested_langs = self.params.get('subtitleslangs')
1564             elif 'en' in available_subs:
1565                 requested_langs = ['en']
1566             else:
1567                 requested_langs = [list(available_subs.keys())[0]]
1568
1569         formats_query = self.params.get('subtitlesformat', 'best')
1570         formats_preference = formats_query.split('/') if formats_query else []
1571         subs = {}
1572         for lang in requested_langs:
1573             formats = available_subs.get(lang)
1574             if formats is None:
1575                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1576                 continue
1577             for ext in formats_preference:
1578                 if ext == 'best':
1579                     f = formats[-1]
1580                     break
1581                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1582                 if matches:
1583                     f = matches[-1]
1584                     break
1585             else:
1586                 f = formats[-1]
1587                 self.report_warning(
1588                     'No subtitle format found matching "%s" for language %s, '
1589                     'using %s' % (formats_query, lang, f['ext']))
1590             subs[lang] = f
1591         return subs
1592
    def process_info(self, info_dict):
        """Process a single resolved IE result.

        Runs the per-video pipeline: max-download accounting, forced
        printings, simulate-mode early exit, writing of description /
        annotations / subtitles / info-json / thumbnails, the actual download
        (including merging separately downloaded video+audio formats), fixup
        post-processing and download-archive recording.
        """

        assert info_dict.get('_type', 'video') == 'video'

        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads >= int(max_downloads):
                raise MaxDownloadsReached()

        # Keep the full title around, then clamp 'title' for filename use
        info_dict['fulltitle'] = info_dict['title']
        if len(info_dict['title']) > 200:
            info_dict['title'] = info_dict['title'][:197] + '...'

        if 'format' not in info_dict:
            info_dict['format'] = info_dict['ext']

        # _match_entry returns a human-readable skip reason or None
        reason = self._match_entry(info_dict, incomplete=False)
        if reason is not None:
            self.to_screen('[download] ' + reason)
            return

        self._num_downloads += 1

        info_dict['_filename'] = filename = self.prepare_filename(info_dict)

        # Forced printings
        if self.params.get('forcetitle', False):
            self.to_stdout(info_dict['fulltitle'])
        if self.params.get('forceid', False):
            self.to_stdout(info_dict['id'])
        if self.params.get('forceurl', False):
            if info_dict.get('requested_formats') is not None:
                for f in info_dict['requested_formats']:
                    self.to_stdout(f['url'] + f.get('play_path', ''))
            else:
                # For RTMP URLs, also include the playpath
                self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
        if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
            self.to_stdout(info_dict['thumbnail'])
        if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
            self.to_stdout(info_dict['description'])
        if self.params.get('forcefilename', False) and filename is not None:
            self.to_stdout(filename)
        if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
            self.to_stdout(formatSeconds(info_dict['duration']))
        if self.params.get('forceformat', False):
            self.to_stdout(info_dict['format'])
        if self.params.get('forcejson', False):
            self.to_stdout(json.dumps(info_dict))

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        if filename is None:
            return

        # Make sure the target directory exists before writing anything
        try:
            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
            if dn and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError) as err:
            self.report_error('unable to create directory ' + error_to_compat_str(err))
            return

        if self.params.get('writedescription', False):
            descfn = replace_extension(filename, 'description', info_dict.get('ext'))
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
                self.to_screen('[info] Video description is already present')
            elif info_dict.get('description') is None:
                self.report_warning('There\'s no description to write.')
            else:
                try:
                    self.to_screen('[info] Writing video description to: ' + descfn)
                    with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                        descfile.write(info_dict['description'])
                except (OSError, IOError):
                    self.report_error('Cannot write description file ' + descfn)
                    return

        if self.params.get('writeannotations', False):
            annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
                self.to_screen('[info] Video annotations are already present')
            else:
                try:
                    self.to_screen('[info] Writing video annotations to: ' + annofn)
                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
                        annofile.write(info_dict['annotations'])
                except (KeyError, TypeError):
                    # Missing or non-string 'annotations' entry
                    self.report_warning('There are no annotations to write.')
                except (OSError, IOError):
                    self.report_error('Cannot write annotations file: ' + annofn)
                    return

        subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                       self.params.get('writeautomaticsub')])

        if subtitles_are_requested and info_dict.get('requested_subtitles'):
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            subtitles = info_dict['requested_subtitles']
            ie = self.get_info_extractor(info_dict['extractor_key'])
            for sub_lang, sub_info in subtitles.items():
                sub_format = sub_info['ext']
                sub_filename = subtitles_filename(filename, sub_lang, sub_format)
                if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
                    self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
                else:
                    self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
                    if sub_info.get('data') is not None:
                        # Subtitle data was already fetched by the extractor
                        try:
                            # Use newline='' to prevent conversion of newline characters
                            # See https://github.com/rg3/youtube-dl/issues/10268
                            with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
                                subfile.write(sub_info['data'])
                        except (OSError, IOError):
                            self.report_error('Cannot write subtitles file ' + sub_filename)
                            return
                    else:
                        # Download the subtitle ourselves and write the raw
                        # bytes as-is (no decoding/newline translation)
                        try:
                            sub_data = ie._request_webpage(
                                sub_info['url'], info_dict['id'], note=False).read()
                            with io.open(encodeFilename(sub_filename), 'wb') as subfile:
                                subfile.write(sub_data)
                        except (ExtractorError, IOError, OSError, ValueError) as err:
                            self.report_warning('Unable to download subtitle for "%s": %s' %
                                                (sub_lang, error_to_compat_str(err)))
                            continue

        if self.params.get('writeinfojson', False):
            infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
                self.to_screen('[info] Video description metadata is already present')
            else:
                self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
                try:
                    write_json_file(self.filter_requested_info(info_dict), infofn)
                except (OSError, IOError):
                    self.report_error('Cannot write metadata to JSON file ' + infofn)
                    return

        self._write_thumbnails(info_dict, filename)

        if not self.params.get('skip_download', False):
            try:
                # Helper: pick a suitable downloader for 'info', wire up the
                # progress hooks and run it
                def dl(name, info):
                    fd = get_suitable_downloader(info, self.params)(self, self.params)
                    for ph in self._progress_hooks:
                        fd.add_progress_hook(ph)
                    if self.params.get('verbose'):
                        self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
                    return fd.download(name, info)

                if info_dict.get('requested_formats') is not None:
                    # Separate video and audio streams were requested; download
                    # each to its own file and (if possible) merge afterwards
                    downloaded = []
                    success = True
                    merger = FFmpegMergerPP(self)
                    if not merger.available:
                        postprocessors = []
                        self.report_warning('You have requested multiple '
                                            'formats but ffmpeg or avconv are not installed.'
                                            ' The formats won\'t be merged.')
                    else:
                        postprocessors = [merger]

                    def compatible_formats(formats):
                        video, audio = formats
                        # Check extension
                        # NOTE(review): the two names look swapped (video_ext is
                        # assigned audio's ext and vice versa); harmless for the
                        # symmetric membership test below, but worth confirming.
                        video_ext, audio_ext = audio.get('ext'), video.get('ext')
                        if video_ext and audio_ext:
                            # NOTE(review): ('webm') is a plain string, not a
                            # 1-tuple, so `ext in exts` is a substring test
                            # here — e.g. 'web' would also match; confirm
                            # whether ('webm',) was intended.
                            COMPATIBLE_EXTS = (
                                ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
                                ('webm')
                            )
                            for exts in COMPATIBLE_EXTS:
                                if video_ext in exts and audio_ext in exts:
                                    return True
                        # TODO: Check acodec/vcodec
                        return False

                    filename_real_ext = os.path.splitext(filename)[1][1:]
                    filename_wo_ext = (
                        os.path.splitext(filename)[0]
                        if filename_real_ext == info_dict['ext']
                        else filename)
                    requested_formats = info_dict['requested_formats']
                    if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
                        info_dict['ext'] = 'mkv'
                        self.report_warning(
                            'Requested formats are incompatible for merge and will be merged into mkv.')
                    # Ensure filename always has a correct extension for successful merge
                    filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
                    if os.path.exists(encodeFilename(filename)):
                        self.to_screen(
                            '[download] %s has already been downloaded and '
                            'merged' % filename)
                    else:
                        # Download every requested format into an intermediate
                        # 'f<format_id>'-prefixed file; the merger consumes
                        # '__files_to_merge' afterwards
                        for f in requested_formats:
                            new_info = dict(info_dict)
                            new_info.update(f)
                            fname = self.prepare_filename(new_info)
                            fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
                            downloaded.append(fname)
                            partial_success = dl(fname, new_info)
                            success = success and partial_success
                        info_dict['__postprocessors'] = postprocessors
                        info_dict['__files_to_merge'] = downloaded
                else:
                    # Just a single file
                    success = dl(filename, info_dict)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self.report_error('unable to download video data: %s' % error_to_compat_str(err))
                return
            except (OSError, IOError) as err:
                raise UnavailableVideoError(err)
            except (ContentTooShortError, ) as err:
                self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

            if success and filename != '-':
                # Fixup content
                fixup_policy = self.params.get('fixup')
                if fixup_policy is None:
                    fixup_policy = 'detect_or_warn'

                INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'

                # Fixup 1: non-uniform pixel aspect ratio
                stretched_ratio = info_dict.get('stretched_ratio')
                if stretched_ratio is not None and stretched_ratio != 1:
                    if fixup_policy == 'warn':
                        self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
                            info_dict['id'], stretched_ratio))
                    elif fixup_policy == 'detect_or_warn':
                        stretched_pp = FFmpegFixupStretchedPP(self)
                        if stretched_pp.available:
                            info_dict.setdefault('__postprocessors', [])
                            info_dict['__postprocessors'].append(stretched_pp)
                        else:
                            self.report_warning(
                                '%s: Non-uniform pixel ratio (%s). %s'
                                % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
                    else:
                        assert fixup_policy in ('ignore', 'never')

                # Fixup 2: DASH m4a container
                if (info_dict.get('requested_formats') is None and
                        info_dict.get('container') == 'm4a_dash'):
                    if fixup_policy == 'warn':
                        self.report_warning(
                            '%s: writing DASH m4a. '
                            'Only some players support this container.'
                            % info_dict['id'])
                    elif fixup_policy == 'detect_or_warn':
                        fixup_pp = FFmpegFixupM4aPP(self)
                        if fixup_pp.available:
                            info_dict.setdefault('__postprocessors', [])
                            info_dict['__postprocessors'].append(fixup_pp)
                        else:
                            self.report_warning(
                                '%s: writing DASH m4a. '
                                'Only some players support this container. %s'
                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                    else:
                        assert fixup_policy in ('ignore', 'never')

                # Fixup 3: malformed AAC bitstream from HLS downloads
                # (note: 'and' binds tighter than 'or' in this condition)
                if (info_dict.get('protocol') == 'm3u8_native' or
                        info_dict.get('protocol') == 'm3u8' and
                        self.params.get('hls_prefer_native')):
                    if fixup_policy == 'warn':
                        self.report_warning('%s: malformated aac bitstream.' % (
                            info_dict['id']))
                    elif fixup_policy == 'detect_or_warn':
                        fixup_pp = FFmpegFixupM3u8PP(self)
                        if fixup_pp.available:
                            info_dict.setdefault('__postprocessors', [])
                            info_dict['__postprocessors'].append(fixup_pp)
                        else:
                            self.report_warning(
                                '%s: malformated aac bitstream. %s'
                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                    else:
                        assert fixup_policy in ('ignore', 'never')

                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError) as err:
                    self.report_error('postprocessing: %s' % str(err))
                    return
                self.record_download_archive(info_dict)
1883
1884     def download(self, url_list):
1885         """Download a given list of URLs."""
1886         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1887         if (len(url_list) > 1 and
1888                 outtmpl != '-' and
1889                 '%' not in outtmpl and
1890                 self.params.get('max_downloads') != 1):
1891             raise SameFileError(outtmpl)
1892
1893         for url in url_list:
1894             try:
1895                 # It also downloads the videos
1896                 res = self.extract_info(
1897                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1898             except UnavailableVideoError:
1899                 self.report_error('unable to download video')
1900             except MaxDownloadsReached:
1901                 self.to_screen('[info] Maximum number of downloaded files reached.')
1902                 raise
1903             else:
1904                 if self.params.get('dump_single_json', False):
1905                     self.to_stdout(json.dumps(res))
1906
1907         return self._download_retcode
1908
1909     def download_with_info_file(self, info_filename):
1910         with contextlib.closing(fileinput.FileInput(
1911                 [info_filename], mode='r',
1912                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1913             # FileInput doesn't have a read method, we can't call json.load
1914             info = self.filter_requested_info(json.loads('\n'.join(f)))
1915         try:
1916             self.process_ie_result(info, download=True)
1917         except DownloadError:
1918             webpage_url = info.get('webpage_url')
1919             if webpage_url is not None:
1920                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1921                 return self.download([webpage_url])
1922             else:
1923                 raise
1924         return self._download_retcode
1925
1926     @staticmethod
1927     def filter_requested_info(info_dict):
1928         return dict(
1929             (k, v) for k, v in info_dict.items()
1930             if k not in ['requested_formats', 'requested_subtitles'])
1931
1932     def post_process(self, filename, ie_info):
1933         """Run all the postprocessors on the given file."""
1934         info = dict(ie_info)
1935         info['filepath'] = filename
1936         pps_chain = []
1937         if ie_info.get('__postprocessors') is not None:
1938             pps_chain.extend(ie_info['__postprocessors'])
1939         pps_chain.extend(self._pps)
1940         for pp in pps_chain:
1941             files_to_delete = []
1942             try:
1943                 files_to_delete, info = pp.run(info)
1944             except PostProcessingError as e:
1945                 self.report_error(e.msg)
1946             if files_to_delete and not self.params.get('keepvideo', False):
1947                 for old_filename in files_to_delete:
1948                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1949                     try:
1950                         os.remove(encodeFilename(old_filename))
1951                     except (IOError, OSError):
1952                         self.report_warning('Unable to remove downloaded original file')
1953
1954     def _make_archive_id(self, info_dict):
1955         # Future-proof against any change in case
1956         # and backwards compatibility with prior versions
1957         extractor = info_dict.get('extractor_key')
1958         if extractor is None:
1959             if 'id' in info_dict:
1960                 extractor = info_dict.get('ie_key')  # key in a playlist
1961         if extractor is None:
1962             return None  # Incomplete video information
1963         return extractor.lower() + ' ' + info_dict['id']
1964
1965     def in_download_archive(self, info_dict):
1966         fn = self.params.get('download_archive')
1967         if fn is None:
1968             return False
1969
1970         vid_id = self._make_archive_id(info_dict)
1971         if vid_id is None:
1972             return False  # Incomplete video information
1973
1974         try:
1975             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1976                 for line in archive_file:
1977                     if line.strip() == vid_id:
1978                         return True
1979         except IOError as ioe:
1980             if ioe.errno != errno.ENOENT:
1981                 raise
1982         return False
1983
1984     def record_download_archive(self, info_dict):
1985         fn = self.params.get('download_archive')
1986         if fn is None:
1987             return
1988         vid_id = self._make_archive_id(info_dict)
1989         assert vid_id
1990         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1991             archive_file.write(vid_id + '\n')
1992
1993     @staticmethod
1994     def format_resolution(format, default='unknown'):
1995         if format.get('vcodec') == 'none':
1996             return 'audio only'
1997         if format.get('resolution') is not None:
1998             return format['resolution']
1999         if format.get('height') is not None:
2000             if format.get('width') is not None:
2001                 res = '%sx%s' % (format['width'], format['height'])
2002             else:
2003                 res = '%sp' % format['height']
2004         elif format.get('width') is not None:
2005             res = '%dx?' % format['width']
2006         else:
2007             res = default
2008         return res
2009
    def _format_note(self, fdict):
        """Build a short human-readable note for one format dict.

        Concatenates language, bitrates, container, codecs, fps, sample
        rate and filesize into a single string; the exact separators
        depend on which fields were appended before (order matters).
        """
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            # f4f/f4m fragments cannot be downloaded directly
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None and
                fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            if fdict.get('vbr') is not None:
                # '@' glues the codec name to the video bitrate appended below
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # Bitrates are known but the video codec name is not
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            # '~' marks the size as an estimate
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
2065
2066     def list_formats(self, info_dict):
2067         formats = info_dict.get('formats', [info_dict])
2068         table = [
2069             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2070             for f in formats
2071             if f.get('preference') is None or f['preference'] >= -1000]
2072         if len(formats) > 1:
2073             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2074
2075         header_line = ['format code', 'extension', 'resolution', 'note']
2076         self.to_screen(
2077             '[info] Available formats for %s:\n%s' %
2078             (info_dict['id'], render_table(header_line, table)))
2079
2080     def list_thumbnails(self, info_dict):
2081         thumbnails = info_dict.get('thumbnails')
2082         if not thumbnails:
2083             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2084             return
2085
2086         self.to_screen(
2087             '[info] Thumbnails for %s:' % info_dict['id'])
2088         self.to_screen(render_table(
2089             ['ID', 'width', 'height', 'URL'],
2090             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2091
2092     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2093         if not subtitles:
2094             self.to_screen('%s has no %s' % (video_id, name))
2095             return
2096         self.to_screen(
2097             'Available %s for %s:' % (name, video_id))
2098         self.to_screen(render_table(
2099             ['Language', 'formats'],
2100             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2101                 for lang, formats in subtitles.items()]))
2102
2103     def urlopen(self, req):
2104         """ Start an HTTP download """
2105         if isinstance(req, compat_basestring):
2106             req = sanitized_Request(req)
2107         return self._opener.open(req, timeout=self._socket_timeout)
2108
    def print_debug_header(self):
        """Write debug information (encodings, versions, proxies) to the output.

        No-op unless the 'verbose' option is set.
        """
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        # Best effort: report the git revision when running from a checkout;
        # any failure (no git, no repo) is deliberately ignored
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            try:
                # Python 2 only; clears the (ignored) exception state
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Collect the proxies configured on any of the opener's handlers
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # Opt-in only: contacts yt-dl.org for IP and version check
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2175
    def _setup_opener(self):
        """Build the urllib opener used for all HTTP(S)/data requests
        (cookies, proxies, debug traffic, disabled file://) and store it
        on self._opener; also sets self.cookiejar and self._socket_timeout.
        """
        timeout_val = self.params.get('socket_timeout')
        # Default timeout is 600 seconds when no 'socket_timeout' option is set
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            opts_cookiefile = expand_path(opts_cookiefile)
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Only load when readable; a missing cookie file is acceptable
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An explicitly empty --proxy disables proxying entirely
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            # Fall back to the environment's proxy configuration
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2228
2229     def encode(self, s):
2230         if isinstance(s, bytes):
2231             return s  # Already encoded
2232
2233         try:
2234             return s.encode(self.get_encoding())
2235         except UnicodeEncodeError as err:
2236             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2237             raise
2238
2239     def get_encoding(self):
2240         encoding = self.params.get('encoding')
2241         if encoding is None:
2242             encoding = preferredencoding()
2243         return encoding
2244
    def _write_thumbnails(self, info_dict, filename):
        """Download the video's thumbnail(s) next to the media file.

        'writethumbnail' fetches only the last list entry (presumably the
        best one — ordering is established by the extractors, confirm there);
        'write_all_thumbnails' fetches every one. Each written path is
        recorded in the thumbnail dict under 'filename'.
        """
        if self.params.get('writethumbnail', False):
            thumbnails = info_dict.get('thumbnails')
            if thumbnails:
                # Keep only the last entry of the list
                thumbnails = [thumbnails[-1]]
        elif self.params.get('write_all_thumbnails', False):
            thumbnails = info_dict.get('thumbnails')
        else:
            return

        if not thumbnails:
            # No thumbnails present, so return immediately
            return

        for t in thumbnails:
            thumb_ext = determine_ext(t['url'], 'jpg')
            # Disambiguate names/messages only when writing several thumbnails
            suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
            thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
            t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext

            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                self.to_screen('[%s] %s: Thumbnail %sis already present' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
            else:
                self.to_screen('[%s] %s: Downloading thumbnail %s...' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
                try:
                    uf = self.urlopen(t['url'])
                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                    self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
                                   (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    # Best effort: a failed thumbnail must not abort the download
                    self.report_warning('Unable to download thumbnail "%s": %s' %
                                        (t['url'], error_to_compat_str(err)))