[YoutubeDL] Rewrite outtmpl for playlist_index and autonumber for backward compatibility
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from .compat import (
30     compat_basestring,
31     compat_cookiejar,
32     compat_expanduser,
33     compat_get_terminal_size,
34     compat_http_client,
35     compat_kwargs,
36     compat_numeric_types,
37     compat_os_name,
38     compat_str,
39     compat_tokenize_tokenize,
40     compat_urllib_error,
41     compat_urllib_request,
42     compat_urllib_request_DataHandler,
43 )
44 from .utils import (
45     age_restricted,
46     args_to_str,
47     ContentTooShortError,
48     date_from_str,
49     DateRange,
50     DEFAULT_OUTTMPL,
51     determine_ext,
52     determine_protocol,
53     DownloadError,
54     encode_compat_str,
55     encodeFilename,
56     error_to_compat_str,
57     ExtractorError,
58     format_bytes,
59     formatSeconds,
60     GeoRestrictedError,
61     ISO3166Utils,
62     locked_file,
63     make_HTTPS_handler,
64     MaxDownloadsReached,
65     PagedList,
66     parse_filesize,
67     PerRequestProxyHandler,
68     platform_name,
69     PostProcessingError,
70     preferredencoding,
71     prepend_extension,
72     register_socks_protocols,
73     render_table,
74     replace_extension,
75     SameFileError,
76     sanitize_filename,
77     sanitize_path,
78     sanitize_url,
79     sanitized_Request,
80     std_headers,
81     subtitles_filename,
82     UnavailableVideoError,
83     url_basename,
84     version_tuple,
85     write_json_file,
86     write_string,
87     YoutubeDLCookieProcessor,
88     YoutubeDLHandler,
89 )
90 from .cache import Cache
91 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
92 from .downloader import get_suitable_downloader
93 from .downloader.rtmp import rtmpdump_version
94 from .postprocessor import (
95     FFmpegFixupM3u8PP,
96     FFmpegFixupM4aPP,
97     FFmpegFixupStretchedPP,
98     FFmpegMergerPP,
99     FFmpegPostProcessor,
100     get_postprocessor,
101 )
102 from .version import __version__
103
104 if compat_os_name == 'nt':
105     import ctypes
106
107
108 class YoutubeDL(object):
109     """YoutubeDL class.
110
111     YoutubeDL objects are the ones responsible of downloading the
112     actual video file and writing it to disk if the user has requested
113     it, among some other tasks. In most cases there should be one per
114     program. As, given a video URL, the downloader doesn't know how to
115     extract all the needed information, task that InfoExtractors do, it
116     has to pass the URL to one of them.
117
118     For this, YoutubeDL objects have a method that allows
119     InfoExtractors to be registered in a given order. When it is passed
120     a URL, the YoutubeDL object handles it to the first InfoExtractor it
121     finds that reports being able to handle it. The InfoExtractor extracts
122     all the information about the video or videos the URL refers to, and
123     YoutubeDL process the extracted information, possibly using a File
124     Downloader to download the video.
125
126     YoutubeDL objects accept a lot of parameters. In order not to saturate
127     the object constructor with arguments, it receives a dictionary of
128     options instead. These options are available through the params
129     attribute for the InfoExtractors to use. The YoutubeDL also
130     registers itself as the downloader in charge for the InfoExtractors
131     that are added to it, so this is a "mutual registration".
132
133     Available options:
134
135     username:          Username for authentication purposes.
136     password:          Password for authentication purposes.
137     videopassword:     Password for accessing a video.
138     ap_mso:            Adobe Pass multiple-system operator identifier.
139     ap_username:       Multiple-system operator account username.
140     ap_password:       Multiple-system operator account password.
141     usenetrc:          Use netrc for authentication instead.
142     verbose:           Print additional info to stdout.
143     quiet:             Do not print messages to stdout.
144     no_warnings:       Do not print out anything for warnings.
145     forceurl:          Force printing final URL.
146     forcetitle:        Force printing title.
147     forceid:           Force printing ID.
148     forcethumbnail:    Force printing thumbnail URL.
149     forcedescription:  Force printing description.
150     forcefilename:     Force printing final filename.
151     forceduration:     Force printing duration.
152     forcejson:         Force printing info_dict as JSON.
153     dump_single_json:  Force printing the info_dict of the whole playlist
154                        (or video) as a single JSON line.
155     simulate:          Do not download the video files.
156     format:            Video format code. See options.py for more information.
157     outtmpl:           Template for output names.
158     restrictfilenames: Do not allow "&" and spaces in file names
159     ignoreerrors:      Do not stop on download errors.
160     force_generic_extractor: Force downloader to use the generic extractor
161     nooverwrites:      Prevent overwriting files.
162     playliststart:     Playlist item to start at.
163     playlistend:       Playlist item to end at.
164     playlist_items:    Specific indices of playlist to download.
165     playlistreverse:   Download playlist items in reverse order.
166     playlistrandom:    Download playlist items in random order.
167     matchtitle:        Download only matching titles.
168     rejecttitle:       Reject downloads for matching titles.
169     logger:            Log messages to a logging.Logger instance.
170     logtostderr:       Log messages to stderr instead of stdout.
171     writedescription:  Write the video description to a .description file
172     writeinfojson:     Write the video description to a .info.json file
173     writeannotations:  Write the video annotations to a .annotations.xml file
174     writethumbnail:    Write the thumbnail image to a file
175     write_all_thumbnails:  Write all thumbnail formats to files
176     writesubtitles:    Write the video subtitles to a file
177     writeautomaticsub: Write the automatically generated subtitles to a file
178     allsubtitles:      Downloads all the subtitles of the video
179                        (requires writesubtitles or writeautomaticsub)
180     listsubtitles:     Lists all available subtitles for the video
181     subtitlesformat:   The format code for subtitles
182     subtitleslangs:    List of languages of the subtitles to download
183     keepvideo:         Keep the video file after post-processing
184     daterange:         A DateRange object, download only if the upload_date is in the range.
185     skip_download:     Skip the actual download of the video file
186     cachedir:          Location of the cache files in the filesystem.
187                        False to disable filesystem cache.
188     noplaylist:        Download single video instead of a playlist if in doubt.
189     age_limit:         An integer representing the user's age in years.
190                        Unsuitable videos for the given age are skipped.
191     min_views:         An integer representing the minimum view count the video
192                        must have in order to not be skipped.
193                        Videos without view count information are always
194                        downloaded. None for no limit.
195     max_views:         An integer representing the maximum view count.
196                        Videos that are more popular than that are not
197                        downloaded.
198                        Videos without view count information are always
199                        downloaded. None for no limit.
200     download_archive:  File name of a file where all downloads are recorded.
201                        Videos already present in the file are not downloaded
202                        again.
203     cookiefile:        File name where cookies should be read from and dumped to.
204     nocheckcertificate:Do not verify SSL certificates
205     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
206                        At the moment, this is only supported by YouTube.
207     proxy:             URL of the proxy server to use
208     geo_verification_proxy:  URL of the proxy to use for IP address verification
209                        on geo-restricted sites. (Experimental)
210     socket_timeout:    Time to wait for unresponsive hosts, in seconds
211     bidi_workaround:   Work around buggy terminals without bidirectional text
212                        support, using fribidi
213     debug_printtraffic:Print out sent and received HTTP traffic
214     include_ads:       Download ads as well
215     default_search:    Prepend this string if an input url is not valid.
216                        'auto' for elaborate guessing
217     encoding:          Use this encoding instead of the system-specified.
218     extract_flat:      Do not resolve URLs, return the immediate result.
219                        Pass in 'in_playlist' to only show this behavior for
220                        playlist items.
221     postprocessors:    A list of dictionaries, each with an entry
222                        * key:  The name of the postprocessor. See
223                                youtube_dl/postprocessor/__init__.py for a list.
224                        as well as any further keyword arguments for the
225                        postprocessor.
226     progress_hooks:    A list of functions that get called on download
227                        progress, with a dictionary with the entries
228                        * status: One of "downloading", "error", or "finished".
229                                  Check this first and ignore unknown values.
230
231                        If status is one of "downloading", or "finished", the
232                        following properties may also be present:
233                        * filename: The final filename (always present)
234                        * tmpfilename: The filename we're currently writing to
235                        * downloaded_bytes: Bytes on disk
236                        * total_bytes: Size of the whole file, None if unknown
237                        * total_bytes_estimate: Guess of the eventual file size,
238                                                None if unavailable.
239                        * elapsed: The number of seconds since download started.
240                        * eta: The estimated time in seconds, None if unknown
241                        * speed: The download speed in bytes/second, None if
242                                 unknown
243                        * fragment_index: The counter of the currently
244                                          downloaded video fragment.
245                        * fragment_count: The number of fragments (= individual
246                                          files that will be merged)
247
248                        Progress hooks are guaranteed to be called at least once
249                        (with status "finished") if the download is successful.
250     merge_output_format: Extension to use when merging formats.
251     fixup:             Automatically correct known faults of the file.
252                        One of:
253                        - "never": do nothing
254                        - "warn": only emit a warning
255                        - "detect_or_warn": check whether we can do anything
256                                            about it, warn otherwise (default)
257     source_address:    (Experimental) Client-side IP address to bind to.
258     call_home:         Boolean, true iff we are allowed to contact the
259                        youtube-dl servers for debugging.
260     sleep_interval:    Number of seconds to sleep before each download when
261                        used alone or a lower bound of a range for randomized
262                        sleep before each download (minimum possible number
263                        of seconds to sleep) when used along with
264                        max_sleep_interval.
265     max_sleep_interval:Upper bound of a range for randomized sleep before each
266                        download (maximum possible number of seconds to sleep).
267                        Must only be used along with sleep_interval.
268                        Actual sleep time will be a random float from range
269                        [sleep_interval; max_sleep_interval].
270     listformats:       Print an overview of available video formats and exit.
271     list_thumbnails:   Print a table of all thumbnails and exit.
272     match_filter:      A function that gets called with the info_dict of
273                        every video.
274                        If it returns a message, the video is ignored.
275                        If it returns None, the video is downloaded.
276                        match_filter_func in utils.py is one example for this.
277     no_color:          Do not emit color codes in output.
278     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
279                        HTTP header (experimental)
280     geo_bypass_country:
281                        Two-letter ISO 3166-2 country code that will be used for
282                        explicit geographic restriction bypassing via faking
283                        X-Forwarded-For HTTP header (experimental)
284
285     The following options determine which downloader is picked:
286     external_downloader: Executable of the external downloader to call.
287                        None or unset for standard (built-in) downloader.
288     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
289                        if True, otherwise use ffmpeg/avconv if False, otherwise
290                        use downloader suggested by extractor if None.
291
292     The following parameters are not used by YoutubeDL itself, they are used by
293     the downloader (see youtube_dl/downloader/common.py):
294     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
295     noresizebuffer, retries, continuedl, noprogress, consoletitle,
296     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
297
298     The following options are used by the post processors:
299     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
300                        otherwise prefer avconv.
301     postprocessor_args: A list of additional command-line arguments for the
302                         postprocessor.
303     """
304
305     params = None
306     _ies = []
307     _pps = []
308     _download_retcode = None
309     _num_downloads = None
310     _screen_file = None
311
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params:    dict of options (see the class docstring); merged over
                   a small set of defaults.
        auto_init: when True, print the debug header and register all
                   default info extractors immediately.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Index with the boolean: False -> stdout, True -> stderr.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        # Map the deprecated cn_verification_proxy option onto its
        # replacement unless the user already set the new one.
        if self.params.get('cn_verification_proxy') is not None:
            self.report_warning('--cn-verification-proxy is deprecated. Use --geo-verification-proxy instead.')
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        if params.get('bidi_workaround', False):
            try:
                import pty
                # Spawn an external bidi filter (bidiv, falling back to
                # fribidi) behind a pty; output is read back through
                # self._output_channel in _bidi_workaround().
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.version_info >= (3,) and sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # On Python 3, the Unicode filesystem API will throw errors (#1474)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate user-requested postprocessors; every key of the
        # dict except 'key' is forwarded as a constructor argument.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
395
396     def warn_if_short_id(self, argv):
397         # short YouTube ID starting with dash?
398         idxs = [
399             i for i, a in enumerate(argv)
400             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
401         if idxs:
402             correct_argv = (
403                 ['youtube-dl'] +
404                 [a for i, a in enumerate(argv) if i not in idxs] +
405                 ['--'] + [argv[i] for i in idxs]
406             )
407             self.report_warning(
408                 'Long argument string detected. '
409                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
410                 args_to_str(correct_argv))
411
412     def add_info_extractor(self, ie):
413         """Add an InfoExtractor object to the end of the list."""
414         self._ies.append(ie)
415         if not isinstance(ie, type):
416             self._ies_instances[ie.ie_key()] = ie
417             ie.set_downloader(self)
418
419     def get_info_extractor(self, ie_key):
420         """
421         Get an instance of an IE with name ie_key, it will try to get one from
422         the _ies list, if there's no instance it will create a new one and add
423         it to the extractor list.
424         """
425         ie = self._ies_instances.get(ie_key)
426         if ie is None:
427             ie = get_info_extractor(ie_key)()
428             self.add_info_extractor(ie)
429         return ie
430
431     def add_default_info_extractors(self):
432         """
433         Add the InfoExtractors returned by gen_extractors to the end of the list
434         """
435         for ie in gen_extractor_classes():
436             self.add_info_extractor(ie)
437
438     def add_post_processor(self, pp):
439         """Add a PostProcessor object to the end of the chain."""
440         self._pps.append(pp)
441         pp.set_downloader(self)
442
443     def add_progress_hook(self, ph):
444         """Add the progress hook (currently only for the file downloader)"""
445         self._progress_hooks.append(ph)
446
447     def _bidi_workaround(self, message):
448         if not hasattr(self, '_output_channel'):
449             return message
450
451         assert hasattr(self, '_output_process')
452         assert isinstance(message, compat_str)
453         line_count = message.count('\n') + 1
454         self._output_process.stdin.write((message + '\n').encode('utf-8'))
455         self._output_process.stdin.flush()
456         res = ''.join(self._output_channel.readline().decode('utf-8')
457                       for _ in range(line_count))
458         return res[:-len('\n')]
459
460     def to_screen(self, message, skip_eol=False):
461         """Print message to stdout if not in quiet mode."""
462         return self.to_stdout(message, skip_eol, check_quiet=True)
463
464     def _write_string(self, s, out=None):
465         write_string(s, out=out, encoding=self.params.get('encoding'))
466
467     def to_stdout(self, message, skip_eol=False, check_quiet=False):
468         """Print message to stdout if not in quiet mode."""
469         if self.params.get('logger'):
470             self.params['logger'].debug(message)
471         elif not check_quiet or not self.params.get('quiet', False):
472             message = self._bidi_workaround(message)
473             terminator = ['\n', ''][skip_eol]
474             output = message + terminator
475
476             self._write_string(output, self._screen_file)
477
478     def to_stderr(self, message):
479         """Print message to stderr."""
480         assert isinstance(message, compat_str)
481         if self.params.get('logger'):
482             self.params['logger'].error(message)
483         else:
484             message = self._bidi_workaround(message)
485             output = message + '\n'
486             self._write_string(output, self._err_file)
487
488     def to_console_title(self, message):
489         if not self.params.get('consoletitle', False):
490             return
491         if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
492             # c_wchar_p() might not be necessary if `message` is
493             # already of type unicode()
494             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
495         elif 'TERM' in os.environ:
496             self._write_string('\033]0;%s\007' % message, self._screen_file)
497
498     def save_console_title(self):
499         if not self.params.get('consoletitle', False):
500             return
501         if 'TERM' in os.environ:
502             # Save the title on stack
503             self._write_string('\033[22;0t', self._screen_file)
504
505     def restore_console_title(self):
506         if not self.params.get('consoletitle', False):
507             return
508         if 'TERM' in os.environ:
509             # Restore the title from stack
510             self._write_string('\033[23;0t', self._screen_file)
511
    def __enter__(self):
        # Context-manager entry: remember the terminal title so that
        # __exit__ can restore it; the downloader itself is the context.
        self.save_console_title()
        return self
515
516     def __exit__(self, *args):
517         self.restore_console_title()
518
519         if self.params.get('cookiefile') is not None:
520             self.cookiejar.save()
521
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Wrapped extractor errors carry the original traceback
                    # in an exc_info attribute; show that one first.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: dump the current stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Prefer the wrapped exception's exc_info when available so
            # DownloadError points at the root cause.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1
551
552     def report_warning(self, message):
553         '''
554         Print the message to stderr, it will be prefixed with 'WARNING:'
555         If stderr is a tty file the 'WARNING:' will be colored
556         '''
557         if self.params.get('logger') is not None:
558             self.params['logger'].warning(message)
559         else:
560             if self.params.get('no_warnings'):
561                 return
562             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
563                 _msg_header = '\033[0;33mWARNING:\033[0m'
564             else:
565                 _msg_header = 'WARNING:'
566             warning_message = '%s %s' % (_msg_header, message)
567             self.to_stderr(warning_message)
568
569     def report_error(self, message, tb=None):
570         '''
571         Do the same as trouble, but prefixes the message with 'ERROR:', colored
572         in red if stderr is a tty file.
573         '''
574         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
575             _msg_header = '\033[0;31mERROR:\033[0m'
576         else:
577             _msg_header = 'ERROR:'
578         error_message = '%s %s' % (_msg_header, message)
579         self.trouble(error_message, tb)
580
581     def report_file_already_downloaded(self, file_name):
582         """Report file has already been fully downloaded."""
583         try:
584             self.to_screen('[download] %s has already been downloaded' % file_name)
585         except UnicodeEncodeError:
586             self.to_screen('[download] The file has already been downloaded')
587
588     def prepare_filename(self, info_dict):
589         """Generate the output filename."""
590         try:
591             template_dict = dict(info_dict)
592
593             template_dict['epoch'] = int(time.time())
594             autonumber_size = self.params.get('autonumber_size')
595             if autonumber_size is None:
596                 autonumber_size = 5
597             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
598             if template_dict.get('resolution') is None:
599                 if template_dict.get('width') and template_dict.get('height'):
600                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
601                 elif template_dict.get('height'):
602                     template_dict['resolution'] = '%sp' % template_dict['height']
603                 elif template_dict.get('width'):
604                     template_dict['resolution'] = '%dx?' % template_dict['width']
605
606             sanitize = lambda k, v: sanitize_filename(
607                 compat_str(v),
608                 restricted=self.params.get('restrictfilenames'),
609                 is_id=(k == 'id'))
610             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
611                                  for k, v in template_dict.items()
612                                  if v is not None and not isinstance(v, (list, tuple, dict)))
613             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
614
615             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
616
617             # For fields playlist_index and autonumber convert all occurrences
618             # of %(field)s to %(field)0Nd for backward compatibility
619             field_size_compat_map = {
620                 'playlist_index': len(str(template_dict['n_entries'])),
621                 'autonumber': autonumber_size,
622             }
623             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
624             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
625             if mobj:
626                 outtmpl = re.sub(
627                     FIELD_SIZE_COMPAT_RE,
628                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
629                     outtmpl)
630
631             NUMERIC_FIELDS = set((
632                 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
633                 'upload_year', 'upload_month', 'upload_day',
634                 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
635                 'average_rating', 'comment_count', 'age_limit',
636                 'start_time', 'end_time',
637                 'chapter_number', 'season_number', 'episode_number',
638                 'playlist_index',
639             ))
640
641             # Missing numeric fields used together with integer presentation types
642             # in format specification will break the argument substitution since
643             # string 'NA' is returned for missing fields. We will patch output
644             # template for missing fields to meet string presentation type.
645             for numeric_field in NUMERIC_FIELDS:
646                 if numeric_field not in template_dict:
647                     # As of [1] format syntax is:
648                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
649                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
650                     FORMAT_RE = r'''(?x)
651                         (?<!%)
652                         %
653                         \({0}\)  # mapping key
654                         (?:[#0\-+ ]+)?  # conversion flags (optional)
655                         (?:\d+)?  # minimum field width (optional)
656                         (?:\.\d+)?  # precision (optional)
657                         [hlL]?  # length modifier (optional)
658                         [diouxXeEfFgGcrs%]  # conversion type
659                     '''
660                     outtmpl = re.sub(
661                         FORMAT_RE.format(numeric_field),
662                         r'%({0})s'.format(numeric_field), outtmpl)
663
664             tmpl = compat_expanduser(outtmpl)
665             filename = tmpl % template_dict
666             # Temporary fix for #4787
667             # 'Treat' all problem characters by passing filename through preferredencoding
668             # to workaround encoding issues with subprocess on python2 @ Windows
669             if sys.version_info < (3, 0) and sys.platform == 'win32':
670                 filename = encodeFilename(filename, True).decode(preferredencoding())
671             return sanitize_path(filename)
672         except ValueError as err:
673             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
674             return None
675
676     def _match_entry(self, info_dict, incomplete):
677         """ Returns None iff the file should be downloaded """
678
679         video_title = info_dict.get('title', info_dict.get('id', 'video'))
680         if 'title' in info_dict:
681             # This can happen when we're just evaluating the playlist
682             title = info_dict['title']
683             matchtitle = self.params.get('matchtitle', False)
684             if matchtitle:
685                 if not re.search(matchtitle, title, re.IGNORECASE):
686                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
687             rejecttitle = self.params.get('rejecttitle', False)
688             if rejecttitle:
689                 if re.search(rejecttitle, title, re.IGNORECASE):
690                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
691         date = info_dict.get('upload_date')
692         if date is not None:
693             dateRange = self.params.get('daterange', DateRange())
694             if date not in dateRange:
695                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
696         view_count = info_dict.get('view_count')
697         if view_count is not None:
698             min_views = self.params.get('min_views')
699             if min_views is not None and view_count < min_views:
700                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
701             max_views = self.params.get('max_views')
702             if max_views is not None and view_count > max_views:
703                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
704         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
705             return 'Skipping "%s" because it is age restricted' % video_title
706         if self.in_download_archive(info_dict):
707             return '%s has already been recorded in archive' % video_title
708
709         if not incomplete:
710             match_filter = self.params.get('match_filter')
711             if match_filter is not None:
712                 ret = match_filter(info_dict)
713                 if ret is not None:
714                     return ret
715
716         return None
717
718     @staticmethod
719     def add_extra_info(info_dict, extra_info):
720         '''Set the keys from extra_info in info dict if they are missing'''
721         for key, value in extra_info.items():
722             info_dict.setdefault(key, value)
723
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True, force_generic_extractor=False):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result

        ie_key, when given, restricts extraction to that single extractor;
        force_generic_extractor forces ie_key='Generic' when no ie_key was
        supplied.  When process is False the raw extractor result is
        returned without resolving nested URLs/playlists.
        '''
        # NOTE(review): the mutable default extra_info={} is shared across
        # calls; safe only as long as no callee mutates it — confirm.

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        # Try each candidate extractor in order; the first one whose
        # suitable() accepts the URL handles it (success or failure).
        for ie in ies:
            if not ie.suitable(url):
                continue

            # Re-resolve by key so we work with the registered instance.
            ie = self.get_info_extractor(ie.ie_key())
            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                self.add_default_extra_info(ie_result, ie, url)
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except GeoRestrictedError as e:
                # Report which countries the video is available in, if known.
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
                self.report_error(msg)
                break
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
                break
            except MaxDownloadsReached:
                # Always propagate: callers use this to stop the whole run.
                raise
            except Exception as e:
                # Unexpected error: swallow (with a report) only when the
                # user asked for --ignore-errors, otherwise re-raise.
                if self.params.get('ignoreerrors', False):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # for/else: no extractor accepted the URL at all.
            self.report_error('no suitable InfoExtractor for URL %s' % url)
785
786     def add_default_extra_info(self, ie_result, ie, url):
787         self.add_extra_info(ie_result, {
788             'extractor': ie.IE_NAME,
789             'webpage_url': url,
790             'webpage_url_basename': url_basename(url),
791             'extractor_key': ie.ie_key(),
792         })
793
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        # Dispatch on the result's '_type': video, url, url_transparent,
        # playlist/multi_video or the legacy compat_list.
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            extract_flat = self.params.get('extract_flat', False)
            # --flat-playlist: return the bare reference without resolving it.
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # Non-None fields from the embedding page win over the target's,
            # except the reference plumbing keys removed below.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Guard against infinite recursion on nested url_transparent.
            assert new_result.get('_type') != 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist' or result_type == 'multi_video':
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # playliststart is converted from the user's 1-based value to a
            # 0-based slice index.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                # Expand "1,3,5-7" into 1-based indices.  Note this is a
                # generator and is consumed exactly once below.
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            # Materialize the selected entries from whichever container the
            # extractor returned: plain list, PagedList or a bare iterable.
            ie_entries = ie_result['entries']
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    # Out-of-range 1-based indices are silently dropped here.
                    entries = [
                        ie_entries[i - 1] for i in playlistitems
                        if -n_all_entries <= i - 1 < n_all_entries]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                # NOTE(review): unlike the list branch above, this branch
                # does no bounds check, so an out-of-range --playlist-items
                # index raises IndexError — confirm whether intended.
                if playlistitems:
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            if self.params.get('playlistrandom', False):
                random.shuffle(entries)

            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # This __x_forwarded_for_ip thing is a bit ugly but requires
                # minimal changes
                if x_forwarded_for:
                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    # NOTE(review): when --playlist-items is used, the
                    # position-based 'i + playliststart' does not reflect the
                    # entry's original playlist position — confirm intended.
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            # Legacy extractors returning a plain list of results.
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                # Stamp the playlist-level bookkeeping onto each entry.
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
967
968     def _build_format_filter(self, filter_spec):
969         " Returns a function to filter the formats according to the filter_spec "
970
971         OPERATORS = {
972             '<': operator.lt,
973             '<=': operator.le,
974             '>': operator.gt,
975             '>=': operator.ge,
976             '=': operator.eq,
977             '!=': operator.ne,
978         }
979         operator_rex = re.compile(r'''(?x)\s*
980             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
981             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
982             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
983             $
984             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
985         m = operator_rex.search(filter_spec)
986         if m:
987             try:
988                 comparison_value = int(m.group('value'))
989             except ValueError:
990                 comparison_value = parse_filesize(m.group('value'))
991                 if comparison_value is None:
992                     comparison_value = parse_filesize(m.group('value') + 'B')
993                 if comparison_value is None:
994                     raise ValueError(
995                         'Invalid value %r in format specification %r' % (
996                             m.group('value'), filter_spec))
997             op = OPERATORS[m.group('op')]
998
999         if not m:
1000             STR_OPERATORS = {
1001                 '=': operator.eq,
1002                 '!=': operator.ne,
1003                 '^=': lambda attr, value: attr.startswith(value),
1004                 '$=': lambda attr, value: attr.endswith(value),
1005                 '*=': lambda attr, value: value in attr,
1006             }
1007             str_operator_rex = re.compile(r'''(?x)
1008                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1009                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1010                 \s*(?P<value>[a-zA-Z0-9._-]+)
1011                 \s*$
1012                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1013             m = str_operator_rex.search(filter_spec)
1014             if m:
1015                 comparison_value = m.group('value')
1016                 op = STR_OPERATORS[m.group('op')]
1017
1018         if not m:
1019             raise ValueError('Invalid filter specification %r' % filter_spec)
1020
1021         def _filter(f):
1022             actual_value = f.get(m.group('key'))
1023             if actual_value is None:
1024                 return m.group('none_inclusive')
1025             return op(actual_value, comparison_value)
1026         return _filter
1027
1028     def build_format_selector(self, format_spec):
1029         def syntax_error(note, start):
1030             message = (
1031                 'Invalid format specification: '
1032                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1033             return SyntaxError(message)
1034
1035         PICKFIRST = 'PICKFIRST'
1036         MERGE = 'MERGE'
1037         SINGLE = 'SINGLE'
1038         GROUP = 'GROUP'
1039         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1040
1041         def _parse_filter(tokens):
1042             filter_parts = []
1043             for type, string, start, _, _ in tokens:
1044                 if type == tokenize.OP and string == ']':
1045                     return ''.join(filter_parts)
1046                 else:
1047                     filter_parts.append(string)
1048
1049         def _remove_unused_ops(tokens):
1050             # Remove operators that we don't use and join them with the surrounding strings
1051             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1052             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1053             last_string, last_start, last_end, last_line = None, None, None, None
1054             for type, string, start, end, line in tokens:
1055                 if type == tokenize.OP and string == '[':
1056                     if last_string:
1057                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1058                         last_string = None
1059                     yield type, string, start, end, line
1060                     # everything inside brackets will be handled by _parse_filter
1061                     for type, string, start, end, line in tokens:
1062                         yield type, string, start, end, line
1063                         if type == tokenize.OP and string == ']':
1064                             break
1065                 elif type == tokenize.OP and string in ALLOWED_OPS:
1066                     if last_string:
1067                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1068                         last_string = None
1069                     yield type, string, start, end, line
1070                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1071                     if not last_string:
1072                         last_string = string
1073                         last_start = start
1074                         last_end = end
1075                     else:
1076                         last_string += string
1077             if last_string:
1078                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1079
1080         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1081             selectors = []
1082             current_selector = None
1083             for type, string, start, _, _ in tokens:
1084                 # ENCODING is only defined in python 3.x
1085                 if type == getattr(tokenize, 'ENCODING', None):
1086                     continue
1087                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1088                     current_selector = FormatSelector(SINGLE, string, [])
1089                 elif type == tokenize.OP:
1090                     if string == ')':
1091                         if not inside_group:
1092                             # ')' will be handled by the parentheses group
1093                             tokens.restore_last_token()
1094                         break
1095                     elif inside_merge and string in ['/', ',']:
1096                         tokens.restore_last_token()
1097                         break
1098                     elif inside_choice and string == ',':
1099                         tokens.restore_last_token()
1100                         break
1101                     elif string == ',':
1102                         if not current_selector:
1103                             raise syntax_error('"," must follow a format selector', start)
1104                         selectors.append(current_selector)
1105                         current_selector = None
1106                     elif string == '/':
1107                         if not current_selector:
1108                             raise syntax_error('"/" must follow a format selector', start)
1109                         first_choice = current_selector
1110                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1111                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1112                     elif string == '[':
1113                         if not current_selector:
1114                             current_selector = FormatSelector(SINGLE, 'best', [])
1115                         format_filter = _parse_filter(tokens)
1116                         current_selector.filters.append(format_filter)
1117                     elif string == '(':
1118                         if current_selector:
1119                             raise syntax_error('Unexpected "("', start)
1120                         group = _parse_format_selection(tokens, inside_group=True)
1121                         current_selector = FormatSelector(GROUP, group, [])
1122                     elif string == '+':
1123                         video_selector = current_selector
1124                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1125                         if not video_selector or not audio_selector:
1126                             raise syntax_error('"+" must be between two format selectors', start)
1127                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1128                     else:
1129                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1130                 elif type == tokenize.ENDMARKER:
1131                     break
1132             if current_selector:
1133                 selectors.append(current_selector)
1134             return selectors
1135
1136         def _build_selector_function(selector):
1137             if isinstance(selector, list):
1138                 fs = [_build_selector_function(s) for s in selector]
1139
1140                 def selector_function(ctx):
1141                     for f in fs:
1142                         for format in f(ctx):
1143                             yield format
1144                 return selector_function
1145             elif selector.type == GROUP:
1146                 selector_function = _build_selector_function(selector.selector)
1147             elif selector.type == PICKFIRST:
1148                 fs = [_build_selector_function(s) for s in selector.selector]
1149
1150                 def selector_function(ctx):
1151                     for f in fs:
1152                         picked_formats = list(f(ctx))
1153                         if picked_formats:
1154                             return picked_formats
1155                     return []
1156             elif selector.type == SINGLE:
1157                 format_spec = selector.selector
1158
1159                 def selector_function(ctx):
1160                     formats = list(ctx['formats'])
1161                     if not formats:
1162                         return
1163                     if format_spec == 'all':
1164                         for f in formats:
1165                             yield f
1166                     elif format_spec in ['best', 'worst', None]:
1167                         format_idx = 0 if format_spec == 'worst' else -1
1168                         audiovideo_formats = [
1169                             f for f in formats
1170                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1171                         if audiovideo_formats:
1172                             yield audiovideo_formats[format_idx]
1173                         # for extractors with incomplete formats (audio only (soundcloud)
1174                         # or video only (imgur)) we will fallback to best/worst
1175                         # {video,audio}-only format
1176                         elif ctx['incomplete_formats']:
1177                             yield formats[format_idx]
1178                     elif format_spec == 'bestaudio':
1179                         audio_formats = [
1180                             f for f in formats
1181                             if f.get('vcodec') == 'none']
1182                         if audio_formats:
1183                             yield audio_formats[-1]
1184                     elif format_spec == 'worstaudio':
1185                         audio_formats = [
1186                             f for f in formats
1187                             if f.get('vcodec') == 'none']
1188                         if audio_formats:
1189                             yield audio_formats[0]
1190                     elif format_spec == 'bestvideo':
1191                         video_formats = [
1192                             f for f in formats
1193                             if f.get('acodec') == 'none']
1194                         if video_formats:
1195                             yield video_formats[-1]
1196                     elif format_spec == 'worstvideo':
1197                         video_formats = [
1198                             f for f in formats
1199                             if f.get('acodec') == 'none']
1200                         if video_formats:
1201                             yield video_formats[0]
1202                     else:
1203                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1204                         if format_spec in extensions:
1205                             filter_f = lambda f: f['ext'] == format_spec
1206                         else:
1207                             filter_f = lambda f: f['format_id'] == format_spec
1208                         matches = list(filter(filter_f, formats))
1209                         if matches:
1210                             yield matches[-1]
1211             elif selector.type == MERGE:
1212                 def _merge(formats_info):
1213                     format_1, format_2 = [f['format_id'] for f in formats_info]
1214                     # The first format must contain the video and the
1215                     # second the audio
1216                     if formats_info[0].get('vcodec') == 'none':
1217                         self.report_error('The first format must '
1218                                           'contain the video, try using '
1219                                           '"-f %s+%s"' % (format_2, format_1))
1220                         return
1221                     # Formats must be opposite (video+audio)
1222                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1223                         self.report_error(
1224                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1225                             % (format_1, format_2))
1226                         return
1227                     output_ext = (
1228                         formats_info[0]['ext']
1229                         if self.params.get('merge_output_format') is None
1230                         else self.params['merge_output_format'])
1231                     return {
1232                         'requested_formats': formats_info,
1233                         'format': '%s+%s' % (formats_info[0].get('format'),
1234                                              formats_info[1].get('format')),
1235                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1236                                                 formats_info[1].get('format_id')),
1237                         'width': formats_info[0].get('width'),
1238                         'height': formats_info[0].get('height'),
1239                         'resolution': formats_info[0].get('resolution'),
1240                         'fps': formats_info[0].get('fps'),
1241                         'vcodec': formats_info[0].get('vcodec'),
1242                         'vbr': formats_info[0].get('vbr'),
1243                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1244                         'acodec': formats_info[1].get('acodec'),
1245                         'abr': formats_info[1].get('abr'),
1246                         'ext': output_ext,
1247                     }
1248                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1249
1250                 def selector_function(ctx):
1251                     for pair in itertools.product(
1252                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1253                         yield _merge(pair)
1254
1255             filters = [self._build_format_filter(f) for f in selector.filters]
1256
1257             def final_selector(ctx):
1258                 ctx_copy = copy.deepcopy(ctx)
1259                 for _filter in filters:
1260                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1261                 return selector_function(ctx_copy)
1262             return final_selector
1263
1264         stream = io.BytesIO(format_spec.encode('utf-8'))
1265         try:
1266             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1267         except tokenize.TokenError:
1268             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1269
        class TokenIterator(object):
            """Iterator over a token list that supports pushing the last
            token back (one-token lookahead for the recursive descent
            format-selector parser)."""

            def __init__(self, tokens):
                self.tokens = tokens
                # Index of the next token to be yielded
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            # Python 2 iterator protocol spells __next__ as next
            next = __next__

            def restore_last_token(self):
                # Step back one position so the last token is yielded again
                self.counter -= 1
1289
1290         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1291         return _build_selector_function(parsed_selector)
1292
1293     def _calc_headers(self, info_dict):
1294         res = std_headers.copy()
1295
1296         add_headers = info_dict.get('http_headers')
1297         if add_headers:
1298             res.update(add_headers)
1299
1300         cookies = self._calc_cookies(info_dict)
1301         if cookies:
1302             res['Cookie'] = cookies
1303
1304         if 'X-Forwarded-For' not in res:
1305             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1306             if x_forwarded_for_ip:
1307                 res['X-Forwarded-For'] = x_forwarded_for_ip
1308
1309         return res
1310
1311     def _calc_cookies(self, info_dict):
1312         pr = sanitized_Request(info_dict['url'])
1313         self.cookiejar.add_cookie_header(pr)
1314         return pr.get_header('Cookie')
1315
    def process_video_result(self, info_dict, download=True):
        """Normalize a single "video" extractor result (ids, thumbnails,
        subtitles, formats), run format selection and, when download is
        true, hand every selected format to process_info().

        Returns info_dict updated in place with the best selected format,
        or None early when a list_* option only printed information.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        if not isinstance(info_dict['id'], compat_str):
            self.report_warning('"id" field is not a string - forcing string conversion')
            info_dict['id'] = compat_str(info_dict['id'])

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Promote a lone 'thumbnail' entry into the 'thumbnails' list
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Sort ascending so thumbnails[-1] is the preferred/largest one
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '', t.get('url')))
            for i, t in enumerate(thumbnails):
                t['url'] = sanitize_url(t['url'])
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # Fall back to the best (last after sorting) thumbnail
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
            # see http://bugs.python.org/issue1646728)
            try:
                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
            except (ValueError, OverflowError, OSError):
                pass

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle URLs and fill in missing subtitle extensions
        subtitles = info_dict.get('subtitles')
        if subtitles:
            for _, subtitle in subtitles.items():
                for subtitle_format in subtitle:
                    if subtitle_format.get('url'):
                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                    if subtitle_format.get('ext') is None:
                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles,
            info_dict.get('automatic_captions'))

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            if 'url' not in format:
                raise ExtractorError('Missing "url" key in result (index %d)' % i)

            format['url'] = sanitize_url(format['url'])

            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            req_format_list = []
            # Default to bestvideo+bestaudio only when merging is possible:
            # output must be seekable (not stdout) and not a live stream
            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
                    not info_dict.get('is_live')):
                merger = FFmpegMergerPP(self)
                if merger.available and merger.can_merge():
                    req_format_list.append('bestvideo+bestaudio')
            req_format_list.append('best')
            req_format = '/'.join(req_format_list)
        format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/rg3/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/rg3/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
            # all formats are audio-only
            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1520
1521     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1522         """Select the requested subtitles and their format"""
1523         available_subs = {}
1524         if normal_subtitles and self.params.get('writesubtitles'):
1525             available_subs.update(normal_subtitles)
1526         if automatic_captions and self.params.get('writeautomaticsub'):
1527             for lang, cap_info in automatic_captions.items():
1528                 if lang not in available_subs:
1529                     available_subs[lang] = cap_info
1530
1531         if (not self.params.get('writesubtitles') and not
1532                 self.params.get('writeautomaticsub') or not
1533                 available_subs):
1534             return None
1535
1536         if self.params.get('allsubtitles', False):
1537             requested_langs = available_subs.keys()
1538         else:
1539             if self.params.get('subtitleslangs', False):
1540                 requested_langs = self.params.get('subtitleslangs')
1541             elif 'en' in available_subs:
1542                 requested_langs = ['en']
1543             else:
1544                 requested_langs = [list(available_subs.keys())[0]]
1545
1546         formats_query = self.params.get('subtitlesformat', 'best')
1547         formats_preference = formats_query.split('/') if formats_query else []
1548         subs = {}
1549         for lang in requested_langs:
1550             formats = available_subs.get(lang)
1551             if formats is None:
1552                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1553                 continue
1554             for ext in formats_preference:
1555                 if ext == 'best':
1556                     f = formats[-1]
1557                     break
1558                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1559                 if matches:
1560                     f = matches[-1]
1561                     break
1562             else:
1563                 f = formats[-1]
1564                 self.report_warning(
1565                     'No subtitle format found matching "%s" for language %s, '
1566                     'using %s' % (formats_query, lang, f['ext']))
1567             subs[lang] = f
1568         return subs
1569
1570     def process_info(self, info_dict):
1571         """Process a single resolved IE result."""
1572
1573         assert info_dict.get('_type', 'video') == 'video'
1574
1575         max_downloads = self.params.get('max_downloads')
1576         if max_downloads is not None:
1577             if self._num_downloads >= int(max_downloads):
1578                 raise MaxDownloadsReached()
1579
1580         info_dict['fulltitle'] = info_dict['title']
1581         if len(info_dict['title']) > 200:
1582             info_dict['title'] = info_dict['title'][:197] + '...'
1583
1584         if 'format' not in info_dict:
1585             info_dict['format'] = info_dict['ext']
1586
1587         reason = self._match_entry(info_dict, incomplete=False)
1588         if reason is not None:
1589             self.to_screen('[download] ' + reason)
1590             return
1591
1592         self._num_downloads += 1
1593
1594         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1595
1596         # Forced printings
1597         if self.params.get('forcetitle', False):
1598             self.to_stdout(info_dict['fulltitle'])
1599         if self.params.get('forceid', False):
1600             self.to_stdout(info_dict['id'])
1601         if self.params.get('forceurl', False):
1602             if info_dict.get('requested_formats') is not None:
1603                 for f in info_dict['requested_formats']:
1604                     self.to_stdout(f['url'] + f.get('play_path', ''))
1605             else:
1606                 # For RTMP URLs, also include the playpath
1607                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1608         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1609             self.to_stdout(info_dict['thumbnail'])
1610         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1611             self.to_stdout(info_dict['description'])
1612         if self.params.get('forcefilename', False) and filename is not None:
1613             self.to_stdout(filename)
1614         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1615             self.to_stdout(formatSeconds(info_dict['duration']))
1616         if self.params.get('forceformat', False):
1617             self.to_stdout(info_dict['format'])
1618         if self.params.get('forcejson', False):
1619             self.to_stdout(json.dumps(info_dict))
1620
1621         # Do nothing else if in simulate mode
1622         if self.params.get('simulate', False):
1623             return
1624
1625         if filename is None:
1626             return
1627
1628         try:
1629             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1630             if dn and not os.path.exists(dn):
1631                 os.makedirs(dn)
1632         except (OSError, IOError) as err:
1633             self.report_error('unable to create directory ' + error_to_compat_str(err))
1634             return
1635
1636         if self.params.get('writedescription', False):
1637             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1638             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1639                 self.to_screen('[info] Video description is already present')
1640             elif info_dict.get('description') is None:
1641                 self.report_warning('There\'s no description to write.')
1642             else:
1643                 try:
1644                     self.to_screen('[info] Writing video description to: ' + descfn)
1645                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1646                         descfile.write(info_dict['description'])
1647                 except (OSError, IOError):
1648                     self.report_error('Cannot write description file ' + descfn)
1649                     return
1650
1651         if self.params.get('writeannotations', False):
1652             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1653             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1654                 self.to_screen('[info] Video annotations are already present')
1655             else:
1656                 try:
1657                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1658                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1659                         annofile.write(info_dict['annotations'])
1660                 except (KeyError, TypeError):
1661                     self.report_warning('There are no annotations to write.')
1662                 except (OSError, IOError):
1663                     self.report_error('Cannot write annotations file: ' + annofn)
1664                     return
1665
1666         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1667                                        self.params.get('writeautomaticsub')])
1668
1669         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1670             # subtitles download errors are already managed as troubles in relevant IE
1671             # that way it will silently go on when used with unsupporting IE
1672             subtitles = info_dict['requested_subtitles']
1673             ie = self.get_info_extractor(info_dict['extractor_key'])
1674             for sub_lang, sub_info in subtitles.items():
1675                 sub_format = sub_info['ext']
1676                 if sub_info.get('data') is not None:
1677                     sub_data = sub_info['data']
1678                 else:
1679                     try:
1680                         sub_data = ie._download_webpage(
1681                             sub_info['url'], info_dict['id'], note=False)
1682                     except ExtractorError as err:
1683                         self.report_warning('Unable to download subtitle for "%s": %s' %
1684                                             (sub_lang, error_to_compat_str(err.cause)))
1685                         continue
1686                 try:
1687                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1688                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1689                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1690                     else:
1691                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1692                         # Use newline='' to prevent conversion of newline characters
1693                         # See https://github.com/rg3/youtube-dl/issues/10268
1694                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1695                             subfile.write(sub_data)
1696                 except (OSError, IOError):
1697                     self.report_error('Cannot write subtitles file ' + sub_filename)
1698                     return
1699
1700         if self.params.get('writeinfojson', False):
1701             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1702             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1703                 self.to_screen('[info] Video description metadata is already present')
1704             else:
1705                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1706                 try:
1707                     write_json_file(self.filter_requested_info(info_dict), infofn)
1708                 except (OSError, IOError):
1709                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1710                     return
1711
1712         self._write_thumbnails(info_dict, filename)
1713
1714         if not self.params.get('skip_download', False):
1715             try:
1716                 def dl(name, info):
1717                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1718                     for ph in self._progress_hooks:
1719                         fd.add_progress_hook(ph)
1720                     if self.params.get('verbose'):
1721                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1722                     return fd.download(name, info)
1723
1724                 if info_dict.get('requested_formats') is not None:
1725                     downloaded = []
1726                     success = True
1727                     merger = FFmpegMergerPP(self)
1728                     if not merger.available:
1729                         postprocessors = []
1730                         self.report_warning('You have requested multiple '
1731                                             'formats but ffmpeg or avconv are not installed.'
1732                                             ' The formats won\'t be merged.')
1733                     else:
1734                         postprocessors = [merger]
1735
1736                     def compatible_formats(formats):
1737                         video, audio = formats
1738                         # Check extension
1739                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1740                         if video_ext and audio_ext:
1741                             COMPATIBLE_EXTS = (
1742                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1743                                 ('webm')
1744                             )
1745                             for exts in COMPATIBLE_EXTS:
1746                                 if video_ext in exts and audio_ext in exts:
1747                                     return True
1748                         # TODO: Check acodec/vcodec
1749                         return False
1750
1751                     filename_real_ext = os.path.splitext(filename)[1][1:]
1752                     filename_wo_ext = (
1753                         os.path.splitext(filename)[0]
1754                         if filename_real_ext == info_dict['ext']
1755                         else filename)
1756                     requested_formats = info_dict['requested_formats']
1757                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1758                         info_dict['ext'] = 'mkv'
1759                         self.report_warning(
1760                             'Requested formats are incompatible for merge and will be merged into mkv.')
1761                     # Ensure filename always has a correct extension for successful merge
1762                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1763                     if os.path.exists(encodeFilename(filename)):
1764                         self.to_screen(
1765                             '[download] %s has already been downloaded and '
1766                             'merged' % filename)
1767                     else:
1768                         for f in requested_formats:
1769                             new_info = dict(info_dict)
1770                             new_info.update(f)
1771                             fname = self.prepare_filename(new_info)
1772                             fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1773                             downloaded.append(fname)
1774                             partial_success = dl(fname, new_info)
1775                             success = success and partial_success
1776                         info_dict['__postprocessors'] = postprocessors
1777                         info_dict['__files_to_merge'] = downloaded
1778                 else:
1779                     # Just a single file
1780                     success = dl(filename, info_dict)
1781             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1782                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1783                 return
1784             except (OSError, IOError) as err:
1785                 raise UnavailableVideoError(err)
1786             except (ContentTooShortError, ) as err:
1787                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1788                 return
1789
1790             if success and filename != '-':
1791                 # Fixup content
1792                 fixup_policy = self.params.get('fixup')
1793                 if fixup_policy is None:
1794                     fixup_policy = 'detect_or_warn'
1795
1796                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1797
1798                 stretched_ratio = info_dict.get('stretched_ratio')
1799                 if stretched_ratio is not None and stretched_ratio != 1:
1800                     if fixup_policy == 'warn':
1801                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1802                             info_dict['id'], stretched_ratio))
1803                     elif fixup_policy == 'detect_or_warn':
1804                         stretched_pp = FFmpegFixupStretchedPP(self)
1805                         if stretched_pp.available:
1806                             info_dict.setdefault('__postprocessors', [])
1807                             info_dict['__postprocessors'].append(stretched_pp)
1808                         else:
1809                             self.report_warning(
1810                                 '%s: Non-uniform pixel ratio (%s). %s'
1811                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1812                     else:
1813                         assert fixup_policy in ('ignore', 'never')
1814
1815                 if (info_dict.get('requested_formats') is None and
1816                         info_dict.get('container') == 'm4a_dash'):
1817                     if fixup_policy == 'warn':
1818                         self.report_warning(
1819                             '%s: writing DASH m4a. '
1820                             'Only some players support this container.'
1821                             % info_dict['id'])
1822                     elif fixup_policy == 'detect_or_warn':
1823                         fixup_pp = FFmpegFixupM4aPP(self)
1824                         if fixup_pp.available:
1825                             info_dict.setdefault('__postprocessors', [])
1826                             info_dict['__postprocessors'].append(fixup_pp)
1827                         else:
1828                             self.report_warning(
1829                                 '%s: writing DASH m4a. '
1830                                 'Only some players support this container. %s'
1831                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1832                     else:
1833                         assert fixup_policy in ('ignore', 'never')
1834
1835                 if (info_dict.get('protocol') == 'm3u8_native' or
1836                         info_dict.get('protocol') == 'm3u8' and
1837                         self.params.get('hls_prefer_native')):
1838                     if fixup_policy == 'warn':
1839                         self.report_warning('%s: malformated aac bitstream.' % (
1840                             info_dict['id']))
1841                     elif fixup_policy == 'detect_or_warn':
1842                         fixup_pp = FFmpegFixupM3u8PP(self)
1843                         if fixup_pp.available:
1844                             info_dict.setdefault('__postprocessors', [])
1845                             info_dict['__postprocessors'].append(fixup_pp)
1846                         else:
1847                             self.report_warning(
1848                                 '%s: malformated aac bitstream. %s'
1849                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1850                     else:
1851                         assert fixup_policy in ('ignore', 'never')
1852
1853                 try:
1854                     self.post_process(filename, info_dict)
1855                 except (PostProcessingError) as err:
1856                     self.report_error('postprocessing: %s' % str(err))
1857                     return
1858                 self.record_download_archive(info_dict)
1859
1860     def download(self, url_list):
1861         """Download a given list of URLs."""
1862         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1863         if (len(url_list) > 1 and
1864                 '%' not in outtmpl and
1865                 self.params.get('max_downloads') != 1):
1866             raise SameFileError(outtmpl)
1867
1868         for url in url_list:
1869             try:
1870                 # It also downloads the videos
1871                 res = self.extract_info(
1872                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1873             except UnavailableVideoError:
1874                 self.report_error('unable to download video')
1875             except MaxDownloadsReached:
1876                 self.to_screen('[info] Maximum number of downloaded files reached.')
1877                 raise
1878             else:
1879                 if self.params.get('dump_single_json', False):
1880                     self.to_stdout(json.dumps(res))
1881
1882         return self._download_retcode
1883
1884     def download_with_info_file(self, info_filename):
1885         with contextlib.closing(fileinput.FileInput(
1886                 [info_filename], mode='r',
1887                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1888             # FileInput doesn't have a read method, we can't call json.load
1889             info = self.filter_requested_info(json.loads('\n'.join(f)))
1890         try:
1891             self.process_ie_result(info, download=True)
1892         except DownloadError:
1893             webpage_url = info.get('webpage_url')
1894             if webpage_url is not None:
1895                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1896                 return self.download([webpage_url])
1897             else:
1898                 raise
1899         return self._download_retcode
1900
1901     @staticmethod
1902     def filter_requested_info(info_dict):
1903         return dict(
1904             (k, v) for k, v in info_dict.items()
1905             if k not in ['requested_formats', 'requested_subtitles'])
1906
1907     def post_process(self, filename, ie_info):
1908         """Run all the postprocessors on the given file."""
1909         info = dict(ie_info)
1910         info['filepath'] = filename
1911         pps_chain = []
1912         if ie_info.get('__postprocessors') is not None:
1913             pps_chain.extend(ie_info['__postprocessors'])
1914         pps_chain.extend(self._pps)
1915         for pp in pps_chain:
1916             files_to_delete = []
1917             try:
1918                 files_to_delete, info = pp.run(info)
1919             except PostProcessingError as e:
1920                 self.report_error(e.msg)
1921             if files_to_delete and not self.params.get('keepvideo', False):
1922                 for old_filename in files_to_delete:
1923                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1924                     try:
1925                         os.remove(encodeFilename(old_filename))
1926                     except (IOError, OSError):
1927                         self.report_warning('Unable to remove downloaded original file')
1928
1929     def _make_archive_id(self, info_dict):
1930         # Future-proof against any change in case
1931         # and backwards compatibility with prior versions
1932         extractor = info_dict.get('extractor_key')
1933         if extractor is None:
1934             if 'id' in info_dict:
1935                 extractor = info_dict.get('ie_key')  # key in a playlist
1936         if extractor is None:
1937             return None  # Incomplete video information
1938         return extractor.lower() + ' ' + info_dict['id']
1939
1940     def in_download_archive(self, info_dict):
1941         fn = self.params.get('download_archive')
1942         if fn is None:
1943             return False
1944
1945         vid_id = self._make_archive_id(info_dict)
1946         if vid_id is None:
1947             return False  # Incomplete video information
1948
1949         try:
1950             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1951                 for line in archive_file:
1952                     if line.strip() == vid_id:
1953                         return True
1954         except IOError as ioe:
1955             if ioe.errno != errno.ENOENT:
1956                 raise
1957         return False
1958
1959     def record_download_archive(self, info_dict):
1960         fn = self.params.get('download_archive')
1961         if fn is None:
1962             return
1963         vid_id = self._make_archive_id(info_dict)
1964         assert vid_id
1965         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1966             archive_file.write(vid_id + '\n')
1967
1968     @staticmethod
1969     def format_resolution(format, default='unknown'):
1970         if format.get('vcodec') == 'none':
1971             return 'audio only'
1972         if format.get('resolution') is not None:
1973             return format['resolution']
1974         if format.get('height') is not None:
1975             if format.get('width') is not None:
1976                 res = '%sx%s' % (format['width'], format['height'])
1977             else:
1978                 res = '%sp' % format['height']
1979         elif format.get('width') is not None:
1980             res = '%dx?' % format['width']
1981         else:
1982             res = default
1983         return res
1984
1985     def _format_note(self, fdict):
1986         res = ''
1987         if fdict.get('ext') in ['f4f', 'f4m']:
1988             res += '(unsupported) '
1989         if fdict.get('language'):
1990             if res:
1991                 res += ' '
1992             res += '[%s] ' % fdict['language']
1993         if fdict.get('format_note') is not None:
1994             res += fdict['format_note'] + ' '
1995         if fdict.get('tbr') is not None:
1996             res += '%4dk ' % fdict['tbr']
1997         if fdict.get('container') is not None:
1998             if res:
1999                 res += ', '
2000             res += '%s container' % fdict['container']
2001         if (fdict.get('vcodec') is not None and
2002                 fdict.get('vcodec') != 'none'):
2003             if res:
2004                 res += ', '
2005             res += fdict['vcodec']
2006             if fdict.get('vbr') is not None:
2007                 res += '@'
2008         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
2009             res += 'video@'
2010         if fdict.get('vbr') is not None:
2011             res += '%4dk' % fdict['vbr']
2012         if fdict.get('fps') is not None:
2013             if res:
2014                 res += ', '
2015             res += '%sfps' % fdict['fps']
2016         if fdict.get('acodec') is not None:
2017             if res:
2018                 res += ', '
2019             if fdict['acodec'] == 'none':
2020                 res += 'video only'
2021             else:
2022                 res += '%-5s' % fdict['acodec']
2023         elif fdict.get('abr') is not None:
2024             if res:
2025                 res += ', '
2026             res += 'audio'
2027         if fdict.get('abr') is not None:
2028             res += '@%3dk' % fdict['abr']
2029         if fdict.get('asr') is not None:
2030             res += ' (%5dHz)' % fdict['asr']
2031         if fdict.get('filesize') is not None:
2032             if res:
2033                 res += ', '
2034             res += format_bytes(fdict['filesize'])
2035         elif fdict.get('filesize_approx') is not None:
2036             if res:
2037                 res += ', '
2038             res += '~' + format_bytes(fdict['filesize_approx'])
2039         return res
2040
2041     def list_formats(self, info_dict):
2042         formats = info_dict.get('formats', [info_dict])
2043         table = [
2044             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2045             for f in formats
2046             if f.get('preference') is None or f['preference'] >= -1000]
2047         if len(formats) > 1:
2048             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2049
2050         header_line = ['format code', 'extension', 'resolution', 'note']
2051         self.to_screen(
2052             '[info] Available formats for %s:\n%s' %
2053             (info_dict['id'], render_table(header_line, table)))
2054
2055     def list_thumbnails(self, info_dict):
2056         thumbnails = info_dict.get('thumbnails')
2057         if not thumbnails:
2058             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2059             return
2060
2061         self.to_screen(
2062             '[info] Thumbnails for %s:' % info_dict['id'])
2063         self.to_screen(render_table(
2064             ['ID', 'width', 'height', 'URL'],
2065             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2066
2067     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2068         if not subtitles:
2069             self.to_screen('%s has no %s' % (video_id, name))
2070             return
2071         self.to_screen(
2072             'Available %s for %s:' % (name, video_id))
2073         self.to_screen(render_table(
2074             ['Language', 'formats'],
2075             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2076                 for lang, formats in subtitles.items()]))
2077
2078     def urlopen(self, req):
2079         """ Start an HTTP download """
2080         if isinstance(req, compat_basestring):
2081             req = sanitized_Request(req)
2082         return self._opener.open(req, timeout=self._socket_timeout)
2083
    def print_debug_header(self):
        """Write verbose debugging information (encodings, versions, external
        programs, proxies, optionally public IP) to the debug output.
        No-op unless the 'verbose' param is set."""
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # Report the encodings that affect console and filesystem output
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        # Best effort: report the git commit when running from a checkout
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            try:
                # Python 2 only: clear the dangling exception state
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        # Versions of the external programs youtube-dl may invoke
        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Merge the proxy settings of every opener handler that has any
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # Opt-in: report public IP and check for a newer release
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2150
    def _setup_opener(self):
        """Build the urllib opener (cookies, proxies, HTTPS, data: URLs) used
        for all network requests and store it as self._opener."""
        timeout_val = self.params.get('socket_timeout')
        # Default socket timeout: 10 minutes
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # No cookie file configured: keep cookies in memory only
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            opts_cookiefile = compat_expanduser(opts_cookiefile)
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Load pre-existing cookies only when the file is readable
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An explicitly empty --proxy disables proxying entirely
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            # Fall back to the environment's proxy configuration
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2203
2204     def encode(self, s):
2205         if isinstance(s, bytes):
2206             return s  # Already encoded
2207
2208         try:
2209             return s.encode(self.get_encoding())
2210         except UnicodeEncodeError as err:
2211             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2212             raise
2213
2214     def get_encoding(self):
2215         encoding = self.params.get('encoding')
2216         if encoding is None:
2217             encoding = preferredencoding()
2218         return encoding
2219
2220     def _write_thumbnails(self, info_dict, filename):
2221         if self.params.get('writethumbnail', False):
2222             thumbnails = info_dict.get('thumbnails')
2223             if thumbnails:
2224                 thumbnails = [thumbnails[-1]]
2225         elif self.params.get('write_all_thumbnails', False):
2226             thumbnails = info_dict.get('thumbnails')
2227         else:
2228             return
2229
2230         if not thumbnails:
2231             # No thumbnails present, so return immediately
2232             return
2233
2234         for t in thumbnails:
2235             thumb_ext = determine_ext(t['url'], 'jpg')
2236             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2237             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2238             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2239
2240             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2241                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2242                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2243             else:
2244                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2245                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2246                 try:
2247                     uf = self.urlopen(t['url'])
2248                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2249                         shutil.copyfileobj(uf, thumbf)
2250                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2251                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2252                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2253                     self.report_warning('Unable to download thumbnail "%s": %s' %
2254                                         (t['url'], error_to_compat_str(err)))