[YoutubeDL] Don't sanitize identifiers (closes #12317)
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from .compat import (
30     compat_basestring,
31     compat_cookiejar,
32     compat_expanduser,
33     compat_get_terminal_size,
34     compat_http_client,
35     compat_kwargs,
36     compat_numeric_types,
37     compat_os_name,
38     compat_str,
39     compat_tokenize_tokenize,
40     compat_urllib_error,
41     compat_urllib_request,
42     compat_urllib_request_DataHandler,
43 )
44 from .utils import (
45     age_restricted,
46     args_to_str,
47     ContentTooShortError,
48     date_from_str,
49     DateRange,
50     DEFAULT_OUTTMPL,
51     determine_ext,
52     determine_protocol,
53     DownloadError,
54     encode_compat_str,
55     encodeFilename,
56     error_to_compat_str,
57     ExtractorError,
58     format_bytes,
59     formatSeconds,
60     GeoRestrictedError,
61     ISO3166Utils,
62     locked_file,
63     make_HTTPS_handler,
64     MaxDownloadsReached,
65     PagedList,
66     parse_filesize,
67     PerRequestProxyHandler,
68     platform_name,
69     PostProcessingError,
70     preferredencoding,
71     prepend_extension,
72     register_socks_protocols,
73     render_table,
74     replace_extension,
75     SameFileError,
76     sanitize_filename,
77     sanitize_path,
78     sanitize_url,
79     sanitized_Request,
80     std_headers,
81     subtitles_filename,
82     UnavailableVideoError,
83     url_basename,
84     version_tuple,
85     write_json_file,
86     write_string,
87     YoutubeDLCookieProcessor,
88     YoutubeDLHandler,
89 )
90 from .cache import Cache
91 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
92 from .downloader import get_suitable_downloader
93 from .downloader.rtmp import rtmpdump_version
94 from .postprocessor import (
95     FFmpegFixupM3u8PP,
96     FFmpegFixupM4aPP,
97     FFmpegFixupStretchedPP,
98     FFmpegMergerPP,
99     FFmpegPostProcessor,
100     get_postprocessor,
101 )
102 from .version import __version__
103
104 if compat_os_name == 'nt':
105     import ctypes
106
107
108 class YoutubeDL(object):
109     """YoutubeDL class.
110
111     YoutubeDL objects are the ones responsible of downloading the
112     actual video file and writing it to disk if the user has requested
113     it, among some other tasks. In most cases there should be one per
114     program. As, given a video URL, the downloader doesn't know how to
115     extract all the needed information, task that InfoExtractors do, it
116     has to pass the URL to one of them.
117
118     For this, YoutubeDL objects have a method that allows
119     InfoExtractors to be registered in a given order. When it is passed
120     a URL, the YoutubeDL object handles it to the first InfoExtractor it
121     finds that reports being able to handle it. The InfoExtractor extracts
122     all the information about the video or videos the URL refers to, and
123     YoutubeDL process the extracted information, possibly using a File
124     Downloader to download the video.
125
126     YoutubeDL objects accept a lot of parameters. In order not to saturate
127     the object constructor with arguments, it receives a dictionary of
128     options instead. These options are available through the params
129     attribute for the InfoExtractors to use. The YoutubeDL also
130     registers itself as the downloader in charge for the InfoExtractors
131     that are added to it, so this is a "mutual registration".
132
133     Available options:
134
135     username:          Username for authentication purposes.
136     password:          Password for authentication purposes.
137     videopassword:     Password for accessing a video.
138     ap_mso:            Adobe Pass multiple-system operator identifier.
139     ap_username:       Multiple-system operator account username.
140     ap_password:       Multiple-system operator account password.
141     usenetrc:          Use netrc for authentication instead.
142     verbose:           Print additional info to stdout.
143     quiet:             Do not print messages to stdout.
144     no_warnings:       Do not print out anything for warnings.
145     forceurl:          Force printing final URL.
146     forcetitle:        Force printing title.
147     forceid:           Force printing ID.
148     forcethumbnail:    Force printing thumbnail URL.
149     forcedescription:  Force printing description.
150     forcefilename:     Force printing final filename.
151     forceduration:     Force printing duration.
152     forcejson:         Force printing info_dict as JSON.
153     dump_single_json:  Force printing the info_dict of the whole playlist
154                        (or video) as a single JSON line.
155     simulate:          Do not download the video files.
156     format:            Video format code. See options.py for more information.
157     outtmpl:           Template for output names.
158     restrictfilenames: Do not allow "&" and spaces in file names
159     ignoreerrors:      Do not stop on download errors.
160     force_generic_extractor: Force downloader to use the generic extractor
161     nooverwrites:      Prevent overwriting files.
162     playliststart:     Playlist item to start at.
163     playlistend:       Playlist item to end at.
164     playlist_items:    Specific indices of playlist to download.
165     playlistreverse:   Download playlist items in reverse order.
166     playlistrandom:    Download playlist items in random order.
167     matchtitle:        Download only matching titles.
168     rejecttitle:       Reject downloads for matching titles.
169     logger:            Log messages to a logging.Logger instance.
170     logtostderr:       Log messages to stderr instead of stdout.
171     writedescription:  Write the video description to a .description file
172     writeinfojson:     Write the video description to a .info.json file
173     writeannotations:  Write the video annotations to a .annotations.xml file
174     writethumbnail:    Write the thumbnail image to a file
175     write_all_thumbnails:  Write all thumbnail formats to files
176     writesubtitles:    Write the video subtitles to a file
177     writeautomaticsub: Write the automatically generated subtitles to a file
178     allsubtitles:      Downloads all the subtitles of the video
179                        (requires writesubtitles or writeautomaticsub)
180     listsubtitles:     Lists all available subtitles for the video
181     subtitlesformat:   The format code for subtitles
182     subtitleslangs:    List of languages of the subtitles to download
183     keepvideo:         Keep the video file after post-processing
184     daterange:         A DateRange object, download only if the upload_date is in the range.
185     skip_download:     Skip the actual download of the video file
186     cachedir:          Location of the cache files in the filesystem.
187                        False to disable filesystem cache.
188     noplaylist:        Download single video instead of a playlist if in doubt.
189     age_limit:         An integer representing the user's age in years.
190                        Unsuitable videos for the given age are skipped.
191     min_views:         An integer representing the minimum view count the video
192                        must have in order to not be skipped.
193                        Videos without view count information are always
194                        downloaded. None for no limit.
195     max_views:         An integer representing the maximum view count.
196                        Videos that are more popular than that are not
197                        downloaded.
198                        Videos without view count information are always
199                        downloaded. None for no limit.
200     download_archive:  File name of a file where all downloads are recorded.
201                        Videos already present in the file are not downloaded
202                        again.
203     cookiefile:        File name where cookies should be read from and dumped to.
204     nocheckcertificate:Do not verify SSL certificates
205     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
206                        At the moment, this is only supported by YouTube.
207     proxy:             URL of the proxy server to use
208     geo_verification_proxy:  URL of the proxy to use for IP address verification
209                        on geo-restricted sites. (Experimental)
210     socket_timeout:    Time to wait for unresponsive hosts, in seconds
211     bidi_workaround:   Work around buggy terminals without bidirectional text
212                        support, using fribidi
213     debug_printtraffic:Print out sent and received HTTP traffic
214     include_ads:       Download ads as well
215     default_search:    Prepend this string if an input url is not valid.
216                        'auto' for elaborate guessing
217     encoding:          Use this encoding instead of the system-specified.
218     extract_flat:      Do not resolve URLs, return the immediate result.
219                        Pass in 'in_playlist' to only show this behavior for
220                        playlist items.
221     postprocessors:    A list of dictionaries, each with an entry
222                        * key:  The name of the postprocessor. See
223                                youtube_dl/postprocessor/__init__.py for a list.
224                        as well as any further keyword arguments for the
225                        postprocessor.
226     progress_hooks:    A list of functions that get called on download
227                        progress, with a dictionary with the entries
228                        * status: One of "downloading", "error", or "finished".
229                                  Check this first and ignore unknown values.
230
231                        If status is one of "downloading", or "finished", the
232                        following properties may also be present:
233                        * filename: The final filename (always present)
234                        * tmpfilename: The filename we're currently writing to
235                        * downloaded_bytes: Bytes on disk
236                        * total_bytes: Size of the whole file, None if unknown
237                        * total_bytes_estimate: Guess of the eventual file size,
238                                                None if unavailable.
239                        * elapsed: The number of seconds since download started.
240                        * eta: The estimated time in seconds, None if unknown
241                        * speed: The download speed in bytes/second, None if
242                                 unknown
243                        * fragment_index: The counter of the currently
244                                          downloaded video fragment.
245                        * fragment_count: The number of fragments (= individual
246                                          files that will be merged)
247
248                        Progress hooks are guaranteed to be called at least once
249                        (with status "finished") if the download is successful.
250     merge_output_format: Extension to use when merging formats.
251     fixup:             Automatically correct known faults of the file.
252                        One of:
253                        - "never": do nothing
254                        - "warn": only emit a warning
255                        - "detect_or_warn": check whether we can do anything
256                                            about it, warn otherwise (default)
257     source_address:    (Experimental) Client-side IP address to bind to.
258     call_home:         Boolean, true iff we are allowed to contact the
259                        youtube-dl servers for debugging.
260     sleep_interval:    Number of seconds to sleep before each download when
261                        used alone or a lower bound of a range for randomized
262                        sleep before each download (minimum possible number
263                        of seconds to sleep) when used along with
264                        max_sleep_interval.
265     max_sleep_interval:Upper bound of a range for randomized sleep before each
266                        download (maximum possible number of seconds to sleep).
267                        Must only be used along with sleep_interval.
268                        Actual sleep time will be a random float from range
269                        [sleep_interval; max_sleep_interval].
270     listformats:       Print an overview of available video formats and exit.
271     list_thumbnails:   Print a table of all thumbnails and exit.
272     match_filter:      A function that gets called with the info_dict of
273                        every video.
274                        If it returns a message, the video is ignored.
275                        If it returns None, the video is downloaded.
276                        match_filter_func in utils.py is one example for this.
277     no_color:          Do not emit color codes in output.
278     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
279                        HTTP header (experimental)
280     geo_bypass_country:
281                        Two-letter ISO 3166-2 country code that will be used for
282                        explicit geographic restriction bypassing via faking
283                        X-Forwarded-For HTTP header (experimental)
284
285     The following options determine which downloader is picked:
286     external_downloader: Executable of the external downloader to call.
287                        None or unset for standard (built-in) downloader.
288     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
289                        if True, otherwise use ffmpeg/avconv if False, otherwise
290                        use downloader suggested by extractor if None.
291
292     The following parameters are not used by YoutubeDL itself, they are used by
293     the downloader (see youtube_dl/downloader/common.py):
294     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
295     noresizebuffer, retries, continuedl, noprogress, consoletitle,
296     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
297
298     The following options are used by the post processors:
299     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
300                        otherwise prefer avconv.
301     postprocessor_args: A list of additional command-line arguments for the
302                         postprocessor.
303     """
304
305     params = None
306     _ies = []
307     _pps = []
308     _download_retcode = None
309     _num_downloads = None
310     _screen_file = None
311
    def __init__(self, params=None, auto_init=True):
        """Create a YoutubeDL object with the given options.

        params is an optional dictionary of options (see the class
        docstring); it is merged over the built-in defaults. When auto_init
        is true, the debug header is printed and the default info extractors
        are registered.
        """
        if params is None:
            params = {}
        # Per-instance state; these shadow the class-level placeholders.
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Indexing with the boolean: False -> stdout, True -> stderr.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        def check_deprecated(param, option, suggestion):
            # Warn if a deprecated parameter is set; return whether it was.
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        # Carry the deprecated proxy option over to its replacement unless
        # the replacement was given explicitly.
        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            # Start an external bidi tool: text is written to its stdin and
            # its output is read back from the master end of a pty whose
            # slave end is the tool's stdout (see _bidi_workaround).
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                # Try bidiv first; fall back to fribidi if launching it
                # fails with OSError.
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                # A missing executable only disables the workaround; any
                # other OS error is propagated.
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.version_info >= (3,) and sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # On Python 3, the Unicode filesystem API will throw errors (#1474)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        # Only warn about a bytes template; the value itself is left as is.
        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Build each configured postprocessor from its definition dict:
        # 'key' selects the class, the remaining entries become keyword
        # arguments. The caller's dict is copied, not modified.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
405
406     def warn_if_short_id(self, argv):
407         # short YouTube ID starting with dash?
408         idxs = [
409             i for i, a in enumerate(argv)
410             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
411         if idxs:
412             correct_argv = (
413                 ['youtube-dl'] +
414                 [a for i, a in enumerate(argv) if i not in idxs] +
415                 ['--'] + [argv[i] for i in idxs]
416             )
417             self.report_warning(
418                 'Long argument string detected. '
419                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
420                 args_to_str(correct_argv))
421
422     def add_info_extractor(self, ie):
423         """Add an InfoExtractor object to the end of the list."""
424         self._ies.append(ie)
425         if not isinstance(ie, type):
426             self._ies_instances[ie.ie_key()] = ie
427             ie.set_downloader(self)
428
429     def get_info_extractor(self, ie_key):
430         """
431         Get an instance of an IE with name ie_key, it will try to get one from
432         the _ies list, if there's no instance it will create a new one and add
433         it to the extractor list.
434         """
435         ie = self._ies_instances.get(ie_key)
436         if ie is None:
437             ie = get_info_extractor(ie_key)()
438             self.add_info_extractor(ie)
439         return ie
440
441     def add_default_info_extractors(self):
442         """
443         Add the InfoExtractors returned by gen_extractors to the end of the list
444         """
445         for ie in gen_extractor_classes():
446             self.add_info_extractor(ie)
447
448     def add_post_processor(self, pp):
449         """Add a PostProcessor object to the end of the chain."""
450         self._pps.append(pp)
451         pp.set_downloader(self)
452
453     def add_progress_hook(self, ph):
454         """Add the progress hook (currently only for the file downloader)"""
455         self._progress_hooks.append(ph)
456
457     def _bidi_workaround(self, message):
458         if not hasattr(self, '_output_channel'):
459             return message
460
461         assert hasattr(self, '_output_process')
462         assert isinstance(message, compat_str)
463         line_count = message.count('\n') + 1
464         self._output_process.stdin.write((message + '\n').encode('utf-8'))
465         self._output_process.stdin.flush()
466         res = ''.join(self._output_channel.readline().decode('utf-8')
467                       for _ in range(line_count))
468         return res[:-len('\n')]
469
470     def to_screen(self, message, skip_eol=False):
471         """Print message to stdout if not in quiet mode."""
472         return self.to_stdout(message, skip_eol, check_quiet=True)
473
474     def _write_string(self, s, out=None):
475         write_string(s, out=out, encoding=self.params.get('encoding'))
476
477     def to_stdout(self, message, skip_eol=False, check_quiet=False):
478         """Print message to stdout if not in quiet mode."""
479         if self.params.get('logger'):
480             self.params['logger'].debug(message)
481         elif not check_quiet or not self.params.get('quiet', False):
482             message = self._bidi_workaround(message)
483             terminator = ['\n', ''][skip_eol]
484             output = message + terminator
485
486             self._write_string(output, self._screen_file)
487
488     def to_stderr(self, message):
489         """Print message to stderr."""
490         assert isinstance(message, compat_str)
491         if self.params.get('logger'):
492             self.params['logger'].error(message)
493         else:
494             message = self._bidi_workaround(message)
495             output = message + '\n'
496             self._write_string(output, self._err_file)
497
498     def to_console_title(self, message):
499         if not self.params.get('consoletitle', False):
500             return
501         if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
502             # c_wchar_p() might not be necessary if `message` is
503             # already of type unicode()
504             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
505         elif 'TERM' in os.environ:
506             self._write_string('\033]0;%s\007' % message, self._screen_file)
507
508     def save_console_title(self):
509         if not self.params.get('consoletitle', False):
510             return
511         if 'TERM' in os.environ:
512             # Save the title on stack
513             self._write_string('\033[22;0t', self._screen_file)
514
515     def restore_console_title(self):
516         if not self.params.get('consoletitle', False):
517             return
518         if 'TERM' in os.environ:
519             # Restore the title from stack
520             self._write_string('\033[23;0t', self._screen_file)
521
522     def __enter__(self):
523         self.save_console_title()
524         return self
525
526     def __exit__(self, *args):
527         self.restore_console_title()
528
529         if self.params.get('cookiefile') is not None:
530             self.cookiejar.save()
531
532     def trouble(self, message=None, tb=None):
533         """Determine action to take when a download problem appears.
534
535         Depending on if the downloader has been configured to ignore
536         download errors or not, this method may throw an exception or
537         not when errors are found, after printing the message.
538
539         tb, if given, is additional traceback information.
540         """
541         if message is not None:
542             self.to_stderr(message)
543         if self.params.get('verbose'):
544             if tb is None:
545                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
546                     tb = ''
547                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
548                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
549                     tb += encode_compat_str(traceback.format_exc())
550                 else:
551                     tb_data = traceback.format_list(traceback.extract_stack())
552                     tb = ''.join(tb_data)
553             self.to_stderr(tb)
554         if not self.params.get('ignoreerrors', False):
555             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
556                 exc_info = sys.exc_info()[1].exc_info
557             else:
558                 exc_info = sys.exc_info()
559             raise DownloadError(message, exc_info)
560         self._download_retcode = 1
561
562     def report_warning(self, message):
563         '''
564         Print the message to stderr, it will be prefixed with 'WARNING:'
565         If stderr is a tty file the 'WARNING:' will be colored
566         '''
567         if self.params.get('logger') is not None:
568             self.params['logger'].warning(message)
569         else:
570             if self.params.get('no_warnings'):
571                 return
572             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
573                 _msg_header = '\033[0;33mWARNING:\033[0m'
574             else:
575                 _msg_header = 'WARNING:'
576             warning_message = '%s %s' % (_msg_header, message)
577             self.to_stderr(warning_message)
578
579     def report_error(self, message, tb=None):
580         '''
581         Do the same as trouble, but prefixes the message with 'ERROR:', colored
582         in red if stderr is a tty file.
583         '''
584         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
585             _msg_header = '\033[0;31mERROR:\033[0m'
586         else:
587             _msg_header = 'ERROR:'
588         error_message = '%s %s' % (_msg_header, message)
589         self.trouble(error_message, tb)
590
591     def report_file_already_downloaded(self, file_name):
592         """Report file has already been fully downloaded."""
593         try:
594             self.to_screen('[download] %s has already been downloaded' % file_name)
595         except UnicodeEncodeError:
596             self.to_screen('[download] The file has already been downloaded')
597
598     def prepare_filename(self, info_dict):
599         """Generate the output filename."""
600         try:
601             template_dict = dict(info_dict)
602
603             template_dict['epoch'] = int(time.time())
604             autonumber_size = self.params.get('autonumber_size')
605             if autonumber_size is None:
606                 autonumber_size = 5
607             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
608             if template_dict.get('resolution') is None:
609                 if template_dict.get('width') and template_dict.get('height'):
610                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
611                 elif template_dict.get('height'):
612                     template_dict['resolution'] = '%sp' % template_dict['height']
613                 elif template_dict.get('width'):
614                     template_dict['resolution'] = '%dx?' % template_dict['width']
615
616             sanitize = lambda k, v: sanitize_filename(
617                 compat_str(v),
618                 restricted=self.params.get('restrictfilenames'),
619                 is_id=(k == 'id' or k.endswith('_id')))
620             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
621                                  for k, v in template_dict.items()
622                                  if v is not None and not isinstance(v, (list, tuple, dict)))
623             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
624
625             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
626
627             # For fields playlist_index and autonumber convert all occurrences
628             # of %(field)s to %(field)0Nd for backward compatibility
629             field_size_compat_map = {
630                 'playlist_index': len(str(template_dict['n_entries'])),
631                 'autonumber': autonumber_size,
632             }
633             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
634             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
635             if mobj:
636                 outtmpl = re.sub(
637                     FIELD_SIZE_COMPAT_RE,
638                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
639                     outtmpl)
640
641             NUMERIC_FIELDS = set((
642                 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
643                 'upload_year', 'upload_month', 'upload_day',
644                 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
645                 'average_rating', 'comment_count', 'age_limit',
646                 'start_time', 'end_time',
647                 'chapter_number', 'season_number', 'episode_number',
648                 'track_number', 'disc_number', 'release_year',
649                 'playlist_index',
650             ))
651
652             # Missing numeric fields used together with integer presentation types
653             # in format specification will break the argument substitution since
654             # string 'NA' is returned for missing fields. We will patch output
655             # template for missing fields to meet string presentation type.
656             for numeric_field in NUMERIC_FIELDS:
657                 if numeric_field not in template_dict:
658                     # As of [1] format syntax is:
659                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
660                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
661                     FORMAT_RE = r'''(?x)
662                         (?<!%)
663                         %
664                         \({0}\)  # mapping key
665                         (?:[#0\-+ ]+)?  # conversion flags (optional)
666                         (?:\d+)?  # minimum field width (optional)
667                         (?:\.\d+)?  # precision (optional)
668                         [hlL]?  # length modifier (optional)
669                         [diouxXeEfFgGcrs%]  # conversion type
670                     '''
671                     outtmpl = re.sub(
672                         FORMAT_RE.format(numeric_field),
673                         r'%({0})s'.format(numeric_field), outtmpl)
674
675             tmpl = compat_expanduser(outtmpl)
676             filename = tmpl % template_dict
677             # Temporary fix for #4787
678             # 'Treat' all problem characters by passing filename through preferredencoding
679             # to workaround encoding issues with subprocess on python2 @ Windows
680             if sys.version_info < (3, 0) and sys.platform == 'win32':
681                 filename = encodeFilename(filename, True).decode(preferredencoding())
682             return sanitize_path(filename)
683         except ValueError as err:
684             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
685             return None
686
687     def _match_entry(self, info_dict, incomplete):
688         """ Returns None iff the file should be downloaded """
689
690         video_title = info_dict.get('title', info_dict.get('id', 'video'))
691         if 'title' in info_dict:
692             # This can happen when we're just evaluating the playlist
693             title = info_dict['title']
694             matchtitle = self.params.get('matchtitle', False)
695             if matchtitle:
696                 if not re.search(matchtitle, title, re.IGNORECASE):
697                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
698             rejecttitle = self.params.get('rejecttitle', False)
699             if rejecttitle:
700                 if re.search(rejecttitle, title, re.IGNORECASE):
701                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
702         date = info_dict.get('upload_date')
703         if date is not None:
704             dateRange = self.params.get('daterange', DateRange())
705             if date not in dateRange:
706                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
707         view_count = info_dict.get('view_count')
708         if view_count is not None:
709             min_views = self.params.get('min_views')
710             if min_views is not None and view_count < min_views:
711                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
712             max_views = self.params.get('max_views')
713             if max_views is not None and view_count > max_views:
714                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
715         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
716             return 'Skipping "%s" because it is age restricted' % video_title
717         if self.in_download_archive(info_dict):
718             return '%s has already been recorded in archive' % video_title
719
720         if not incomplete:
721             match_filter = self.params.get('match_filter')
722             if match_filter is not None:
723                 ret = match_filter(info_dict)
724                 if ret is not None:
725                     return ret
726
727         return None
728
729     @staticmethod
730     def add_extra_info(info_dict, extra_info):
731         '''Set the keys from extra_info in info dict if they are missing'''
732         for key, value in extra_info.items():
733             info_dict.setdefault(key, value)
734
735     def extract_info(self, url, download=True, ie_key=None, extra_info={},
736                      process=True, force_generic_extractor=False):
737         '''
738         Returns a list with a dictionary for each video we find.
739         If 'download', also downloads the videos.
740         extra_info is a dict containing the extra values to add to each result
741         '''
742
743         if not ie_key and force_generic_extractor:
744             ie_key = 'Generic'
745
746         if ie_key:
747             ies = [self.get_info_extractor(ie_key)]
748         else:
749             ies = self._ies
750
751         for ie in ies:
752             if not ie.suitable(url):
753                 continue
754
755             ie = self.get_info_extractor(ie.ie_key())
756             if not ie.working():
757                 self.report_warning('The program functionality for this site has been marked as broken, '
758                                     'and will probably not work.')
759
760             try:
761                 ie_result = ie.extract(url)
762                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
763                     break
764                 if isinstance(ie_result, list):
765                     # Backwards compatibility: old IE result format
766                     ie_result = {
767                         '_type': 'compat_list',
768                         'entries': ie_result,
769                     }
770                 self.add_default_extra_info(ie_result, ie, url)
771                 if process:
772                     return self.process_ie_result(ie_result, download, extra_info)
773                 else:
774                     return ie_result
775             except GeoRestrictedError as e:
776                 msg = e.msg
777                 if e.countries:
778                     msg += '\nThis video is available in %s.' % ', '.join(
779                         map(ISO3166Utils.short2full, e.countries))
780                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
781                 self.report_error(msg)
782                 break
783             except ExtractorError as e:  # An error we somewhat expected
784                 self.report_error(compat_str(e), e.format_traceback())
785                 break
786             except MaxDownloadsReached:
787                 raise
788             except Exception as e:
789                 if self.params.get('ignoreerrors', False):
790                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
791                     break
792                 else:
793                     raise
794         else:
795             self.report_error('no suitable InfoExtractor for URL %s' % url)
796
797     def add_default_extra_info(self, ie_result, ie, url):
798         self.add_extra_info(ie_result, {
799             'extractor': ie.IE_NAME,
800             'webpage_url': url,
801             'webpage_url_basename': url_basename(url),
802             'extractor_key': ie.ie_key(),
803         })
804
805     def process_ie_result(self, ie_result, download=True, extra_info={}):
806         """
807         Take the result of the ie(may be modified) and resolve all unresolved
808         references (URLs, playlist items).
809
810         It will also download the videos if 'download'.
811         Returns the resolved ie_result.
812         """
813         result_type = ie_result.get('_type', 'video')
814
815         if result_type in ('url', 'url_transparent'):
816             ie_result['url'] = sanitize_url(ie_result['url'])
817             extract_flat = self.params.get('extract_flat', False)
818             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
819                     extract_flat is True):
820                 if self.params.get('forcejson', False):
821                     self.to_stdout(json.dumps(ie_result))
822                 return ie_result
823
824         if result_type == 'video':
825             self.add_extra_info(ie_result, extra_info)
826             return self.process_video_result(ie_result, download=download)
827         elif result_type == 'url':
828             # We have to add extra_info to the results because it may be
829             # contained in a playlist
830             return self.extract_info(ie_result['url'],
831                                      download,
832                                      ie_key=ie_result.get('ie_key'),
833                                      extra_info=extra_info)
834         elif result_type == 'url_transparent':
835             # Use the information from the embedding page
836             info = self.extract_info(
837                 ie_result['url'], ie_key=ie_result.get('ie_key'),
838                 extra_info=extra_info, download=False, process=False)
839
840             force_properties = dict(
841                 (k, v) for k, v in ie_result.items() if v is not None)
842             for f in ('_type', 'url', 'ie_key'):
843                 if f in force_properties:
844                     del force_properties[f]
845             new_result = info.copy()
846             new_result.update(force_properties)
847
848             assert new_result.get('_type') != 'url_transparent'
849
850             return self.process_ie_result(
851                 new_result, download=download, extra_info=extra_info)
852         elif result_type == 'playlist' or result_type == 'multi_video':
853             # We process each entry in the playlist
854             playlist = ie_result.get('title') or ie_result.get('id')
855             self.to_screen('[download] Downloading playlist: %s' % playlist)
856
857             playlist_results = []
858
859             playliststart = self.params.get('playliststart', 1) - 1
860             playlistend = self.params.get('playlistend')
861             # For backwards compatibility, interpret -1 as whole list
862             if playlistend == -1:
863                 playlistend = None
864
865             playlistitems_str = self.params.get('playlist_items')
866             playlistitems = None
867             if playlistitems_str is not None:
868                 def iter_playlistitems(format):
869                     for string_segment in format.split(','):
870                         if '-' in string_segment:
871                             start, end = string_segment.split('-')
872                             for item in range(int(start), int(end) + 1):
873                                 yield int(item)
874                         else:
875                             yield int(string_segment)
876                 playlistitems = iter_playlistitems(playlistitems_str)
877
878             ie_entries = ie_result['entries']
879             if isinstance(ie_entries, list):
880                 n_all_entries = len(ie_entries)
881                 if playlistitems:
882                     entries = [
883                         ie_entries[i - 1] for i in playlistitems
884                         if -n_all_entries <= i - 1 < n_all_entries]
885                 else:
886                     entries = ie_entries[playliststart:playlistend]
887                 n_entries = len(entries)
888                 self.to_screen(
889                     '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
890                     (ie_result['extractor'], playlist, n_all_entries, n_entries))
891             elif isinstance(ie_entries, PagedList):
892                 if playlistitems:
893                     entries = []
894                     for item in playlistitems:
895                         entries.extend(ie_entries.getslice(
896                             item - 1, item
897                         ))
898                 else:
899                     entries = ie_entries.getslice(
900                         playliststart, playlistend)
901                 n_entries = len(entries)
902                 self.to_screen(
903                     '[%s] playlist %s: Downloading %d videos' %
904                     (ie_result['extractor'], playlist, n_entries))
905             else:  # iterable
906                 if playlistitems:
907                     entry_list = list(ie_entries)
908                     entries = [entry_list[i - 1] for i in playlistitems]
909                 else:
910                     entries = list(itertools.islice(
911                         ie_entries, playliststart, playlistend))
912                 n_entries = len(entries)
913                 self.to_screen(
914                     '[%s] playlist %s: Downloading %d videos' %
915                     (ie_result['extractor'], playlist, n_entries))
916
917             if self.params.get('playlistreverse', False):
918                 entries = entries[::-1]
919
920             if self.params.get('playlistrandom', False):
921                 random.shuffle(entries)
922
923             x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
924
925             for i, entry in enumerate(entries, 1):
926                 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
927                 # This __x_forwarded_for_ip thing is a bit ugly but requires
928                 # minimal changes
929                 if x_forwarded_for:
930                     entry['__x_forwarded_for_ip'] = x_forwarded_for
931                 extra = {
932                     'n_entries': n_entries,
933                     'playlist': playlist,
934                     'playlist_id': ie_result.get('id'),
935                     'playlist_title': ie_result.get('title'),
936                     'playlist_index': i + playliststart,
937                     'extractor': ie_result['extractor'],
938                     'webpage_url': ie_result['webpage_url'],
939                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
940                     'extractor_key': ie_result['extractor_key'],
941                 }
942
943                 reason = self._match_entry(entry, incomplete=True)
944                 if reason is not None:
945                     self.to_screen('[download] ' + reason)
946                     continue
947
948                 entry_result = self.process_ie_result(entry,
949                                                       download=download,
950                                                       extra_info=extra)
951                 playlist_results.append(entry_result)
952             ie_result['entries'] = playlist_results
953             self.to_screen('[download] Finished downloading playlist: %s' % playlist)
954             return ie_result
955         elif result_type == 'compat_list':
956             self.report_warning(
957                 'Extractor %s returned a compat_list result. '
958                 'It needs to be updated.' % ie_result.get('extractor'))
959
960             def _fixup(r):
961                 self.add_extra_info(
962                     r,
963                     {
964                         'extractor': ie_result['extractor'],
965                         'webpage_url': ie_result['webpage_url'],
966                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
967                         'extractor_key': ie_result['extractor_key'],
968                     }
969                 )
970                 return r
971             ie_result['entries'] = [
972                 self.process_ie_result(_fixup(r), download, extra_info)
973                 for r in ie_result['entries']
974             ]
975             return ie_result
976         else:
977             raise Exception('Invalid result type: %s' % result_type)
978
979     def _build_format_filter(self, filter_spec):
980         " Returns a function to filter the formats according to the filter_spec "
981
982         OPERATORS = {
983             '<': operator.lt,
984             '<=': operator.le,
985             '>': operator.gt,
986             '>=': operator.ge,
987             '=': operator.eq,
988             '!=': operator.ne,
989         }
990         operator_rex = re.compile(r'''(?x)\s*
991             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
992             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
993             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
994             $
995             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
996         m = operator_rex.search(filter_spec)
997         if m:
998             try:
999                 comparison_value = int(m.group('value'))
1000             except ValueError:
1001                 comparison_value = parse_filesize(m.group('value'))
1002                 if comparison_value is None:
1003                     comparison_value = parse_filesize(m.group('value') + 'B')
1004                 if comparison_value is None:
1005                     raise ValueError(
1006                         'Invalid value %r in format specification %r' % (
1007                             m.group('value'), filter_spec))
1008             op = OPERATORS[m.group('op')]
1009
1010         if not m:
1011             STR_OPERATORS = {
1012                 '=': operator.eq,
1013                 '!=': operator.ne,
1014                 '^=': lambda attr, value: attr.startswith(value),
1015                 '$=': lambda attr, value: attr.endswith(value),
1016                 '*=': lambda attr, value: value in attr,
1017             }
1018             str_operator_rex = re.compile(r'''(?x)
1019                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1020                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1021                 \s*(?P<value>[a-zA-Z0-9._-]+)
1022                 \s*$
1023                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1024             m = str_operator_rex.search(filter_spec)
1025             if m:
1026                 comparison_value = m.group('value')
1027                 op = STR_OPERATORS[m.group('op')]
1028
1029         if not m:
1030             raise ValueError('Invalid filter specification %r' % filter_spec)
1031
1032         def _filter(f):
1033             actual_value = f.get(m.group('key'))
1034             if actual_value is None:
1035                 return m.group('none_inclusive')
1036             return op(actual_value, comparison_value)
1037         return _filter
1038
    def build_format_selector(self, format_spec):
        """Compile the format specification string into a selector function.

        format_spec is tokenized with the Python tokenizer, parsed into a
        list of FormatSelector tuples and turned into a function.  The
        returned function takes a ctx dict (the keys 'formats' and
        'incomplete_formats' are read here) and returns an iterable with
        the selected formats.

        Raises SyntaxError when format_spec is malformed.
        """
        def syntax_error(note, start):
            # Builds (does not raise) a SyntaxError whose message shows the
            # spec with a caret below the column given by start[1]
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Kinds of selectors produced by the parser
        PICKFIRST = 'PICKFIRST'  # 'a/b': the first alternative yielding formats
        MERGE = 'MERGE'  # 'a+b': a pair of formats to be merged
        SINGLE = 'SINGLE'  # a single format name, e.g. 'best' or a format id
        GROUP = 'GROUP'  # a parenthesized list of selectors
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        def _parse_filter(tokens):
            # Consumes tokens up to (and including) the closing ']' and returns
            # the joined text in between; implicitly returns None when the
            # tokens run out before a ']' is seen
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            # Pending joined name and the position of its first token
            # (last_line is never updated below, so it is always yielded as None)
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Start a new pending name or extend the current one
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            # Flush the name that was still pending at the end of the input
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive descent parser returning a list of FormatSelector.
            # The inside_* flags tell which construct the caller is parsing;
            # an operator that ends that construct is pushed back with
            # tokens.restore_last_token() so that the caller sees it again.
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        # A filter without a preceding selector applies to 'best'
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        video_selector = current_selector
                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
                        if not video_selector or not audio_selector:
                            raise syntax_error('"+" must be between two format selectors', start)
                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _build_selector_function(selector):
            # Turns a FormatSelector (or a list of them) into a function
            # taking ctx and returning an iterable of formats
            if isinstance(selector, list):
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(ctx):
                    # Chains the results of all the selectors of the list
                    for f in fs:
                        for format in f(ctx):
                            yield format
                # A list carries no filters of its own, so return right away
                return selector_function
            elif selector.type == GROUP:
                selector_function = _build_selector_function(selector.selector)
            elif selector.type == PICKFIRST:
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(ctx):
                    # Returns the formats of the first alternative that
                    # selects anything
                    for f in fs:
                        picked_formats = list(f(ctx))
                        if picked_formats:
                            return picked_formats
                    return []
            elif selector.type == SINGLE:
                format_spec = selector.selector

                def selector_function(ctx):
                    # NOTE(review): 'best' takes the last and 'worst' the first
                    # element, so ctx['formats'] is presumably ordered from
                    # worst to best - confirm against the caller
                    formats = list(ctx['formats'])
                    if not formats:
                        return
                    if format_spec == 'all':
                        for f in formats:
                            yield f
                    elif format_spec in ['best', 'worst', None]:
                        format_idx = 0 if format_spec == 'worst' else -1
                        audiovideo_formats = [
                            f for f in formats
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            yield audiovideo_formats[format_idx]
                        # for extractors with incomplete formats (audio only (soundcloud)
                        # or video only (imgur)) we will fallback to best/worst
                        # {video,audio}-only format
                        elif ctx['incomplete_formats']:
                            yield formats[format_idx]
                    elif format_spec == 'bestaudio':
                        # Audio formats are those with vcodec set to 'none'
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[-1]
                    elif format_spec == 'worstaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[0]
                    elif format_spec == 'bestvideo':
                        # Video formats are those with acodec set to 'none'
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[-1]
                    elif format_spec == 'worstvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[0]
                    else:
                        # Anything else is either a known extension or a format id
                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                        if format_spec in extensions:
                            filter_f = lambda f: f['ext'] == format_spec
                        else:
                            filter_f = lambda f: f['format_id'] == format_spec
                        matches = list(filter(filter_f, formats))
                        if matches:
                            yield matches[-1]
            elif selector.type == MERGE:
                def _merge(formats_info):
                    # Builds the dict describing the merge of a (video, audio)
                    # pair of formats; returns None after reporting an error
                    # when the pair is not suitable
                    format_1, format_2 = [f['format_id'] for f in formats_info]
                    # The first format must contain the video and the
                    # second the audio
                    if formats_info[0].get('vcodec') == 'none':
                        self.report_error('The first format must '
                                          'contain the video, try using '
                                          '"-f %s+%s"' % (format_2, format_1))
                        return
                    # Formats must be opposite (video+audio)
                    if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
                        self.report_error(
                            'Both formats %s and %s are video-only, you must specify "-f video+audio"'
                            % (format_1, format_2))
                        return
                    # The 'merge_output_format' param overrides the video's extension
                    output_ext = (
                        formats_info[0]['ext']
                        if self.params.get('merge_output_format') is None
                        else self.params['merge_output_format'])
                    return {
                        'requested_formats': formats_info,
                        'format': '%s+%s' % (formats_info[0].get('format'),
                                             formats_info[1].get('format')),
                        'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                formats_info[1].get('format_id')),
                        'width': formats_info[0].get('width'),
                        'height': formats_info[0].get('height'),
                        'resolution': formats_info[0].get('resolution'),
                        'fps': formats_info[0].get('fps'),
                        'vcodec': formats_info[0].get('vcodec'),
                        'vbr': formats_info[0].get('vbr'),
                        'stretched_ratio': formats_info[0].get('stretched_ratio'),
                        'acodec': formats_info[1].get('acodec'),
                        'abr': formats_info[1].get('abr'),
                        'ext': output_ext,
                    }
                video_selector, audio_selector = map(_build_selector_function, selector.selector)

                def selector_function(ctx):
                    # Each sub-selector works on its own deep copy of ctx
                    for pair in itertools.product(
                            video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
                        yield _merge(pair)

            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(ctx):
                # Applies the filters on a deep copy of ctx so that the
                # caller's ctx is left untouched, then runs the selector
                ctx_copy = copy.deepcopy(ctx)
                for _filter in filters:
                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
                return selector_function(ctx_copy)
            return final_selector

        # The tokenizer reads lines of bytes, so feed it the encoded spec
        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            """Iterator over a token list that allows stepping back one token"""

            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            # Python 2 name of the iterator protocol method
            next = __next__

            def restore_last_token(self):
                # Makes the next iteration step return the last token again
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
1303
1304     def _calc_headers(self, info_dict):
1305         res = std_headers.copy()
1306
1307         add_headers = info_dict.get('http_headers')
1308         if add_headers:
1309             res.update(add_headers)
1310
1311         cookies = self._calc_cookies(info_dict)
1312         if cookies:
1313             res['Cookie'] = cookies
1314
1315         if 'X-Forwarded-For' not in res:
1316             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1317             if x_forwarded_for_ip:
1318                 res['X-Forwarded-For'] = x_forwarded_for_ip
1319
1320         return res
1321
1322     def _calc_cookies(self, info_dict):
1323         pr = sanitized_Request(info_dict['url'])
1324         self.cookiejar.add_cookie_header(pr)
1325         return pr.get_header('Cookie')
1326
    def process_video_result(self, info_dict, download=True):
        """Normalize a single video result, select its formats and process them.

        info_dict is modified in place: missing fields are filled in
        (playlist, thumbnails, display_id, upload_date, chapter/season/episode
        titles, per-format format_id/format/ext/protocol/http_headers) and
        URLs are sanitized.

        Returns None right after listing when one of the 'list_thumbnails',
        'listsubtitles' or 'listformats' params is set. Otherwise the
        requested formats are selected, each one is passed to process_info()
        when download is true, and info_dict (updated with the last selected
        format) is returned.

        Raises ExtractorError when 'id' or 'title' is missing, when there are
        no formats, when a format has no 'url', or when the requested format
        is not available.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        if not isinstance(info_dict['id'], compat_str):
            self.report_warning('"id" field is not a string - forcing string conversion')
            info_dict['id'] = compat_str(info_dict['id'])

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Build a 'thumbnails' list from the single 'thumbnail' field if needed
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Ascending sort by (preference, width, height, id, url) with
            # missing values ranked lowest, so the last entry ranks highest
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '', t.get('url')))
            for i, t in enumerate(thumbnails):
                t['url'] = sanitize_url(t['url'])
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                # Thumbnails without an id get their position in the sorted list
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        # Default 'thumbnail' is the last (highest ranked) sorted thumbnail
        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        # Derive 'upload_date' (YYYYMMDD, UTC) from 'timestamp' when missing
        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
            # see http://bugs.python.org/issue1646728)
            try:
                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
            except (ValueError, OverflowError, OSError):
                pass

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle URLs and fill in a missing 'ext' from the URL
        subtitles = info_dict.get('subtitles')
        if subtitles:
            for _, subtitle in subtitles.items():
                for subtitle_format in subtitle:
                    if subtitle_format.get('url'):
                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                    if subtitle_format.get('ext') is None:
                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles,
            info_dict.get('automatic_captions'))

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            # (the info_dict itself then doubles as the single format dict)
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        # Maps each format_id to the list of formats carrying it
        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            if 'url' not in format:
                raise ExtractorError('Missing "url" key in result (index %d)' % i)

            format['url'] = sanitize_url(format['url'])

            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        # (duplicates get a '-<index>' suffix)
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            # Human readable format description: '<id> - <resolution> (<note>)'
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            # (headers are computed from the video-level fields overlaid with
            # the format-level ones)
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        # (only after the headers above have been computed from it)
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return

        # Without an explicit 'format' param the default is 'best', preceded by
        # 'bestvideo+bestaudio' when not writing to '-', the video is not live
        # and the merger reports it is available and can merge
        req_format = self.params.get('format')
        if req_format is None:
            req_format_list = []
            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
                    not info_dict.get('is_live')):
                merger = FFmpegMergerPP(self)
                if merger.available and merger.can_merge():
                    req_format_list.append('bestvideo+bestaudio')
            req_format_list.append('best')
            req_format = '/'.join(req_format_list)
        format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/rg3/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/rg3/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
            # all formats are audio-only
            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                # Each selected format is processed on its own copy of the
                # video-level info overlaid with the format-level fields
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1531
1532     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1533         """Select the requested subtitles and their format"""
1534         available_subs = {}
1535         if normal_subtitles and self.params.get('writesubtitles'):
1536             available_subs.update(normal_subtitles)
1537         if automatic_captions and self.params.get('writeautomaticsub'):
1538             for lang, cap_info in automatic_captions.items():
1539                 if lang not in available_subs:
1540                     available_subs[lang] = cap_info
1541
1542         if (not self.params.get('writesubtitles') and not
1543                 self.params.get('writeautomaticsub') or not
1544                 available_subs):
1545             return None
1546
1547         if self.params.get('allsubtitles', False):
1548             requested_langs = available_subs.keys()
1549         else:
1550             if self.params.get('subtitleslangs', False):
1551                 requested_langs = self.params.get('subtitleslangs')
1552             elif 'en' in available_subs:
1553                 requested_langs = ['en']
1554             else:
1555                 requested_langs = [list(available_subs.keys())[0]]
1556
1557         formats_query = self.params.get('subtitlesformat', 'best')
1558         formats_preference = formats_query.split('/') if formats_query else []
1559         subs = {}
1560         for lang in requested_langs:
1561             formats = available_subs.get(lang)
1562             if formats is None:
1563                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1564                 continue
1565             for ext in formats_preference:
1566                 if ext == 'best':
1567                     f = formats[-1]
1568                     break
1569                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1570                 if matches:
1571                     f = matches[-1]
1572                     break
1573             else:
1574                 f = formats[-1]
1575                 self.report_warning(
1576                     'No subtitle format found matching "%s" for language %s, '
1577                     'using %s' % (formats_query, lang, f['ext']))
1578             subs[lang] = f
1579         return subs
1580
1581     def process_info(self, info_dict):
1582         """Process a single resolved IE result."""
1583
1584         assert info_dict.get('_type', 'video') == 'video'
1585
1586         max_downloads = self.params.get('max_downloads')
1587         if max_downloads is not None:
1588             if self._num_downloads >= int(max_downloads):
1589                 raise MaxDownloadsReached()
1590
1591         info_dict['fulltitle'] = info_dict['title']
1592         if len(info_dict['title']) > 200:
1593             info_dict['title'] = info_dict['title'][:197] + '...'
1594
1595         if 'format' not in info_dict:
1596             info_dict['format'] = info_dict['ext']
1597
1598         reason = self._match_entry(info_dict, incomplete=False)
1599         if reason is not None:
1600             self.to_screen('[download] ' + reason)
1601             return
1602
1603         self._num_downloads += 1
1604
1605         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1606
1607         # Forced printings
1608         if self.params.get('forcetitle', False):
1609             self.to_stdout(info_dict['fulltitle'])
1610         if self.params.get('forceid', False):
1611             self.to_stdout(info_dict['id'])
1612         if self.params.get('forceurl', False):
1613             if info_dict.get('requested_formats') is not None:
1614                 for f in info_dict['requested_formats']:
1615                     self.to_stdout(f['url'] + f.get('play_path', ''))
1616             else:
1617                 # For RTMP URLs, also include the playpath
1618                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1619         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1620             self.to_stdout(info_dict['thumbnail'])
1621         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1622             self.to_stdout(info_dict['description'])
1623         if self.params.get('forcefilename', False) and filename is not None:
1624             self.to_stdout(filename)
1625         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1626             self.to_stdout(formatSeconds(info_dict['duration']))
1627         if self.params.get('forceformat', False):
1628             self.to_stdout(info_dict['format'])
1629         if self.params.get('forcejson', False):
1630             self.to_stdout(json.dumps(info_dict))
1631
1632         # Do nothing else if in simulate mode
1633         if self.params.get('simulate', False):
1634             return
1635
1636         if filename is None:
1637             return
1638
1639         try:
1640             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1641             if dn and not os.path.exists(dn):
1642                 os.makedirs(dn)
1643         except (OSError, IOError) as err:
1644             self.report_error('unable to create directory ' + error_to_compat_str(err))
1645             return
1646
1647         if self.params.get('writedescription', False):
1648             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1649             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1650                 self.to_screen('[info] Video description is already present')
1651             elif info_dict.get('description') is None:
1652                 self.report_warning('There\'s no description to write.')
1653             else:
1654                 try:
1655                     self.to_screen('[info] Writing video description to: ' + descfn)
1656                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1657                         descfile.write(info_dict['description'])
1658                 except (OSError, IOError):
1659                     self.report_error('Cannot write description file ' + descfn)
1660                     return
1661
1662         if self.params.get('writeannotations', False):
1663             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1664             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1665                 self.to_screen('[info] Video annotations are already present')
1666             else:
1667                 try:
1668                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1669                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1670                         annofile.write(info_dict['annotations'])
1671                 except (KeyError, TypeError):
1672                     self.report_warning('There are no annotations to write.')
1673                 except (OSError, IOError):
1674                     self.report_error('Cannot write annotations file: ' + annofn)
1675                     return
1676
1677         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1678                                        self.params.get('writeautomaticsub')])
1679
1680         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1681             # subtitles download errors are already managed as troubles in relevant IE
1682             # that way it will silently go on when used with unsupporting IE
1683             subtitles = info_dict['requested_subtitles']
1684             ie = self.get_info_extractor(info_dict['extractor_key'])
1685             for sub_lang, sub_info in subtitles.items():
1686                 sub_format = sub_info['ext']
1687                 if sub_info.get('data') is not None:
1688                     sub_data = sub_info['data']
1689                 else:
1690                     try:
1691                         sub_data = ie._download_webpage(
1692                             sub_info['url'], info_dict['id'], note=False)
1693                     except ExtractorError as err:
1694                         self.report_warning('Unable to download subtitle for "%s": %s' %
1695                                             (sub_lang, error_to_compat_str(err.cause)))
1696                         continue
1697                 try:
1698                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1699                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1700                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1701                     else:
1702                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1703                         # Use newline='' to prevent conversion of newline characters
1704                         # See https://github.com/rg3/youtube-dl/issues/10268
1705                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1706                             subfile.write(sub_data)
1707                 except (OSError, IOError):
1708                     self.report_error('Cannot write subtitles file ' + sub_filename)
1709                     return
1710
1711         if self.params.get('writeinfojson', False):
1712             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1713             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1714                 self.to_screen('[info] Video description metadata is already present')
1715             else:
1716                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1717                 try:
1718                     write_json_file(self.filter_requested_info(info_dict), infofn)
1719                 except (OSError, IOError):
1720                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1721                     return
1722
1723         self._write_thumbnails(info_dict, filename)
1724
1725         if not self.params.get('skip_download', False):
1726             try:
1727                 def dl(name, info):
1728                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1729                     for ph in self._progress_hooks:
1730                         fd.add_progress_hook(ph)
1731                     if self.params.get('verbose'):
1732                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1733                     return fd.download(name, info)
1734
1735                 if info_dict.get('requested_formats') is not None:
1736                     downloaded = []
1737                     success = True
1738                     merger = FFmpegMergerPP(self)
1739                     if not merger.available:
1740                         postprocessors = []
1741                         self.report_warning('You have requested multiple '
1742                                             'formats but ffmpeg or avconv are not installed.'
1743                                             ' The formats won\'t be merged.')
1744                     else:
1745                         postprocessors = [merger]
1746
1747                     def compatible_formats(formats):
1748                         video, audio = formats
1749                         # Check extension
1750                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1751                         if video_ext and audio_ext:
1752                             COMPATIBLE_EXTS = (
1753                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1754                                 ('webm')
1755                             )
1756                             for exts in COMPATIBLE_EXTS:
1757                                 if video_ext in exts and audio_ext in exts:
1758                                     return True
1759                         # TODO: Check acodec/vcodec
1760                         return False
1761
1762                     filename_real_ext = os.path.splitext(filename)[1][1:]
1763                     filename_wo_ext = (
1764                         os.path.splitext(filename)[0]
1765                         if filename_real_ext == info_dict['ext']
1766                         else filename)
1767                     requested_formats = info_dict['requested_formats']
1768                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1769                         info_dict['ext'] = 'mkv'
1770                         self.report_warning(
1771                             'Requested formats are incompatible for merge and will be merged into mkv.')
1772                     # Ensure filename always has a correct extension for successful merge
1773                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1774                     if os.path.exists(encodeFilename(filename)):
1775                         self.to_screen(
1776                             '[download] %s has already been downloaded and '
1777                             'merged' % filename)
1778                     else:
1779                         for f in requested_formats:
1780                             new_info = dict(info_dict)
1781                             new_info.update(f)
1782                             fname = self.prepare_filename(new_info)
1783                             fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1784                             downloaded.append(fname)
1785                             partial_success = dl(fname, new_info)
1786                             success = success and partial_success
1787                         info_dict['__postprocessors'] = postprocessors
1788                         info_dict['__files_to_merge'] = downloaded
1789                 else:
1790                     # Just a single file
1791                     success = dl(filename, info_dict)
1792             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1793                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1794                 return
1795             except (OSError, IOError) as err:
1796                 raise UnavailableVideoError(err)
1797             except (ContentTooShortError, ) as err:
1798                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1799                 return
1800
1801             if success and filename != '-':
1802                 # Fixup content
1803                 fixup_policy = self.params.get('fixup')
1804                 if fixup_policy is None:
1805                     fixup_policy = 'detect_or_warn'
1806
1807                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1808
1809                 stretched_ratio = info_dict.get('stretched_ratio')
1810                 if stretched_ratio is not None and stretched_ratio != 1:
1811                     if fixup_policy == 'warn':
1812                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1813                             info_dict['id'], stretched_ratio))
1814                     elif fixup_policy == 'detect_or_warn':
1815                         stretched_pp = FFmpegFixupStretchedPP(self)
1816                         if stretched_pp.available:
1817                             info_dict.setdefault('__postprocessors', [])
1818                             info_dict['__postprocessors'].append(stretched_pp)
1819                         else:
1820                             self.report_warning(
1821                                 '%s: Non-uniform pixel ratio (%s). %s'
1822                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1823                     else:
1824                         assert fixup_policy in ('ignore', 'never')
1825
1826                 if (info_dict.get('requested_formats') is None and
1827                         info_dict.get('container') == 'm4a_dash'):
1828                     if fixup_policy == 'warn':
1829                         self.report_warning(
1830                             '%s: writing DASH m4a. '
1831                             'Only some players support this container.'
1832                             % info_dict['id'])
1833                     elif fixup_policy == 'detect_or_warn':
1834                         fixup_pp = FFmpegFixupM4aPP(self)
1835                         if fixup_pp.available:
1836                             info_dict.setdefault('__postprocessors', [])
1837                             info_dict['__postprocessors'].append(fixup_pp)
1838                         else:
1839                             self.report_warning(
1840                                 '%s: writing DASH m4a. '
1841                                 'Only some players support this container. %s'
1842                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1843                     else:
1844                         assert fixup_policy in ('ignore', 'never')
1845
1846                 if (info_dict.get('protocol') == 'm3u8_native' or
1847                         info_dict.get('protocol') == 'm3u8' and
1848                         self.params.get('hls_prefer_native')):
1849                     if fixup_policy == 'warn':
1850                         self.report_warning('%s: malformated aac bitstream.' % (
1851                             info_dict['id']))
1852                     elif fixup_policy == 'detect_or_warn':
1853                         fixup_pp = FFmpegFixupM3u8PP(self)
1854                         if fixup_pp.available:
1855                             info_dict.setdefault('__postprocessors', [])
1856                             info_dict['__postprocessors'].append(fixup_pp)
1857                         else:
1858                             self.report_warning(
1859                                 '%s: malformated aac bitstream. %s'
1860                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1861                     else:
1862                         assert fixup_policy in ('ignore', 'never')
1863
1864                 try:
1865                     self.post_process(filename, info_dict)
1866                 except (PostProcessingError) as err:
1867                     self.report_error('postprocessing: %s' % str(err))
1868                     return
1869                 self.record_download_archive(info_dict)
1870
1871     def download(self, url_list):
1872         """Download a given list of URLs."""
1873         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1874         if (len(url_list) > 1 and
1875                 '%' not in outtmpl and
1876                 self.params.get('max_downloads') != 1):
1877             raise SameFileError(outtmpl)
1878
1879         for url in url_list:
1880             try:
1881                 # It also downloads the videos
1882                 res = self.extract_info(
1883                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1884             except UnavailableVideoError:
1885                 self.report_error('unable to download video')
1886             except MaxDownloadsReached:
1887                 self.to_screen('[info] Maximum number of downloaded files reached.')
1888                 raise
1889             else:
1890                 if self.params.get('dump_single_json', False):
1891                     self.to_stdout(json.dumps(res))
1892
1893         return self._download_retcode
1894
1895     def download_with_info_file(self, info_filename):
1896         with contextlib.closing(fileinput.FileInput(
1897                 [info_filename], mode='r',
1898                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1899             # FileInput doesn't have a read method, we can't call json.load
1900             info = self.filter_requested_info(json.loads('\n'.join(f)))
1901         try:
1902             self.process_ie_result(info, download=True)
1903         except DownloadError:
1904             webpage_url = info.get('webpage_url')
1905             if webpage_url is not None:
1906                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1907                 return self.download([webpage_url])
1908             else:
1909                 raise
1910         return self._download_retcode
1911
1912     @staticmethod
1913     def filter_requested_info(info_dict):
1914         return dict(
1915             (k, v) for k, v in info_dict.items()
1916             if k not in ['requested_formats', 'requested_subtitles'])
1917
1918     def post_process(self, filename, ie_info):
1919         """Run all the postprocessors on the given file."""
1920         info = dict(ie_info)
1921         info['filepath'] = filename
1922         pps_chain = []
1923         if ie_info.get('__postprocessors') is not None:
1924             pps_chain.extend(ie_info['__postprocessors'])
1925         pps_chain.extend(self._pps)
1926         for pp in pps_chain:
1927             files_to_delete = []
1928             try:
1929                 files_to_delete, info = pp.run(info)
1930             except PostProcessingError as e:
1931                 self.report_error(e.msg)
1932             if files_to_delete and not self.params.get('keepvideo', False):
1933                 for old_filename in files_to_delete:
1934                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1935                     try:
1936                         os.remove(encodeFilename(old_filename))
1937                     except (IOError, OSError):
1938                         self.report_warning('Unable to remove downloaded original file')
1939
1940     def _make_archive_id(self, info_dict):
1941         # Future-proof against any change in case
1942         # and backwards compatibility with prior versions
1943         extractor = info_dict.get('extractor_key')
1944         if extractor is None:
1945             if 'id' in info_dict:
1946                 extractor = info_dict.get('ie_key')  # key in a playlist
1947         if extractor is None:
1948             return None  # Incomplete video information
1949         return extractor.lower() + ' ' + info_dict['id']
1950
1951     def in_download_archive(self, info_dict):
1952         fn = self.params.get('download_archive')
1953         if fn is None:
1954             return False
1955
1956         vid_id = self._make_archive_id(info_dict)
1957         if vid_id is None:
1958             return False  # Incomplete video information
1959
1960         try:
1961             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1962                 for line in archive_file:
1963                     if line.strip() == vid_id:
1964                         return True
1965         except IOError as ioe:
1966             if ioe.errno != errno.ENOENT:
1967                 raise
1968         return False
1969
1970     def record_download_archive(self, info_dict):
1971         fn = self.params.get('download_archive')
1972         if fn is None:
1973             return
1974         vid_id = self._make_archive_id(info_dict)
1975         assert vid_id
1976         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1977             archive_file.write(vid_id + '\n')
1978
1979     @staticmethod
1980     def format_resolution(format, default='unknown'):
1981         if format.get('vcodec') == 'none':
1982             return 'audio only'
1983         if format.get('resolution') is not None:
1984             return format['resolution']
1985         if format.get('height') is not None:
1986             if format.get('width') is not None:
1987                 res = '%sx%s' % (format['width'], format['height'])
1988             else:
1989                 res = '%sp' % format['height']
1990         elif format.get('width') is not None:
1991             res = '%dx?' % format['width']
1992         else:
1993             res = default
1994         return res
1995
1996     def _format_note(self, fdict):
1997         res = ''
1998         if fdict.get('ext') in ['f4f', 'f4m']:
1999             res += '(unsupported) '
2000         if fdict.get('language'):
2001             if res:
2002                 res += ' '
2003             res += '[%s] ' % fdict['language']
2004         if fdict.get('format_note') is not None:
2005             res += fdict['format_note'] + ' '
2006         if fdict.get('tbr') is not None:
2007             res += '%4dk ' % fdict['tbr']
2008         if fdict.get('container') is not None:
2009             if res:
2010                 res += ', '
2011             res += '%s container' % fdict['container']
2012         if (fdict.get('vcodec') is not None and
2013                 fdict.get('vcodec') != 'none'):
2014             if res:
2015                 res += ', '
2016             res += fdict['vcodec']
2017             if fdict.get('vbr') is not None:
2018                 res += '@'
2019         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
2020             res += 'video@'
2021         if fdict.get('vbr') is not None:
2022             res += '%4dk' % fdict['vbr']
2023         if fdict.get('fps') is not None:
2024             if res:
2025                 res += ', '
2026             res += '%sfps' % fdict['fps']
2027         if fdict.get('acodec') is not None:
2028             if res:
2029                 res += ', '
2030             if fdict['acodec'] == 'none':
2031                 res += 'video only'
2032             else:
2033                 res += '%-5s' % fdict['acodec']
2034         elif fdict.get('abr') is not None:
2035             if res:
2036                 res += ', '
2037             res += 'audio'
2038         if fdict.get('abr') is not None:
2039             res += '@%3dk' % fdict['abr']
2040         if fdict.get('asr') is not None:
2041             res += ' (%5dHz)' % fdict['asr']
2042         if fdict.get('filesize') is not None:
2043             if res:
2044                 res += ', '
2045             res += format_bytes(fdict['filesize'])
2046         elif fdict.get('filesize_approx') is not None:
2047             if res:
2048                 res += ', '
2049             res += '~' + format_bytes(fdict['filesize_approx'])
2050         return res
2051
2052     def list_formats(self, info_dict):
2053         formats = info_dict.get('formats', [info_dict])
2054         table = [
2055             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2056             for f in formats
2057             if f.get('preference') is None or f['preference'] >= -1000]
2058         if len(formats) > 1:
2059             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2060
2061         header_line = ['format code', 'extension', 'resolution', 'note']
2062         self.to_screen(
2063             '[info] Available formats for %s:\n%s' %
2064             (info_dict['id'], render_table(header_line, table)))
2065
2066     def list_thumbnails(self, info_dict):
2067         thumbnails = info_dict.get('thumbnails')
2068         if not thumbnails:
2069             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2070             return
2071
2072         self.to_screen(
2073             '[info] Thumbnails for %s:' % info_dict['id'])
2074         self.to_screen(render_table(
2075             ['ID', 'width', 'height', 'URL'],
2076             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2077
2078     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2079         if not subtitles:
2080             self.to_screen('%s has no %s' % (video_id, name))
2081             return
2082         self.to_screen(
2083             'Available %s for %s:' % (name, video_id))
2084         self.to_screen(render_table(
2085             ['Language', 'formats'],
2086             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2087                 for lang, formats in subtitles.items()]))
2088
2089     def urlopen(self, req):
2090         """ Start an HTTP download """
2091         if isinstance(req, compat_basestring):
2092             req = sanitized_Request(req)
2093         return self._opener.open(req, timeout=self._socket_timeout)
2094
    def print_debug_header(self):
        """Write the '[debug]' diagnostic header; no-op unless 'verbose' is set.

        Reports, in order: the encodings in use, the youtube-dl version,
        whether lazy extractor loading is enabled, the git HEAD of the
        source directory (when it can be determined), the Python version
        and platform, the versions of external executables and the proxy
        map of the opener.  With the 'call_home' param it additionally
        fetches the public IP address and the latest version from yt-dl.org
        and warns when the running version is older.
        """
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # sys.stdout may lack an 'encoding' attribute; name its type instead
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        # Best effort: ask git for the HEAD of the directory holding this
        # file; any failure is swallowed
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            # Only report output that starts with hexadecimal digits
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            # sys.exc_clear() only exists on Python 2, hence the nested guard
            try:
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        # List only executables with a truthy version, sorted by name
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Merge the proxy settings of every opener handler that has any
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # These requests go over the network via self.urlopen()
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2161
    def _setup_opener(self):
        """Build the URL opener and store it as self._opener.

        Also sets self._socket_timeout (600 unless the 'socket_timeout'
        param is given) and self.cookiejar (a Mozilla-format jar bound to
        the 'cookiefile' param when set, an in-memory jar otherwise).
        Proxies come from the 'proxy' param or, when it is not set, from
        the proxy settings reported by urllib.
        """
        timeout_val = self.params.get('socket_timeout')
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            opts_cookiefile = compat_expanduser(opts_cookiefile)
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Only load existing cookies when the file is readable
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            if opts_proxy == '':
                # An explicitly empty 'proxy' param yields an empty proxy map
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        # The 'debug_printtraffic' param switches on handler debug output
        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2214
2215     def encode(self, s):
2216         if isinstance(s, bytes):
2217             return s  # Already encoded
2218
2219         try:
2220             return s.encode(self.get_encoding())
2221         except UnicodeEncodeError as err:
2222             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2223             raise
2224
2225     def get_encoding(self):
2226         encoding = self.params.get('encoding')
2227         if encoding is None:
2228             encoding = preferredencoding()
2229         return encoding
2230
2231     def _write_thumbnails(self, info_dict, filename):
2232         if self.params.get('writethumbnail', False):
2233             thumbnails = info_dict.get('thumbnails')
2234             if thumbnails:
2235                 thumbnails = [thumbnails[-1]]
2236         elif self.params.get('write_all_thumbnails', False):
2237             thumbnails = info_dict.get('thumbnails')
2238         else:
2239             return
2240
2241         if not thumbnails:
2242             # No thumbnails present, so return immediately
2243             return
2244
2245         for t in thumbnails:
2246             thumb_ext = determine_ext(t['url'], 'jpg')
2247             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2248             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2249             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2250
2251             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2252                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2253                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2254             else:
2255                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2256                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2257                 try:
2258                     uf = self.urlopen(t['url'])
2259                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2260                         shutil.copyfileobj(uf, thumbf)
2261                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2262                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2263                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2264                     self.report_warning('Unable to download thumbnail "%s": %s' %
2265                                         (t['url'], error_to_compat_str(err)))