]> git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/YoutubeDL.py
[utils] Support attributes with no values in get_elements_by_attribute()
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from .compat import (
30     compat_basestring,
31     compat_cookiejar,
32     compat_get_terminal_size,
33     compat_http_client,
34     compat_kwargs,
35     compat_numeric_types,
36     compat_os_name,
37     compat_str,
38     compat_tokenize_tokenize,
39     compat_urllib_error,
40     compat_urllib_request,
41     compat_urllib_request_DataHandler,
42 )
43 from .utils import (
44     age_restricted,
45     args_to_str,
46     ContentTooShortError,
47     date_from_str,
48     DateRange,
49     DEFAULT_OUTTMPL,
50     determine_ext,
51     determine_protocol,
52     DownloadError,
53     encode_compat_str,
54     encodeFilename,
55     error_to_compat_str,
56     expand_path,
57     ExtractorError,
58     format_bytes,
59     formatSeconds,
60     GeoRestrictedError,
61     int_or_none,
62     ISO3166Utils,
63     locked_file,
64     make_HTTPS_handler,
65     MaxDownloadsReached,
66     PagedList,
67     parse_filesize,
68     PerRequestProxyHandler,
69     platform_name,
70     PostProcessingError,
71     preferredencoding,
72     prepend_extension,
73     register_socks_protocols,
74     render_table,
75     replace_extension,
76     SameFileError,
77     sanitize_filename,
78     sanitize_path,
79     sanitize_url,
80     sanitized_Request,
81     std_headers,
82     subtitles_filename,
83     UnavailableVideoError,
84     url_basename,
85     version_tuple,
86     write_json_file,
87     write_string,
88     YoutubeDLCookieProcessor,
89     YoutubeDLHandler,
90 )
91 from .cache import Cache
92 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
93 from .downloader import get_suitable_downloader
94 from .downloader.rtmp import rtmpdump_version
95 from .postprocessor import (
96     FFmpegFixupM3u8PP,
97     FFmpegFixupM4aPP,
98     FFmpegFixupStretchedPP,
99     FFmpegMergerPP,
100     FFmpegPostProcessor,
101     get_postprocessor,
102 )
103 from .version import __version__
104
105 if compat_os_name == 'nt':
106     import ctypes
107
108
109 class YoutubeDL(object):
110     """YoutubeDL class.
111
112     YoutubeDL objects are the ones responsible for downloading the
113     actual video file and writing it to disk if the user has requested
114     it, among some other tasks. In most cases there should be one per
115     program. As, given a video URL, the downloader doesn't know how to
116     extract all the needed information, task that InfoExtractors do, it
117     has to pass the URL to one of them.
118
119     For this, YoutubeDL objects have a method that allows
120     InfoExtractors to be registered in a given order. When it is passed
121     a URL, the YoutubeDL object hands it to the first InfoExtractor it
122     finds that reports being able to handle it. The InfoExtractor extracts
123     all the information about the video or videos the URL refers to, and
124     YoutubeDL processes the extracted information, possibly using a File
125     Downloader to download the video.
126
127     YoutubeDL objects accept a lot of parameters. In order not to saturate
128     the object constructor with arguments, it receives a dictionary of
129     options instead. These options are available through the params
130     attribute for the InfoExtractors to use. The YoutubeDL also
131     registers itself as the downloader in charge for the InfoExtractors
132     that are added to it, so this is a "mutual registration".
133
134     Available options:
135
136     username:          Username for authentication purposes.
137     password:          Password for authentication purposes.
138     videopassword:     Password for accessing a video.
139     ap_mso:            Adobe Pass multiple-system operator identifier.
140     ap_username:       Multiple-system operator account username.
141     ap_password:       Multiple-system operator account password.
142     usenetrc:          Use netrc for authentication instead.
143     verbose:           Print additional info to stdout.
144     quiet:             Do not print messages to stdout.
145     no_warnings:       Do not print out anything for warnings.
146     forceurl:          Force printing final URL.
147     forcetitle:        Force printing title.
148     forceid:           Force printing ID.
149     forcethumbnail:    Force printing thumbnail URL.
150     forcedescription:  Force printing description.
151     forcefilename:     Force printing final filename.
152     forceduration:     Force printing duration.
153     forcejson:         Force printing info_dict as JSON.
154     dump_single_json:  Force printing the info_dict of the whole playlist
155                        (or video) as a single JSON line.
156     simulate:          Do not download the video files.
157     format:            Video format code. See options.py for more information.
158     outtmpl:           Template for output names.
159     restrictfilenames: Do not allow "&" and spaces in file names
160     ignoreerrors:      Do not stop on download errors.
161     force_generic_extractor: Force downloader to use the generic extractor
162     nooverwrites:      Prevent overwriting files.
163     playliststart:     Playlist item to start at.
164     playlistend:       Playlist item to end at.
165     playlist_items:    Specific indices of playlist to download.
166     playlistreverse:   Download playlist items in reverse order.
167     playlistrandom:    Download playlist items in random order.
168     matchtitle:        Download only matching titles.
169     rejecttitle:       Reject downloads for matching titles.
170     logger:            Log messages to a logging.Logger instance.
171     logtostderr:       Log messages to stderr instead of stdout.
172     writedescription:  Write the video description to a .description file
173     writeinfojson:     Write the video description to a .info.json file
174     writeannotations:  Write the video annotations to a .annotations.xml file
175     writethumbnail:    Write the thumbnail image to a file
176     write_all_thumbnails:  Write all thumbnail formats to files
177     writesubtitles:    Write the video subtitles to a file
178     writeautomaticsub: Write the automatically generated subtitles to a file
179     allsubtitles:      Downloads all the subtitles of the video
180                        (requires writesubtitles or writeautomaticsub)
181     listsubtitles:     Lists all available subtitles for the video
182     subtitlesformat:   The format code for subtitles
183     subtitleslangs:    List of languages of the subtitles to download
184     keepvideo:         Keep the video file after post-processing
185     daterange:         A DateRange object, download only if the upload_date is in the range.
186     skip_download:     Skip the actual download of the video file
187     cachedir:          Location of the cache files in the filesystem.
188                        False to disable filesystem cache.
189     noplaylist:        Download single video instead of a playlist if in doubt.
190     age_limit:         An integer representing the user's age in years.
191                        Unsuitable videos for the given age are skipped.
192     min_views:         An integer representing the minimum view count the video
193                        must have in order to not be skipped.
194                        Videos without view count information are always
195                        downloaded. None for no limit.
196     max_views:         An integer representing the maximum view count.
197                        Videos that are more popular than that are not
198                        downloaded.
199                        Videos without view count information are always
200                        downloaded. None for no limit.
201     download_archive:  File name of a file where all downloads are recorded.
202                        Videos already present in the file are not downloaded
203                        again.
204     cookiefile:        File name where cookies should be read from and dumped to.
205     nocheckcertificate:Do not verify SSL certificates
206     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
207                        At the moment, this is only supported by YouTube.
208     proxy:             URL of the proxy server to use
209     geo_verification_proxy:  URL of the proxy to use for IP address verification
210                        on geo-restricted sites. (Experimental)
211     socket_timeout:    Time to wait for unresponsive hosts, in seconds
212     bidi_workaround:   Work around buggy terminals without bidirectional text
213                        support, using fribidi
214     debug_printtraffic:Print out sent and received HTTP traffic
215     include_ads:       Download ads as well
216     default_search:    Prepend this string if an input url is not valid.
217                        'auto' for elaborate guessing
218     encoding:          Use this encoding instead of the system-specified.
219     extract_flat:      Do not resolve URLs, return the immediate result.
220                        Pass in 'in_playlist' to only show this behavior for
221                        playlist items.
222     postprocessors:    A list of dictionaries, each with an entry
223                        * key:  The name of the postprocessor. See
224                                youtube_dl/postprocessor/__init__.py for a list.
225                        as well as any further keyword arguments for the
226                        postprocessor.
227     progress_hooks:    A list of functions that get called on download
228                        progress, with a dictionary with the entries
229                        * status: One of "downloading", "error", or "finished".
230                                  Check this first and ignore unknown values.
231
232                        If status is one of "downloading", or "finished", the
233                        following properties may also be present:
234                        * filename: The final filename (always present)
235                        * tmpfilename: The filename we're currently writing to
236                        * downloaded_bytes: Bytes on disk
237                        * total_bytes: Size of the whole file, None if unknown
238                        * total_bytes_estimate: Guess of the eventual file size,
239                                                None if unavailable.
240                        * elapsed: The number of seconds since download started.
241                        * eta: The estimated time in seconds, None if unknown
242                        * speed: The download speed in bytes/second, None if
243                                 unknown
244                        * fragment_index: The counter of the currently
245                                          downloaded video fragment.
246                        * fragment_count: The number of fragments (= individual
247                                          files that will be merged)
248
249                        Progress hooks are guaranteed to be called at least once
250                        (with status "finished") if the download is successful.
251     merge_output_format: Extension to use when merging formats.
252     fixup:             Automatically correct known faults of the file.
253                        One of:
254                        - "never": do nothing
255                        - "warn": only emit a warning
256                        - "detect_or_warn": check whether we can do anything
257                                            about it, warn otherwise (default)
258     source_address:    (Experimental) Client-side IP address to bind to.
259     call_home:         Boolean, true iff we are allowed to contact the
260                        youtube-dl servers for debugging.
261     sleep_interval:    Number of seconds to sleep before each download when
262                        used alone or a lower bound of a range for randomized
263                        sleep before each download (minimum possible number
264                        of seconds to sleep) when used along with
265                        max_sleep_interval.
266     max_sleep_interval:Upper bound of a range for randomized sleep before each
267                        download (maximum possible number of seconds to sleep).
268                        Must only be used along with sleep_interval.
269                        Actual sleep time will be a random float from range
270                        [sleep_interval; max_sleep_interval].
271     listformats:       Print an overview of available video formats and exit.
272     list_thumbnails:   Print a table of all thumbnails and exit.
273     match_filter:      A function that gets called with the info_dict of
274                        every video.
275                        If it returns a message, the video is ignored.
276                        If it returns None, the video is downloaded.
277                        match_filter_func in utils.py is one example for this.
278     no_color:          Do not emit color codes in output.
279     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
280                        HTTP header (experimental)
281     geo_bypass_country:
282                        Two-letter ISO 3166-2 country code that will be used for
283                        explicit geographic restriction bypassing via faking
284                        X-Forwarded-For HTTP header (experimental)
285
286     The following options determine which downloader is picked:
287     external_downloader: Executable of the external downloader to call.
288                        None or unset for standard (built-in) downloader.
289     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
290                        if True, otherwise use ffmpeg/avconv if False, otherwise
291                        use downloader suggested by extractor if None.
292
293     The following parameters are not used by YoutubeDL itself, they are used by
294     the downloader (see youtube_dl/downloader/common.py):
295     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
296     noresizebuffer, retries, continuedl, noprogress, consoletitle,
297     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
298
299     The following options are used by the post processors:
300     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
301                        otherwise prefer avconv.
302     postprocessor_args: A list of additional command-line arguments for the
303                         postprocessor.
304     """
305
    # Names of info_dict fields that are treated as numeric.
    # prepare_filename() consults this set: when one of these fields is
    # missing from the template dictionary, any format specifier for it in
    # the output template is rewritten to '%(field)s'.
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level placeholders. __init__ rebinds every one of these on the
    # instance, so the lists below are not shared between instances created
    # through the constructor.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
323
    def __init__(self, params=None, auto_init=True):
        """Create a YoutubeDL object with the given options.

        params is an optional dictionary of options (see the class
        docstring); it is merged over the built-in defaults.
        If auto_init is true, the debug header is printed and the default
        info extractors are registered.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Indexing with the boolean: False selects stdout, True selects stderr
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        def check_deprecated(param, option, suggestion):
            # Warn when the deprecated param is set (i.e. not None) and
            # tell the caller whether it was set.
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        # The deprecated proxy option only takes effect when its replacement
        # has not been given.
        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            # Start an external bidi helper: messages are written to its
            # stdin and its output is read back from the master end of a
            # pty whose slave end is the helper's stdout
            # (see _bidi_workaround()).
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                # Try bidiv first and fall back to fribidi if it cannot be
                # started.
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                # A missing executable only disables the workaround; any
                # other OS error is propagated.
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate the configured postprocessors: 'key' selects the
        # class, the remaining entries are passed as keyword arguments.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
417
418     def warn_if_short_id(self, argv):
419         # short YouTube ID starting with dash?
420         idxs = [
421             i for i, a in enumerate(argv)
422             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
423         if idxs:
424             correct_argv = (
425                 ['youtube-dl'] +
426                 [a for i, a in enumerate(argv) if i not in idxs] +
427                 ['--'] + [argv[i] for i in idxs]
428             )
429             self.report_warning(
430                 'Long argument string detected. '
431                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
432                 args_to_str(correct_argv))
433
434     def add_info_extractor(self, ie):
435         """Add an InfoExtractor object to the end of the list."""
436         self._ies.append(ie)
437         if not isinstance(ie, type):
438             self._ies_instances[ie.ie_key()] = ie
439             ie.set_downloader(self)
440
441     def get_info_extractor(self, ie_key):
442         """
443         Get an instance of an IE with name ie_key, it will try to get one from
444         the _ies list, if there's no instance it will create a new one and add
445         it to the extractor list.
446         """
447         ie = self._ies_instances.get(ie_key)
448         if ie is None:
449             ie = get_info_extractor(ie_key)()
450             self.add_info_extractor(ie)
451         return ie
452
453     def add_default_info_extractors(self):
454         """
455         Add the InfoExtractors returned by gen_extractors to the end of the list
456         """
457         for ie in gen_extractor_classes():
458             self.add_info_extractor(ie)
459
460     def add_post_processor(self, pp):
461         """Add a PostProcessor object to the end of the chain."""
462         self._pps.append(pp)
463         pp.set_downloader(self)
464
465     def add_progress_hook(self, ph):
466         """Add the progress hook (currently only for the file downloader)"""
467         self._progress_hooks.append(ph)
468
469     def _bidi_workaround(self, message):
470         if not hasattr(self, '_output_channel'):
471             return message
472
473         assert hasattr(self, '_output_process')
474         assert isinstance(message, compat_str)
475         line_count = message.count('\n') + 1
476         self._output_process.stdin.write((message + '\n').encode('utf-8'))
477         self._output_process.stdin.flush()
478         res = ''.join(self._output_channel.readline().decode('utf-8')
479                       for _ in range(line_count))
480         return res[:-len('\n')]
481
482     def to_screen(self, message, skip_eol=False):
483         """Print message to stdout if not in quiet mode."""
484         return self.to_stdout(message, skip_eol, check_quiet=True)
485
486     def _write_string(self, s, out=None):
487         write_string(s, out=out, encoding=self.params.get('encoding'))
488
489     def to_stdout(self, message, skip_eol=False, check_quiet=False):
490         """Print message to stdout if not in quiet mode."""
491         if self.params.get('logger'):
492             self.params['logger'].debug(message)
493         elif not check_quiet or not self.params.get('quiet', False):
494             message = self._bidi_workaround(message)
495             terminator = ['\n', ''][skip_eol]
496             output = message + terminator
497
498             self._write_string(output, self._screen_file)
499
500     def to_stderr(self, message):
501         """Print message to stderr."""
502         assert isinstance(message, compat_str)
503         if self.params.get('logger'):
504             self.params['logger'].error(message)
505         else:
506             message = self._bidi_workaround(message)
507             output = message + '\n'
508             self._write_string(output, self._err_file)
509
510     def to_console_title(self, message):
511         if not self.params.get('consoletitle', False):
512             return
513         if compat_os_name == 'nt':
514             if ctypes.windll.kernel32.GetConsoleWindow():
515                 # c_wchar_p() might not be necessary if `message` is
516                 # already of type unicode()
517                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
518         elif 'TERM' in os.environ:
519             self._write_string('\033]0;%s\007' % message, self._screen_file)
520
521     def save_console_title(self):
522         if not self.params.get('consoletitle', False):
523             return
524         if compat_os_name != 'nt' and 'TERM' in os.environ:
525             # Save the title on stack
526             self._write_string('\033[22;0t', self._screen_file)
527
528     def restore_console_title(self):
529         if not self.params.get('consoletitle', False):
530             return
531         if compat_os_name != 'nt' and 'TERM' in os.environ:
532             # Restore the title from stack
533             self._write_string('\033[23;0t', self._screen_file)
534
535     def __enter__(self):
536         self.save_console_title()
537         return self
538
539     def __exit__(self, *args):
540         self.restore_console_title()
541
542         if self.params.get('cookiefile') is not None:
543             self.cookiejar.save()
544
545     def trouble(self, message=None, tb=None):
546         """Determine action to take when a download problem appears.
547
548         Depending on if the downloader has been configured to ignore
549         download errors or not, this method may throw an exception or
550         not when errors are found, after printing the message.
551
552         tb, if given, is additional traceback information.
553         """
554         if message is not None:
555             self.to_stderr(message)
556         if self.params.get('verbose'):
557             if tb is None:
558                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
559                     tb = ''
560                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
561                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
562                     tb += encode_compat_str(traceback.format_exc())
563                 else:
564                     tb_data = traceback.format_list(traceback.extract_stack())
565                     tb = ''.join(tb_data)
566             self.to_stderr(tb)
567         if not self.params.get('ignoreerrors', False):
568             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
569                 exc_info = sys.exc_info()[1].exc_info
570             else:
571                 exc_info = sys.exc_info()
572             raise DownloadError(message, exc_info)
573         self._download_retcode = 1
574
575     def report_warning(self, message):
576         '''
577         Print the message to stderr, it will be prefixed with 'WARNING:'
578         If stderr is a tty file the 'WARNING:' will be colored
579         '''
580         if self.params.get('logger') is not None:
581             self.params['logger'].warning(message)
582         else:
583             if self.params.get('no_warnings'):
584                 return
585             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
586                 _msg_header = '\033[0;33mWARNING:\033[0m'
587             else:
588                 _msg_header = 'WARNING:'
589             warning_message = '%s %s' % (_msg_header, message)
590             self.to_stderr(warning_message)
591
592     def report_error(self, message, tb=None):
593         '''
594         Do the same as trouble, but prefixes the message with 'ERROR:', colored
595         in red if stderr is a tty file.
596         '''
597         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
598             _msg_header = '\033[0;31mERROR:\033[0m'
599         else:
600             _msg_header = 'ERROR:'
601         error_message = '%s %s' % (_msg_header, message)
602         self.trouble(error_message, tb)
603
604     def report_file_already_downloaded(self, file_name):
605         """Report file has already been fully downloaded."""
606         try:
607             self.to_screen('[download] %s has already been downloaded' % file_name)
608         except UnicodeEncodeError:
609             self.to_screen('[download] The file has already been downloaded')
610
611     def prepare_filename(self, info_dict):
612         """Generate the output filename."""
613         try:
614             template_dict = dict(info_dict)
615
616             template_dict['epoch'] = int(time.time())
617             autonumber_size = self.params.get('autonumber_size')
618             if autonumber_size is None:
619                 autonumber_size = 5
620             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
621             if template_dict.get('resolution') is None:
622                 if template_dict.get('width') and template_dict.get('height'):
623                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
624                 elif template_dict.get('height'):
625                     template_dict['resolution'] = '%sp' % template_dict['height']
626                 elif template_dict.get('width'):
627                     template_dict['resolution'] = '%dx?' % template_dict['width']
628
629             sanitize = lambda k, v: sanitize_filename(
630                 compat_str(v),
631                 restricted=self.params.get('restrictfilenames'),
632                 is_id=(k == 'id' or k.endswith('_id')))
633             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
634                                  for k, v in template_dict.items()
635                                  if v is not None and not isinstance(v, (list, tuple, dict)))
636             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
637
638             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
639
640             # For fields playlist_index and autonumber convert all occurrences
641             # of %(field)s to %(field)0Nd for backward compatibility
642             field_size_compat_map = {
643                 'playlist_index': len(str(template_dict['n_entries'])),
644                 'autonumber': autonumber_size,
645             }
646             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
647             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
648             if mobj:
649                 outtmpl = re.sub(
650                     FIELD_SIZE_COMPAT_RE,
651                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
652                     outtmpl)
653
654             # Missing numeric fields used together with integer presentation types
655             # in format specification will break the argument substitution since
656             # string 'NA' is returned for missing fields. We will patch output
657             # template for missing fields to meet string presentation type.
658             for numeric_field in self._NUMERIC_FIELDS:
659                 if numeric_field not in template_dict:
660                     # As of [1] format syntax is:
661                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
662                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
663                     FORMAT_RE = r'''(?x)
664                         (?<!%)
665                         %
666                         \({0}\)  # mapping key
667                         (?:[#0\-+ ]+)?  # conversion flags (optional)
668                         (?:\d+)?  # minimum field width (optional)
669                         (?:\.\d+)?  # precision (optional)
670                         [hlL]?  # length modifier (optional)
671                         [diouxXeEfFgGcrs%]  # conversion type
672                     '''
673                     outtmpl = re.sub(
674                         FORMAT_RE.format(numeric_field),
675                         r'%({0})s'.format(numeric_field), outtmpl)
676
677             filename = expand_path(outtmpl % template_dict)
678             # Temporary fix for #4787
679             # 'Treat' all problem characters by passing filename through preferredencoding
680             # to workaround encoding issues with subprocess on python2 @ Windows
681             if sys.version_info < (3, 0) and sys.platform == 'win32':
682                 filename = encodeFilename(filename, True).decode(preferredencoding())
683             return sanitize_path(filename)
684         except ValueError as err:
685             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
686             return None
687
688     def _match_entry(self, info_dict, incomplete):
689         """ Returns None iff the file should be downloaded """
690
691         video_title = info_dict.get('title', info_dict.get('id', 'video'))
692         if 'title' in info_dict:
693             # This can happen when we're just evaluating the playlist
694             title = info_dict['title']
695             matchtitle = self.params.get('matchtitle', False)
696             if matchtitle:
697                 if not re.search(matchtitle, title, re.IGNORECASE):
698                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
699             rejecttitle = self.params.get('rejecttitle', False)
700             if rejecttitle:
701                 if re.search(rejecttitle, title, re.IGNORECASE):
702                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
703         date = info_dict.get('upload_date')
704         if date is not None:
705             dateRange = self.params.get('daterange', DateRange())
706             if date not in dateRange:
707                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
708         view_count = info_dict.get('view_count')
709         if view_count is not None:
710             min_views = self.params.get('min_views')
711             if min_views is not None and view_count < min_views:
712                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
713             max_views = self.params.get('max_views')
714             if max_views is not None and view_count > max_views:
715                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
716         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
717             return 'Skipping "%s" because it is age restricted' % video_title
718         if self.in_download_archive(info_dict):
719             return '%s has already been recorded in archive' % video_title
720
721         if not incomplete:
722             match_filter = self.params.get('match_filter')
723             if match_filter is not None:
724                 ret = match_filter(info_dict)
725                 if ret is not None:
726                     return ret
727
728         return None
729
730     @staticmethod
731     def add_extra_info(info_dict, extra_info):
732         '''Set the keys from extra_info in info dict if they are missing'''
733         for key, value in extra_info.items():
734             info_dict.setdefault(key, value)
735
736     def extract_info(self, url, download=True, ie_key=None, extra_info={},
737                      process=True, force_generic_extractor=False):
738         '''
739         Returns a list with a dictionary for each video we find.
740         If 'download', also downloads the videos.
741         extra_info is a dict containing the extra values to add to each result
742         '''
743
744         if not ie_key and force_generic_extractor:
745             ie_key = 'Generic'
746
747         if ie_key:
748             ies = [self.get_info_extractor(ie_key)]
749         else:
750             ies = self._ies
751
752         for ie in ies:
753             if not ie.suitable(url):
754                 continue
755
756             ie = self.get_info_extractor(ie.ie_key())
757             if not ie.working():
758                 self.report_warning('The program functionality for this site has been marked as broken, '
759                                     'and will probably not work.')
760
761             try:
762                 ie_result = ie.extract(url)
763                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
764                     break
765                 if isinstance(ie_result, list):
766                     # Backwards compatibility: old IE result format
767                     ie_result = {
768                         '_type': 'compat_list',
769                         'entries': ie_result,
770                     }
771                 self.add_default_extra_info(ie_result, ie, url)
772                 if process:
773                     return self.process_ie_result(ie_result, download, extra_info)
774                 else:
775                     return ie_result
776             except GeoRestrictedError as e:
777                 msg = e.msg
778                 if e.countries:
779                     msg += '\nThis video is available in %s.' % ', '.join(
780                         map(ISO3166Utils.short2full, e.countries))
781                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
782                 self.report_error(msg)
783                 break
784             except ExtractorError as e:  # An error we somewhat expected
785                 self.report_error(compat_str(e), e.format_traceback())
786                 break
787             except MaxDownloadsReached:
788                 raise
789             except Exception as e:
790                 if self.params.get('ignoreerrors', False):
791                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
792                     break
793                 else:
794                     raise
795         else:
796             self.report_error('no suitable InfoExtractor for URL %s' % url)
797
798     def add_default_extra_info(self, ie_result, ie, url):
799         self.add_extra_info(ie_result, {
800             'extractor': ie.IE_NAME,
801             'webpage_url': url,
802             'webpage_url_basename': url_basename(url),
803             'extractor_key': ie.ie_key(),
804         })
805
806     def process_ie_result(self, ie_result, download=True, extra_info={}):
807         """
808         Take the result of the ie(may be modified) and resolve all unresolved
809         references (URLs, playlist items).
810
811         It will also download the videos if 'download'.
812         Returns the resolved ie_result.
813         """
814         result_type = ie_result.get('_type', 'video')
815
816         if result_type in ('url', 'url_transparent'):
817             ie_result['url'] = sanitize_url(ie_result['url'])
818             extract_flat = self.params.get('extract_flat', False)
819             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
820                     extract_flat is True):
821                 if self.params.get('forcejson', False):
822                     self.to_stdout(json.dumps(ie_result))
823                 return ie_result
824
825         if result_type == 'video':
826             self.add_extra_info(ie_result, extra_info)
827             return self.process_video_result(ie_result, download=download)
828         elif result_type == 'url':
829             # We have to add extra_info to the results because it may be
830             # contained in a playlist
831             return self.extract_info(ie_result['url'],
832                                      download,
833                                      ie_key=ie_result.get('ie_key'),
834                                      extra_info=extra_info)
835         elif result_type == 'url_transparent':
836             # Use the information from the embedding page
837             info = self.extract_info(
838                 ie_result['url'], ie_key=ie_result.get('ie_key'),
839                 extra_info=extra_info, download=False, process=False)
840
841             # extract_info may return None when ignoreerrors is enabled and
842             # extraction failed with an error, don't crash and return early
843             # in this case
844             if not info:
845                 return info
846
847             force_properties = dict(
848                 (k, v) for k, v in ie_result.items() if v is not None)
849             for f in ('_type', 'url', 'ie_key'):
850                 if f in force_properties:
851                     del force_properties[f]
852             new_result = info.copy()
853             new_result.update(force_properties)
854
855             # Extracted info may not be a video result (i.e.
856             # info.get('_type', 'video') != video) but rather an url or
857             # url_transparent. In such cases outer metadata (from ie_result)
858             # should be propagated to inner one (info). For this to happen
859             # _type of info should be overridden with url_transparent. This
860             # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
861             if new_result.get('_type') == 'url':
862                 new_result['_type'] = 'url_transparent'
863
864             return self.process_ie_result(
865                 new_result, download=download, extra_info=extra_info)
866         elif result_type in ('playlist', 'multi_video'):
867             # We process each entry in the playlist
868             playlist = ie_result.get('title') or ie_result.get('id')
869             self.to_screen('[download] Downloading playlist: %s' % playlist)
870
871             playlist_results = []
872
873             playliststart = self.params.get('playliststart', 1) - 1
874             playlistend = self.params.get('playlistend')
875             # For backwards compatibility, interpret -1 as whole list
876             if playlistend == -1:
877                 playlistend = None
878
879             playlistitems_str = self.params.get('playlist_items')
880             playlistitems = None
881             if playlistitems_str is not None:
882                 def iter_playlistitems(format):
883                     for string_segment in format.split(','):
884                         if '-' in string_segment:
885                             start, end = string_segment.split('-')
886                             for item in range(int(start), int(end) + 1):
887                                 yield int(item)
888                         else:
889                             yield int(string_segment)
890                 playlistitems = iter_playlistitems(playlistitems_str)
891
892             ie_entries = ie_result['entries']
893             if isinstance(ie_entries, list):
894                 n_all_entries = len(ie_entries)
895                 if playlistitems:
896                     entries = [
897                         ie_entries[i - 1] for i in playlistitems
898                         if -n_all_entries <= i - 1 < n_all_entries]
899                 else:
900                     entries = ie_entries[playliststart:playlistend]
901                 n_entries = len(entries)
902                 self.to_screen(
903                     '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
904                     (ie_result['extractor'], playlist, n_all_entries, n_entries))
905             elif isinstance(ie_entries, PagedList):
906                 if playlistitems:
907                     entries = []
908                     for item in playlistitems:
909                         entries.extend(ie_entries.getslice(
910                             item - 1, item
911                         ))
912                 else:
913                     entries = ie_entries.getslice(
914                         playliststart, playlistend)
915                 n_entries = len(entries)
916                 self.to_screen(
917                     '[%s] playlist %s: Downloading %d videos' %
918                     (ie_result['extractor'], playlist, n_entries))
919             else:  # iterable
920                 if playlistitems:
921                     entry_list = list(ie_entries)
922                     entries = [entry_list[i - 1] for i in playlistitems]
923                 else:
924                     entries = list(itertools.islice(
925                         ie_entries, playliststart, playlistend))
926                 n_entries = len(entries)
927                 self.to_screen(
928                     '[%s] playlist %s: Downloading %d videos' %
929                     (ie_result['extractor'], playlist, n_entries))
930
931             if self.params.get('playlistreverse', False):
932                 entries = entries[::-1]
933
934             if self.params.get('playlistrandom', False):
935                 random.shuffle(entries)
936
937             x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
938
939             for i, entry in enumerate(entries, 1):
940                 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
941                 # This __x_forwarded_for_ip thing is a bit ugly but requires
942                 # minimal changes
943                 if x_forwarded_for:
944                     entry['__x_forwarded_for_ip'] = x_forwarded_for
945                 extra = {
946                     'n_entries': n_entries,
947                     'playlist': playlist,
948                     'playlist_id': ie_result.get('id'),
949                     'playlist_title': ie_result.get('title'),
950                     'playlist_index': i + playliststart,
951                     'extractor': ie_result['extractor'],
952                     'webpage_url': ie_result['webpage_url'],
953                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
954                     'extractor_key': ie_result['extractor_key'],
955                 }
956
957                 reason = self._match_entry(entry, incomplete=True)
958                 if reason is not None:
959                     self.to_screen('[download] ' + reason)
960                     continue
961
962                 entry_result = self.process_ie_result(entry,
963                                                       download=download,
964                                                       extra_info=extra)
965                 playlist_results.append(entry_result)
966             ie_result['entries'] = playlist_results
967             self.to_screen('[download] Finished downloading playlist: %s' % playlist)
968             return ie_result
969         elif result_type == 'compat_list':
970             self.report_warning(
971                 'Extractor %s returned a compat_list result. '
972                 'It needs to be updated.' % ie_result.get('extractor'))
973
974             def _fixup(r):
975                 self.add_extra_info(
976                     r,
977                     {
978                         'extractor': ie_result['extractor'],
979                         'webpage_url': ie_result['webpage_url'],
980                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
981                         'extractor_key': ie_result['extractor_key'],
982                     }
983                 )
984                 return r
985             ie_result['entries'] = [
986                 self.process_ie_result(_fixup(r), download, extra_info)
987                 for r in ie_result['entries']
988             ]
989             return ie_result
990         else:
991             raise Exception('Invalid result type: %s' % result_type)
992
993     def _build_format_filter(self, filter_spec):
994         " Returns a function to filter the formats according to the filter_spec "
995
996         OPERATORS = {
997             '<': operator.lt,
998             '<=': operator.le,
999             '>': operator.gt,
1000             '>=': operator.ge,
1001             '=': operator.eq,
1002             '!=': operator.ne,
1003         }
1004         operator_rex = re.compile(r'''(?x)\s*
1005             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
1006             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1007             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1008             $
1009             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1010         m = operator_rex.search(filter_spec)
1011         if m:
1012             try:
1013                 comparison_value = int(m.group('value'))
1014             except ValueError:
1015                 comparison_value = parse_filesize(m.group('value'))
1016                 if comparison_value is None:
1017                     comparison_value = parse_filesize(m.group('value') + 'B')
1018                 if comparison_value is None:
1019                     raise ValueError(
1020                         'Invalid value %r in format specification %r' % (
1021                             m.group('value'), filter_spec))
1022             op = OPERATORS[m.group('op')]
1023
1024         if not m:
1025             STR_OPERATORS = {
1026                 '=': operator.eq,
1027                 '!=': operator.ne,
1028                 '^=': lambda attr, value: attr.startswith(value),
1029                 '$=': lambda attr, value: attr.endswith(value),
1030                 '*=': lambda attr, value: value in attr,
1031             }
1032             str_operator_rex = re.compile(r'''(?x)
1033                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1034                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1035                 \s*(?P<value>[a-zA-Z0-9._-]+)
1036                 \s*$
1037                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1038             m = str_operator_rex.search(filter_spec)
1039             if m:
1040                 comparison_value = m.group('value')
1041                 op = STR_OPERATORS[m.group('op')]
1042
1043         if not m:
1044             raise ValueError('Invalid filter specification %r' % filter_spec)
1045
1046         def _filter(f):
1047             actual_value = f.get(m.group('key'))
1048             if actual_value is None:
1049                 return m.group('none_inclusive')
1050             return op(actual_value, comparison_value)
1051         return _filter
1052
1053     def build_format_selector(self, format_spec):
1054         def syntax_error(note, start):
1055             message = (
1056                 'Invalid format specification: '
1057                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1058             return SyntaxError(message)
1059
1060         PICKFIRST = 'PICKFIRST'
1061         MERGE = 'MERGE'
1062         SINGLE = 'SINGLE'
1063         GROUP = 'GROUP'
1064         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1065
1066         def _parse_filter(tokens):
1067             filter_parts = []
1068             for type, string, start, _, _ in tokens:
1069                 if type == tokenize.OP and string == ']':
1070                     return ''.join(filter_parts)
1071                 else:
1072                     filter_parts.append(string)
1073
1074         def _remove_unused_ops(tokens):
1075             # Remove operators that we don't use and join them with the surrounding strings
1076             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1077             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1078             last_string, last_start, last_end, last_line = None, None, None, None
1079             for type, string, start, end, line in tokens:
1080                 if type == tokenize.OP and string == '[':
1081                     if last_string:
1082                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1083                         last_string = None
1084                     yield type, string, start, end, line
1085                     # everything inside brackets will be handled by _parse_filter
1086                     for type, string, start, end, line in tokens:
1087                         yield type, string, start, end, line
1088                         if type == tokenize.OP and string == ']':
1089                             break
1090                 elif type == tokenize.OP and string in ALLOWED_OPS:
1091                     if last_string:
1092                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1093                         last_string = None
1094                     yield type, string, start, end, line
1095                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1096                     if not last_string:
1097                         last_string = string
1098                         last_start = start
1099                         last_end = end
1100                     else:
1101                         last_string += string
1102             if last_string:
1103                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1104
1105         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1106             selectors = []
1107             current_selector = None
1108             for type, string, start, _, _ in tokens:
1109                 # ENCODING is only defined in python 3.x
1110                 if type == getattr(tokenize, 'ENCODING', None):
1111                     continue
1112                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1113                     current_selector = FormatSelector(SINGLE, string, [])
1114                 elif type == tokenize.OP:
1115                     if string == ')':
1116                         if not inside_group:
1117                             # ')' will be handled by the parentheses group
1118                             tokens.restore_last_token()
1119                         break
1120                     elif inside_merge and string in ['/', ',']:
1121                         tokens.restore_last_token()
1122                         break
1123                     elif inside_choice and string == ',':
1124                         tokens.restore_last_token()
1125                         break
1126                     elif string == ',':
1127                         if not current_selector:
1128                             raise syntax_error('"," must follow a format selector', start)
1129                         selectors.append(current_selector)
1130                         current_selector = None
1131                     elif string == '/':
1132                         if not current_selector:
1133                             raise syntax_error('"/" must follow a format selector', start)
1134                         first_choice = current_selector
1135                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1136                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1137                     elif string == '[':
1138                         if not current_selector:
1139                             current_selector = FormatSelector(SINGLE, 'best', [])
1140                         format_filter = _parse_filter(tokens)
1141                         current_selector.filters.append(format_filter)
1142                     elif string == '(':
1143                         if current_selector:
1144                             raise syntax_error('Unexpected "("', start)
1145                         group = _parse_format_selection(tokens, inside_group=True)
1146                         current_selector = FormatSelector(GROUP, group, [])
1147                     elif string == '+':
1148                         video_selector = current_selector
1149                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1150                         if not video_selector or not audio_selector:
1151                             raise syntax_error('"+" must be between two format selectors', start)
1152                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1153                     else:
1154                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1155                 elif type == tokenize.ENDMARKER:
1156                     break
1157             if current_selector:
1158                 selectors.append(current_selector)
1159             return selectors
1160
1161         def _build_selector_function(selector):
1162             if isinstance(selector, list):
1163                 fs = [_build_selector_function(s) for s in selector]
1164
1165                 def selector_function(ctx):
1166                     for f in fs:
1167                         for format in f(ctx):
1168                             yield format
1169                 return selector_function
1170             elif selector.type == GROUP:
1171                 selector_function = _build_selector_function(selector.selector)
1172             elif selector.type == PICKFIRST:
1173                 fs = [_build_selector_function(s) for s in selector.selector]
1174
1175                 def selector_function(ctx):
1176                     for f in fs:
1177                         picked_formats = list(f(ctx))
1178                         if picked_formats:
1179                             return picked_formats
1180                     return []
1181             elif selector.type == SINGLE:
1182                 format_spec = selector.selector
1183
1184                 def selector_function(ctx):
1185                     formats = list(ctx['formats'])
1186                     if not formats:
1187                         return
1188                     if format_spec == 'all':
1189                         for f in formats:
1190                             yield f
1191                     elif format_spec in ['best', 'worst', None]:
1192                         format_idx = 0 if format_spec == 'worst' else -1
1193                         audiovideo_formats = [
1194                             f for f in formats
1195                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1196                         if audiovideo_formats:
1197                             yield audiovideo_formats[format_idx]
1198                         # for extractors with incomplete formats (audio only (soundcloud)
1199                         # or video only (imgur)) we will fallback to best/worst
1200                         # {video,audio}-only format
1201                         elif ctx['incomplete_formats']:
1202                             yield formats[format_idx]
1203                     elif format_spec == 'bestaudio':
1204                         audio_formats = [
1205                             f for f in formats
1206                             if f.get('vcodec') == 'none']
1207                         if audio_formats:
1208                             yield audio_formats[-1]
1209                     elif format_spec == 'worstaudio':
1210                         audio_formats = [
1211                             f for f in formats
1212                             if f.get('vcodec') == 'none']
1213                         if audio_formats:
1214                             yield audio_formats[0]
1215                     elif format_spec == 'bestvideo':
1216                         video_formats = [
1217                             f for f in formats
1218                             if f.get('acodec') == 'none']
1219                         if video_formats:
1220                             yield video_formats[-1]
1221                     elif format_spec == 'worstvideo':
1222                         video_formats = [
1223                             f for f in formats
1224                             if f.get('acodec') == 'none']
1225                         if video_formats:
1226                             yield video_formats[0]
1227                     else:
1228                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1229                         if format_spec in extensions:
1230                             filter_f = lambda f: f['ext'] == format_spec
1231                         else:
1232                             filter_f = lambda f: f['format_id'] == format_spec
1233                         matches = list(filter(filter_f, formats))
1234                         if matches:
1235                             yield matches[-1]
1236             elif selector.type == MERGE:
1237                 def _merge(formats_info):
1238                     format_1, format_2 = [f['format_id'] for f in formats_info]
1239                     # The first format must contain the video and the
1240                     # second the audio
1241                     if formats_info[0].get('vcodec') == 'none':
1242                         self.report_error('The first format must '
1243                                           'contain the video, try using '
1244                                           '"-f %s+%s"' % (format_2, format_1))
1245                         return
1246                     # Formats must be opposite (video+audio)
1247                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1248                         self.report_error(
1249                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1250                             % (format_1, format_2))
1251                         return
1252                     output_ext = (
1253                         formats_info[0]['ext']
1254                         if self.params.get('merge_output_format') is None
1255                         else self.params['merge_output_format'])
1256                     return {
1257                         'requested_formats': formats_info,
1258                         'format': '%s+%s' % (formats_info[0].get('format'),
1259                                              formats_info[1].get('format')),
1260                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1261                                                 formats_info[1].get('format_id')),
1262                         'width': formats_info[0].get('width'),
1263                         'height': formats_info[0].get('height'),
1264                         'resolution': formats_info[0].get('resolution'),
1265                         'fps': formats_info[0].get('fps'),
1266                         'vcodec': formats_info[0].get('vcodec'),
1267                         'vbr': formats_info[0].get('vbr'),
1268                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1269                         'acodec': formats_info[1].get('acodec'),
1270                         'abr': formats_info[1].get('abr'),
1271                         'ext': output_ext,
1272                     }
1273                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1274
1275                 def selector_function(ctx):
1276                     for pair in itertools.product(
1277                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1278                         yield _merge(pair)
1279
1280             filters = [self._build_format_filter(f) for f in selector.filters]
1281
1282             def final_selector(ctx):
1283                 ctx_copy = copy.deepcopy(ctx)
1284                 for _filter in filters:
1285                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1286                 return selector_function(ctx_copy)
1287             return final_selector
1288
1289         stream = io.BytesIO(format_spec.encode('utf-8'))
1290         try:
1291             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1292         except tokenize.TokenError:
1293             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1294
1295         class TokenIterator(object):
1296             def __init__(self, tokens):
1297                 self.tokens = tokens
1298                 self.counter = 0
1299
1300             def __iter__(self):
1301                 return self
1302
1303             def __next__(self):
1304                 if self.counter >= len(self.tokens):
1305                     raise StopIteration()
1306                 value = self.tokens[self.counter]
1307                 self.counter += 1
1308                 return value
1309
1310             next = __next__
1311
1312             def restore_last_token(self):
1313                 self.counter -= 1
1314
1315         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1316         return _build_selector_function(parsed_selector)
1317
1318     def _calc_headers(self, info_dict):
1319         res = std_headers.copy()
1320
1321         add_headers = info_dict.get('http_headers')
1322         if add_headers:
1323             res.update(add_headers)
1324
1325         cookies = self._calc_cookies(info_dict)
1326         if cookies:
1327             res['Cookie'] = cookies
1328
1329         if 'X-Forwarded-For' not in res:
1330             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1331             if x_forwarded_for_ip:
1332                 res['X-Forwarded-For'] = x_forwarded_for_ip
1333
1334         return res
1335
1336     def _calc_cookies(self, info_dict):
1337         pr = sanitized_Request(info_dict['url'])
1338         self.cookiejar.add_cookie_header(pr)
1339         return pr.get_header('Cookie')
1340
1341     def process_video_result(self, info_dict, download=True):
1342         assert info_dict.get('_type', 'video') == 'video'
1343
1344         if 'id' not in info_dict:
1345             raise ExtractorError('Missing "id" field in extractor result')
1346         if 'title' not in info_dict:
1347             raise ExtractorError('Missing "title" field in extractor result')
1348
1349         def report_force_conversion(field, field_not, conversion):
1350             self.report_warning(
1351                 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
1352                 % (field, field_not, conversion))
1353
1354         def sanitize_string_field(info, string_field):
1355             field = info.get(string_field)
1356             if field is None or isinstance(field, compat_str):
1357                 return
1358             report_force_conversion(string_field, 'a string', 'string')
1359             info[string_field] = compat_str(field)
1360
1361         def sanitize_numeric_fields(info):
1362             for numeric_field in self._NUMERIC_FIELDS:
1363                 field = info.get(numeric_field)
1364                 if field is None or isinstance(field, compat_numeric_types):
1365                     continue
1366                 report_force_conversion(numeric_field, 'numeric', 'int')
1367                 info[numeric_field] = int_or_none(field)
1368
1369         sanitize_string_field(info_dict, 'id')
1370         sanitize_numeric_fields(info_dict)
1371
1372         if 'playlist' not in info_dict:
1373             # It isn't part of a playlist
1374             info_dict['playlist'] = None
1375             info_dict['playlist_index'] = None
1376
1377         thumbnails = info_dict.get('thumbnails')
1378         if thumbnails is None:
1379             thumbnail = info_dict.get('thumbnail')
1380             if thumbnail:
1381                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1382         if thumbnails:
1383             thumbnails.sort(key=lambda t: (
1384                 t.get('preference') if t.get('preference') is not None else -1,
1385                 t.get('width') if t.get('width') is not None else -1,
1386                 t.get('height') if t.get('height') is not None else -1,
1387                 t.get('id') if t.get('id') is not None else '', t.get('url')))
1388             for i, t in enumerate(thumbnails):
1389                 t['url'] = sanitize_url(t['url'])
1390                 if t.get('width') and t.get('height'):
1391                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
1392                 if t.get('id') is None:
1393                     t['id'] = '%d' % i
1394
1395         if self.params.get('list_thumbnails'):
1396             self.list_thumbnails(info_dict)
1397             return
1398
1399         thumbnail = info_dict.get('thumbnail')
1400         if thumbnail:
1401             info_dict['thumbnail'] = sanitize_url(thumbnail)
1402         elif thumbnails:
1403             info_dict['thumbnail'] = thumbnails[-1]['url']
1404
1405         if 'display_id' not in info_dict and 'id' in info_dict:
1406             info_dict['display_id'] = info_dict['id']
1407
1408         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1409             # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1410             # see http://bugs.python.org/issue1646728)
1411             try:
1412                 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1413                 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1414             except (ValueError, OverflowError, OSError):
1415                 pass
1416
1417         # Auto generate title fields corresponding to the *_number fields when missing
1418         # in order to always have clean titles. This is very common for TV series.
1419         for field in ('chapter', 'season', 'episode'):
1420             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1421                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1422
1423         subtitles = info_dict.get('subtitles')
1424         if subtitles:
1425             for _, subtitle in subtitles.items():
1426                 for subtitle_format in subtitle:
1427                     if subtitle_format.get('url'):
1428                         subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1429                     if subtitle_format.get('ext') is None:
1430                         subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1431
1432         if self.params.get('listsubtitles', False):
1433             if 'automatic_captions' in info_dict:
1434                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1435             self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1436             return
1437         info_dict['requested_subtitles'] = self.process_subtitles(
1438             info_dict['id'], subtitles,
1439             info_dict.get('automatic_captions'))
1440
1441         # We now pick which formats have to be downloaded
1442         if info_dict.get('formats') is None:
1443             # There's only one format available
1444             formats = [info_dict]
1445         else:
1446             formats = info_dict['formats']
1447
1448         if not formats:
1449             raise ExtractorError('No video formats found!')
1450
1451         def is_wellformed(f):
1452             url = f.get('url')
1453             valid_url = url and isinstance(url, compat_str)
1454             if not valid_url:
1455                 self.report_warning(
1456                     '"url" field is missing or empty - skipping format, '
1457                     'there is an error in extractor')
1458             return valid_url
1459
1460         # Filter out malformed formats for better extraction robustness
1461         formats = list(filter(is_wellformed, formats))
1462
1463         formats_dict = {}
1464
1465         # We check that all the formats have the format and format_id fields
1466         for i, format in enumerate(formats):
1467             sanitize_string_field(format, 'format_id')
1468             sanitize_numeric_fields(format)
1469             format['url'] = sanitize_url(format['url'])
1470             if format.get('format_id') is None:
1471                 format['format_id'] = compat_str(i)
1472             else:
1473                 # Sanitize format_id from characters used in format selector expression
1474                 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
1475             format_id = format['format_id']
1476             if format_id not in formats_dict:
1477                 formats_dict[format_id] = []
1478             formats_dict[format_id].append(format)
1479
1480         # Make sure all formats have unique format_id
1481         for format_id, ambiguous_formats in formats_dict.items():
1482             if len(ambiguous_formats) > 1:
1483                 for i, format in enumerate(ambiguous_formats):
1484                     format['format_id'] = '%s-%d' % (format_id, i)
1485
1486         for i, format in enumerate(formats):
1487             if format.get('format') is None:
1488                 format['format'] = '{id} - {res}{note}'.format(
1489                     id=format['format_id'],
1490                     res=self.format_resolution(format),
1491                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1492                 )
1493             # Automatically determine file extension if missing
1494             if format.get('ext') is None:
1495                 format['ext'] = determine_ext(format['url']).lower()
1496             # Automatically determine protocol if missing (useful for format
1497             # selection purposes)
1498             if format.get('protocol') is None:
1499                 format['protocol'] = determine_protocol(format)
1500             # Add HTTP headers, so that external programs can use them from the
1501             # json output
1502             full_format_info = info_dict.copy()
1503             full_format_info.update(format)
1504             format['http_headers'] = self._calc_headers(full_format_info)
1505         # Remove private housekeeping stuff
1506         if '__x_forwarded_for_ip' in info_dict:
1507             del info_dict['__x_forwarded_for_ip']
1508
1509         # TODO Central sorting goes here
1510
1511         if formats[0] is not info_dict:
1512             # only set the 'formats' fields if the original info_dict list them
1513             # otherwise we end up with a circular reference, the first (and unique)
1514             # element in the 'formats' field in info_dict is info_dict itself,
1515             # which can't be exported to json
1516             info_dict['formats'] = formats
1517         if self.params.get('listformats'):
1518             self.list_formats(info_dict)
1519             return
1520
1521         req_format = self.params.get('format')
1522         if req_format is None:
1523             req_format_list = []
1524             if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
1525                     not info_dict.get('is_live')):
1526                 merger = FFmpegMergerPP(self)
1527                 if merger.available and merger.can_merge():
1528                     req_format_list.append('bestvideo+bestaudio')
1529             req_format_list.append('best')
1530             req_format = '/'.join(req_format_list)
1531         format_selector = self.build_format_selector(req_format)
1532
1533         # While in format selection we may need to have an access to the original
1534         # format set in order to calculate some metrics or do some processing.
1535         # For now we need to be able to guess whether original formats provided
1536         # by extractor are incomplete or not (i.e. whether extractor provides only
1537         # video-only or audio-only formats) for proper formats selection for
1538         # extractors with such incomplete formats (see
1539         # https://github.com/rg3/youtube-dl/pull/5556).
1540         # Since formats may be filtered during format selection and may not match
1541         # the original formats the results may be incorrect. Thus original formats
1542         # or pre-calculated metrics should be passed to format selection routines
1543         # as well.
1544         # We will pass a context object containing all necessary additional data
1545         # instead of just formats.
1546         # This fixes incorrect format selection issue (see
1547         # https://github.com/rg3/youtube-dl/issues/10083).
1548         incomplete_formats = (
1549             # All formats are video-only or
1550             all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
1551             # all formats are audio-only
1552             all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
1553
1554         ctx = {
1555             'formats': formats,
1556             'incomplete_formats': incomplete_formats,
1557         }
1558
1559         formats_to_download = list(format_selector(ctx))
1560         if not formats_to_download:
1561             raise ExtractorError('requested format not available',
1562                                  expected=True)
1563
1564         if download:
1565             if len(formats_to_download) > 1:
1566                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1567             for format in formats_to_download:
1568                 new_info = dict(info_dict)
1569                 new_info.update(format)
1570                 self.process_info(new_info)
1571         # We update the info dict with the best quality format (backwards compatibility)
1572         info_dict.update(formats_to_download[-1])
1573         return info_dict
1574
1575     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1576         """Select the requested subtitles and their format"""
1577         available_subs = {}
1578         if normal_subtitles and self.params.get('writesubtitles'):
1579             available_subs.update(normal_subtitles)
1580         if automatic_captions and self.params.get('writeautomaticsub'):
1581             for lang, cap_info in automatic_captions.items():
1582                 if lang not in available_subs:
1583                     available_subs[lang] = cap_info
1584
1585         if (not self.params.get('writesubtitles') and not
1586                 self.params.get('writeautomaticsub') or not
1587                 available_subs):
1588             return None
1589
1590         if self.params.get('allsubtitles', False):
1591             requested_langs = available_subs.keys()
1592         else:
1593             if self.params.get('subtitleslangs', False):
1594                 requested_langs = self.params.get('subtitleslangs')
1595             elif 'en' in available_subs:
1596                 requested_langs = ['en']
1597             else:
1598                 requested_langs = [list(available_subs.keys())[0]]
1599
1600         formats_query = self.params.get('subtitlesformat', 'best')
1601         formats_preference = formats_query.split('/') if formats_query else []
1602         subs = {}
1603         for lang in requested_langs:
1604             formats = available_subs.get(lang)
1605             if formats is None:
1606                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1607                 continue
1608             for ext in formats_preference:
1609                 if ext == 'best':
1610                     f = formats[-1]
1611                     break
1612                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1613                 if matches:
1614                     f = matches[-1]
1615                     break
1616             else:
1617                 f = formats[-1]
1618                 self.report_warning(
1619                     'No subtitle format found matching "%s" for language %s, '
1620                     'using %s' % (formats_query, lang, f['ext']))
1621             subs[lang] = f
1622         return subs
1623
1624     def process_info(self, info_dict):
1625         """Process a single resolved IE result."""
1626
1627         assert info_dict.get('_type', 'video') == 'video'
1628
1629         max_downloads = self.params.get('max_downloads')
1630         if max_downloads is not None:
1631             if self._num_downloads >= int(max_downloads):
1632                 raise MaxDownloadsReached()
1633
1634         info_dict['fulltitle'] = info_dict['title']
1635         if len(info_dict['title']) > 200:
1636             info_dict['title'] = info_dict['title'][:197] + '...'
1637
1638         if 'format' not in info_dict:
1639             info_dict['format'] = info_dict['ext']
1640
1641         reason = self._match_entry(info_dict, incomplete=False)
1642         if reason is not None:
1643             self.to_screen('[download] ' + reason)
1644             return
1645
1646         self._num_downloads += 1
1647
1648         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1649
1650         # Forced printings
1651         if self.params.get('forcetitle', False):
1652             self.to_stdout(info_dict['fulltitle'])
1653         if self.params.get('forceid', False):
1654             self.to_stdout(info_dict['id'])
1655         if self.params.get('forceurl', False):
1656             if info_dict.get('requested_formats') is not None:
1657                 for f in info_dict['requested_formats']:
1658                     self.to_stdout(f['url'] + f.get('play_path', ''))
1659             else:
1660                 # For RTMP URLs, also include the playpath
1661                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1662         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1663             self.to_stdout(info_dict['thumbnail'])
1664         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1665             self.to_stdout(info_dict['description'])
1666         if self.params.get('forcefilename', False) and filename is not None:
1667             self.to_stdout(filename)
1668         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1669             self.to_stdout(formatSeconds(info_dict['duration']))
1670         if self.params.get('forceformat', False):
1671             self.to_stdout(info_dict['format'])
1672         if self.params.get('forcejson', False):
1673             self.to_stdout(json.dumps(info_dict))
1674
1675         # Do nothing else if in simulate mode
1676         if self.params.get('simulate', False):
1677             return
1678
1679         if filename is None:
1680             return
1681
1682         try:
1683             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1684             if dn and not os.path.exists(dn):
1685                 os.makedirs(dn)
1686         except (OSError, IOError) as err:
1687             self.report_error('unable to create directory ' + error_to_compat_str(err))
1688             return
1689
1690         if self.params.get('writedescription', False):
1691             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1692             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1693                 self.to_screen('[info] Video description is already present')
1694             elif info_dict.get('description') is None:
1695                 self.report_warning('There\'s no description to write.')
1696             else:
1697                 try:
1698                     self.to_screen('[info] Writing video description to: ' + descfn)
1699                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1700                         descfile.write(info_dict['description'])
1701                 except (OSError, IOError):
1702                     self.report_error('Cannot write description file ' + descfn)
1703                     return
1704
1705         if self.params.get('writeannotations', False):
1706             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1707             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1708                 self.to_screen('[info] Video annotations are already present')
1709             else:
1710                 try:
1711                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1712                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1713                         annofile.write(info_dict['annotations'])
1714                 except (KeyError, TypeError):
1715                     self.report_warning('There are no annotations to write.')
1716                 except (OSError, IOError):
1717                     self.report_error('Cannot write annotations file: ' + annofn)
1718                     return
1719
1720         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1721                                        self.params.get('writeautomaticsub')])
1722
1723         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1724             # subtitles download errors are already managed as troubles in relevant IE
1725             # that way it will silently go on when used with unsupporting IE
1726             subtitles = info_dict['requested_subtitles']
1727             ie = self.get_info_extractor(info_dict['extractor_key'])
1728             for sub_lang, sub_info in subtitles.items():
1729                 sub_format = sub_info['ext']
1730                 if sub_info.get('data') is not None:
1731                     sub_data = sub_info['data']
1732                 else:
1733                     try:
1734                         sub_data = ie._download_webpage(
1735                             sub_info['url'], info_dict['id'], note=False)
1736                     except ExtractorError as err:
1737                         self.report_warning('Unable to download subtitle for "%s": %s' %
1738                                             (sub_lang, error_to_compat_str(err.cause)))
1739                         continue
1740                 try:
1741                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1742                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1743                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1744                     else:
1745                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1746                         # Use newline='' to prevent conversion of newline characters
1747                         # See https://github.com/rg3/youtube-dl/issues/10268
1748                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1749                             subfile.write(sub_data)
1750                 except (OSError, IOError):
1751                     self.report_error('Cannot write subtitles file ' + sub_filename)
1752                     return
1753
1754         if self.params.get('writeinfojson', False):
1755             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1756             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1757                 self.to_screen('[info] Video description metadata is already present')
1758             else:
1759                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1760                 try:
1761                     write_json_file(self.filter_requested_info(info_dict), infofn)
1762                 except (OSError, IOError):
1763                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1764                     return
1765
1766         self._write_thumbnails(info_dict, filename)
1767
1768         if not self.params.get('skip_download', False):
1769             try:
1770                 def dl(name, info):
1771                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1772                     for ph in self._progress_hooks:
1773                         fd.add_progress_hook(ph)
1774                     if self.params.get('verbose'):
1775                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1776                     return fd.download(name, info)
1777
1778                 if info_dict.get('requested_formats') is not None:
1779                     downloaded = []
1780                     success = True
1781                     merger = FFmpegMergerPP(self)
1782                     if not merger.available:
1783                         postprocessors = []
1784                         self.report_warning('You have requested multiple '
1785                                             'formats but ffmpeg or avconv are not installed.'
1786                                             ' The formats won\'t be merged.')
1787                     else:
1788                         postprocessors = [merger]
1789
1790                     def compatible_formats(formats):
1791                         video, audio = formats
1792                         # Check extension
1793                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1794                         if video_ext and audio_ext:
1795                             COMPATIBLE_EXTS = (
1796                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1797                                 ('webm')
1798                             )
1799                             for exts in COMPATIBLE_EXTS:
1800                                 if video_ext in exts and audio_ext in exts:
1801                                     return True
1802                         # TODO: Check acodec/vcodec
1803                         return False
1804
1805                     filename_real_ext = os.path.splitext(filename)[1][1:]
1806                     filename_wo_ext = (
1807                         os.path.splitext(filename)[0]
1808                         if filename_real_ext == info_dict['ext']
1809                         else filename)
1810                     requested_formats = info_dict['requested_formats']
1811                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1812                         info_dict['ext'] = 'mkv'
1813                         self.report_warning(
1814                             'Requested formats are incompatible for merge and will be merged into mkv.')
1815                     # Ensure filename always has a correct extension for successful merge
1816                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1817                     if os.path.exists(encodeFilename(filename)):
1818                         self.to_screen(
1819                             '[download] %s has already been downloaded and '
1820                             'merged' % filename)
1821                     else:
1822                         for f in requested_formats:
1823                             new_info = dict(info_dict)
1824                             new_info.update(f)
1825                             fname = self.prepare_filename(new_info)
1826                             fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1827                             downloaded.append(fname)
1828                             partial_success = dl(fname, new_info)
1829                             success = success and partial_success
1830                         info_dict['__postprocessors'] = postprocessors
1831                         info_dict['__files_to_merge'] = downloaded
1832                 else:
1833                     # Just a single file
1834                     success = dl(filename, info_dict)
1835             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1836                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1837                 return
1838             except (OSError, IOError) as err:
1839                 raise UnavailableVideoError(err)
1840             except (ContentTooShortError, ) as err:
1841                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1842                 return
1843
1844             if success and filename != '-':
1845                 # Fixup content
1846                 fixup_policy = self.params.get('fixup')
1847                 if fixup_policy is None:
1848                     fixup_policy = 'detect_or_warn'
1849
1850                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1851
1852                 stretched_ratio = info_dict.get('stretched_ratio')
1853                 if stretched_ratio is not None and stretched_ratio != 1:
1854                     if fixup_policy == 'warn':
1855                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1856                             info_dict['id'], stretched_ratio))
1857                     elif fixup_policy == 'detect_or_warn':
1858                         stretched_pp = FFmpegFixupStretchedPP(self)
1859                         if stretched_pp.available:
1860                             info_dict.setdefault('__postprocessors', [])
1861                             info_dict['__postprocessors'].append(stretched_pp)
1862                         else:
1863                             self.report_warning(
1864                                 '%s: Non-uniform pixel ratio (%s). %s'
1865                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1866                     else:
1867                         assert fixup_policy in ('ignore', 'never')
1868
1869                 if (info_dict.get('requested_formats') is None and
1870                         info_dict.get('container') == 'm4a_dash'):
1871                     if fixup_policy == 'warn':
1872                         self.report_warning(
1873                             '%s: writing DASH m4a. '
1874                             'Only some players support this container.'
1875                             % info_dict['id'])
1876                     elif fixup_policy == 'detect_or_warn':
1877                         fixup_pp = FFmpegFixupM4aPP(self)
1878                         if fixup_pp.available:
1879                             info_dict.setdefault('__postprocessors', [])
1880                             info_dict['__postprocessors'].append(fixup_pp)
1881                         else:
1882                             self.report_warning(
1883                                 '%s: writing DASH m4a. '
1884                                 'Only some players support this container. %s'
1885                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1886                     else:
1887                         assert fixup_policy in ('ignore', 'never')
1888
1889                 if (info_dict.get('protocol') == 'm3u8_native' or
1890                         info_dict.get('protocol') == 'm3u8' and
1891                         self.params.get('hls_prefer_native')):
1892                     if fixup_policy == 'warn':
1893                         self.report_warning('%s: malformated aac bitstream.' % (
1894                             info_dict['id']))
1895                     elif fixup_policy == 'detect_or_warn':
1896                         fixup_pp = FFmpegFixupM3u8PP(self)
1897                         if fixup_pp.available:
1898                             info_dict.setdefault('__postprocessors', [])
1899                             info_dict['__postprocessors'].append(fixup_pp)
1900                         else:
1901                             self.report_warning(
1902                                 '%s: malformated aac bitstream. %s'
1903                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1904                     else:
1905                         assert fixup_policy in ('ignore', 'never')
1906
1907                 try:
1908                     self.post_process(filename, info_dict)
1909                 except (PostProcessingError) as err:
1910                     self.report_error('postprocessing: %s' % str(err))
1911                     return
1912                 self.record_download_archive(info_dict)
1913
1914     def download(self, url_list):
1915         """Download a given list of URLs."""
1916         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1917         if (len(url_list) > 1 and
1918                 outtmpl != '-' and
1919                 '%' not in outtmpl and
1920                 self.params.get('max_downloads') != 1):
1921             raise SameFileError(outtmpl)
1922
1923         for url in url_list:
1924             try:
1925                 # It also downloads the videos
1926                 res = self.extract_info(
1927                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1928             except UnavailableVideoError:
1929                 self.report_error('unable to download video')
1930             except MaxDownloadsReached:
1931                 self.to_screen('[info] Maximum number of downloaded files reached.')
1932                 raise
1933             else:
1934                 if self.params.get('dump_single_json', False):
1935                     self.to_stdout(json.dumps(res))
1936
1937         return self._download_retcode
1938
1939     def download_with_info_file(self, info_filename):
1940         with contextlib.closing(fileinput.FileInput(
1941                 [info_filename], mode='r',
1942                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1943             # FileInput doesn't have a read method, we can't call json.load
1944             info = self.filter_requested_info(json.loads('\n'.join(f)))
1945         try:
1946             self.process_ie_result(info, download=True)
1947         except DownloadError:
1948             webpage_url = info.get('webpage_url')
1949             if webpage_url is not None:
1950                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1951                 return self.download([webpage_url])
1952             else:
1953                 raise
1954         return self._download_retcode
1955
1956     @staticmethod
1957     def filter_requested_info(info_dict):
1958         return dict(
1959             (k, v) for k, v in info_dict.items()
1960             if k not in ['requested_formats', 'requested_subtitles'])
1961
1962     def post_process(self, filename, ie_info):
1963         """Run all the postprocessors on the given file."""
1964         info = dict(ie_info)
1965         info['filepath'] = filename
1966         pps_chain = []
1967         if ie_info.get('__postprocessors') is not None:
1968             pps_chain.extend(ie_info['__postprocessors'])
1969         pps_chain.extend(self._pps)
1970         for pp in pps_chain:
1971             files_to_delete = []
1972             try:
1973                 files_to_delete, info = pp.run(info)
1974             except PostProcessingError as e:
1975                 self.report_error(e.msg)
1976             if files_to_delete and not self.params.get('keepvideo', False):
1977                 for old_filename in files_to_delete:
1978                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1979                     try:
1980                         os.remove(encodeFilename(old_filename))
1981                     except (IOError, OSError):
1982                         self.report_warning('Unable to remove downloaded original file')
1983
1984     def _make_archive_id(self, info_dict):
1985         # Future-proof against any change in case
1986         # and backwards compatibility with prior versions
1987         extractor = info_dict.get('extractor_key')
1988         if extractor is None:
1989             if 'id' in info_dict:
1990                 extractor = info_dict.get('ie_key')  # key in a playlist
1991         if extractor is None:
1992             return None  # Incomplete video information
1993         return extractor.lower() + ' ' + info_dict['id']
1994
1995     def in_download_archive(self, info_dict):
1996         fn = self.params.get('download_archive')
1997         if fn is None:
1998             return False
1999
2000         vid_id = self._make_archive_id(info_dict)
2001         if vid_id is None:
2002             return False  # Incomplete video information
2003
2004         try:
2005             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
2006                 for line in archive_file:
2007                     if line.strip() == vid_id:
2008                         return True
2009         except IOError as ioe:
2010             if ioe.errno != errno.ENOENT:
2011                 raise
2012         return False
2013
2014     def record_download_archive(self, info_dict):
2015         fn = self.params.get('download_archive')
2016         if fn is None:
2017             return
2018         vid_id = self._make_archive_id(info_dict)
2019         assert vid_id
2020         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2021             archive_file.write(vid_id + '\n')
2022
2023     @staticmethod
2024     def format_resolution(format, default='unknown'):
2025         if format.get('vcodec') == 'none':
2026             return 'audio only'
2027         if format.get('resolution') is not None:
2028             return format['resolution']
2029         if format.get('height') is not None:
2030             if format.get('width') is not None:
2031                 res = '%sx%s' % (format['width'], format['height'])
2032             else:
2033                 res = '%sp' % format['height']
2034         elif format.get('width') is not None:
2035             res = '%dx?' % format['width']
2036         else:
2037             res = default
2038         return res
2039
2040     def _format_note(self, fdict):
2041         res = ''
2042         if fdict.get('ext') in ['f4f', 'f4m']:
2043             res += '(unsupported) '
2044         if fdict.get('language'):
2045             if res:
2046                 res += ' '
2047             res += '[%s] ' % fdict['language']
2048         if fdict.get('format_note') is not None:
2049             res += fdict['format_note'] + ' '
2050         if fdict.get('tbr') is not None:
2051             res += '%4dk ' % fdict['tbr']
2052         if fdict.get('container') is not None:
2053             if res:
2054                 res += ', '
2055             res += '%s container' % fdict['container']
2056         if (fdict.get('vcodec') is not None and
2057                 fdict.get('vcodec') != 'none'):
2058             if res:
2059                 res += ', '
2060             res += fdict['vcodec']
2061             if fdict.get('vbr') is not None:
2062                 res += '@'
2063         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
2064             res += 'video@'
2065         if fdict.get('vbr') is not None:
2066             res += '%4dk' % fdict['vbr']
2067         if fdict.get('fps') is not None:
2068             if res:
2069                 res += ', '
2070             res += '%sfps' % fdict['fps']
2071         if fdict.get('acodec') is not None:
2072             if res:
2073                 res += ', '
2074             if fdict['acodec'] == 'none':
2075                 res += 'video only'
2076             else:
2077                 res += '%-5s' % fdict['acodec']
2078         elif fdict.get('abr') is not None:
2079             if res:
2080                 res += ', '
2081             res += 'audio'
2082         if fdict.get('abr') is not None:
2083             res += '@%3dk' % fdict['abr']
2084         if fdict.get('asr') is not None:
2085             res += ' (%5dHz)' % fdict['asr']
2086         if fdict.get('filesize') is not None:
2087             if res:
2088                 res += ', '
2089             res += format_bytes(fdict['filesize'])
2090         elif fdict.get('filesize_approx') is not None:
2091             if res:
2092                 res += ', '
2093             res += '~' + format_bytes(fdict['filesize_approx'])
2094         return res
2095
2096     def list_formats(self, info_dict):
2097         formats = info_dict.get('formats', [info_dict])
2098         table = [
2099             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2100             for f in formats
2101             if f.get('preference') is None or f['preference'] >= -1000]
2102         if len(formats) > 1:
2103             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2104
2105         header_line = ['format code', 'extension', 'resolution', 'note']
2106         self.to_screen(
2107             '[info] Available formats for %s:\n%s' %
2108             (info_dict['id'], render_table(header_line, table)))
2109
2110     def list_thumbnails(self, info_dict):
2111         thumbnails = info_dict.get('thumbnails')
2112         if not thumbnails:
2113             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2114             return
2115
2116         self.to_screen(
2117             '[info] Thumbnails for %s:' % info_dict['id'])
2118         self.to_screen(render_table(
2119             ['ID', 'width', 'height', 'URL'],
2120             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2121
2122     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2123         if not subtitles:
2124             self.to_screen('%s has no %s' % (video_id, name))
2125             return
2126         self.to_screen(
2127             'Available %s for %s:' % (name, video_id))
2128         self.to_screen(render_table(
2129             ['Language', 'formats'],
2130             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2131                 for lang, formats in subtitles.items()]))
2132
2133     def urlopen(self, req):
2134         """ Start an HTTP download """
2135         if isinstance(req, compat_basestring):
2136             req = sanitized_Request(req)
2137         return self._opener.open(req, timeout=self._socket_timeout)
2138
2139     def print_debug_header(self):
2140         if not self.params.get('verbose'):
2141             return
2142
2143         if type('') is not compat_str:
2144             # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
2145             self.report_warning(
2146                 'Your Python is broken! Update to a newer and supported version')
2147
2148         stdout_encoding = getattr(
2149             sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
2150         encoding_str = (
2151             '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
2152                 locale.getpreferredencoding(),
2153                 sys.getfilesystemencoding(),
2154                 stdout_encoding,
2155                 self.get_encoding()))
2156         write_string(encoding_str, encoding=None)
2157
2158         self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
2159         if _LAZY_LOADER:
2160             self._write_string('[debug] Lazy loading extractors enabled' + '\n')
2161         try:
2162             sp = subprocess.Popen(
2163                 ['git', 'rev-parse', '--short', 'HEAD'],
2164                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
2165                 cwd=os.path.dirname(os.path.abspath(__file__)))
2166             out, err = sp.communicate()
2167             out = out.decode().strip()
2168             if re.match('[0-9a-f]+', out):
2169                 self._write_string('[debug] Git HEAD: ' + out + '\n')
2170         except Exception:
2171             try:
2172                 sys.exc_clear()
2173             except Exception:
2174                 pass
2175         self._write_string('[debug] Python version %s - %s\n' % (
2176             platform.python_version(), platform_name()))
2177
2178         exe_versions = FFmpegPostProcessor.get_versions(self)
2179         exe_versions['rtmpdump'] = rtmpdump_version()
2180         exe_str = ', '.join(
2181             '%s %s' % (exe, v)
2182             for exe, v in sorted(exe_versions.items())
2183             if v
2184         )
2185         if not exe_str:
2186             exe_str = 'none'
2187         self._write_string('[debug] exe versions: %s\n' % exe_str)
2188
2189         proxy_map = {}
2190         for handler in self._opener.handlers:
2191             if hasattr(handler, 'proxies'):
2192                 proxy_map.update(handler.proxies)
2193         self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
2194
2195         if self.params.get('call_home', False):
2196             ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
2197             self._write_string('[debug] Public IP address: %s\n' % ipaddr)
2198             latest_version = self.urlopen(
2199                 'https://yt-dl.org/latest/version').read().decode('utf-8')
2200             if version_tuple(latest_version) > version_tuple(__version__):
2201                 self.report_warning(
2202                     'You are using an outdated version (newest version: %s)! '
2203                     'See https://yt-dl.org/update if you need help updating.' %
2204                     latest_version)
2205
2206     def _setup_opener(self):
2207         timeout_val = self.params.get('socket_timeout')
2208         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2209
2210         opts_cookiefile = self.params.get('cookiefile')
2211         opts_proxy = self.params.get('proxy')
2212
2213         if opts_cookiefile is None:
2214             self.cookiejar = compat_cookiejar.CookieJar()
2215         else:
2216             opts_cookiefile = expand_path(opts_cookiefile)
2217             self.cookiejar = compat_cookiejar.MozillaCookieJar(
2218                 opts_cookiefile)
2219             if os.access(opts_cookiefile, os.R_OK):
2220                 self.cookiejar.load()
2221
2222         cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2223         if opts_proxy is not None:
2224             if opts_proxy == '':
2225                 proxies = {}
2226             else:
2227                 proxies = {'http': opts_proxy, 'https': opts_proxy}
2228         else:
2229             proxies = compat_urllib_request.getproxies()
2230             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
2231             if 'http' in proxies and 'https' not in proxies:
2232                 proxies['https'] = proxies['http']
2233         proxy_handler = PerRequestProxyHandler(proxies)
2234
2235         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2236         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2237         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2238         data_handler = compat_urllib_request_DataHandler()
2239
2240         # When passing our own FileHandler instance, build_opener won't add the
2241         # default FileHandler and allows us to disable the file protocol, which
2242         # can be used for malicious purposes (see
2243         # https://github.com/rg3/youtube-dl/issues/8227)
2244         file_handler = compat_urllib_request.FileHandler()
2245
2246         def file_open(*args, **kwargs):
2247             raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
2248         file_handler.file_open = file_open
2249
2250         opener = compat_urllib_request.build_opener(
2251             proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
2252
2253         # Delete the default user-agent header, which would otherwise apply in
2254         # cases where our custom HTTP handler doesn't come into play
2255         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
2256         opener.addheaders = []
2257         self._opener = opener
2258
2259     def encode(self, s):
2260         if isinstance(s, bytes):
2261             return s  # Already encoded
2262
2263         try:
2264             return s.encode(self.get_encoding())
2265         except UnicodeEncodeError as err:
2266             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2267             raise
2268
2269     def get_encoding(self):
2270         encoding = self.params.get('encoding')
2271         if encoding is None:
2272             encoding = preferredencoding()
2273         return encoding
2274
2275     def _write_thumbnails(self, info_dict, filename):
2276         if self.params.get('writethumbnail', False):
2277             thumbnails = info_dict.get('thumbnails')
2278             if thumbnails:
2279                 thumbnails = [thumbnails[-1]]
2280         elif self.params.get('write_all_thumbnails', False):
2281             thumbnails = info_dict.get('thumbnails')
2282         else:
2283             return
2284
2285         if not thumbnails:
2286             # No thumbnails present, so return immediately
2287             return
2288
2289         for t in thumbnails:
2290             thumb_ext = determine_ext(t['url'], 'jpg')
2291             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2292             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2293             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2294
2295             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2296                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2297                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2298             else:
2299                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2300                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2301                 try:
2302                     uf = self.urlopen(t['url'])
2303                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2304                         shutil.copyfileobj(uf, thumbf)
2305                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2306                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2307                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2308                     self.report_warning('Unable to download thumbnail "%s": %s' %
2309                                         (t['url'], error_to_compat_str(err)))