Improve geo bypass mechanism
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from string import ascii_letters
30
31 from .compat import (
32     compat_basestring,
33     compat_cookiejar,
34     compat_get_terminal_size,
35     compat_http_client,
36     compat_kwargs,
37     compat_numeric_types,
38     compat_os_name,
39     compat_str,
40     compat_tokenize_tokenize,
41     compat_urllib_error,
42     compat_urllib_request,
43     compat_urllib_request_DataHandler,
44 )
45 from .utils import (
46     age_restricted,
47     args_to_str,
48     ContentTooShortError,
49     date_from_str,
50     DateRange,
51     DEFAULT_OUTTMPL,
52     determine_ext,
53     determine_protocol,
54     DownloadError,
55     encode_compat_str,
56     encodeFilename,
57     error_to_compat_str,
58     expand_path,
59     ExtractorError,
60     format_bytes,
61     formatSeconds,
62     GeoRestrictedError,
63     int_or_none,
64     ISO3166Utils,
65     locked_file,
66     make_HTTPS_handler,
67     MaxDownloadsReached,
68     orderedSet,
69     PagedList,
70     parse_filesize,
71     PerRequestProxyHandler,
72     platform_name,
73     PostProcessingError,
74     preferredencoding,
75     prepend_extension,
76     register_socks_protocols,
77     render_table,
78     replace_extension,
79     SameFileError,
80     sanitize_filename,
81     sanitize_path,
82     sanitize_url,
83     sanitized_Request,
84     std_headers,
85     subtitles_filename,
86     UnavailableVideoError,
87     url_basename,
88     version_tuple,
89     write_json_file,
90     write_string,
91     YoutubeDLCookieProcessor,
92     YoutubeDLHandler,
93 )
94 from .cache import Cache
95 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
96 from .extractor.openload import PhantomJSwrapper
97 from .downloader import get_suitable_downloader
98 from .downloader.rtmp import rtmpdump_version
99 from .postprocessor import (
100     FFmpegFixupM3u8PP,
101     FFmpegFixupM4aPP,
102     FFmpegFixupStretchedPP,
103     FFmpegMergerPP,
104     FFmpegPostProcessor,
105     get_postprocessor,
106 )
107 from .version import __version__
108
109 if compat_os_name == 'nt':
110     import ctypes
111
112
113 class YoutubeDL(object):
114     """YoutubeDL class.
115
116     YoutubeDL objects are the ones responsible of downloading the
117     actual video file and writing it to disk if the user has requested
118     it, among some other tasks. In most cases there should be one per
119     program. As, given a video URL, the downloader doesn't know how to
120     extract all the needed information, task that InfoExtractors do, it
121     has to pass the URL to one of them.
122
123     For this, YoutubeDL objects have a method that allows
124     InfoExtractors to be registered in a given order. When it is passed
125     a URL, the YoutubeDL object handles it to the first InfoExtractor it
126     finds that reports being able to handle it. The InfoExtractor extracts
127     all the information about the video or videos the URL refers to, and
128     YoutubeDL process the extracted information, possibly using a File
129     Downloader to download the video.
130
131     YoutubeDL objects accept a lot of parameters. In order not to saturate
132     the object constructor with arguments, it receives a dictionary of
133     options instead. These options are available through the params
134     attribute for the InfoExtractors to use. The YoutubeDL also
135     registers itself as the downloader in charge for the InfoExtractors
136     that are added to it, so this is a "mutual registration".
137
138     Available options:
139
140     username:          Username for authentication purposes.
141     password:          Password for authentication purposes.
142     videopassword:     Password for accessing a video.
143     ap_mso:            Adobe Pass multiple-system operator identifier.
144     ap_username:       Multiple-system operator account username.
145     ap_password:       Multiple-system operator account password.
146     usenetrc:          Use netrc for authentication instead.
147     verbose:           Print additional info to stdout.
148     quiet:             Do not print messages to stdout.
149     no_warnings:       Do not print out anything for warnings.
150     forceurl:          Force printing final URL.
151     forcetitle:        Force printing title.
152     forceid:           Force printing ID.
153     forcethumbnail:    Force printing thumbnail URL.
154     forcedescription:  Force printing description.
155     forcefilename:     Force printing final filename.
156     forceduration:     Force printing duration.
157     forcejson:         Force printing info_dict as JSON.
158     dump_single_json:  Force printing the info_dict of the whole playlist
159                        (or video) as a single JSON line.
160     simulate:          Do not download the video files.
161     format:            Video format code. See options.py for more information.
162     outtmpl:           Template for output names.
163     restrictfilenames: Do not allow "&" and spaces in file names
164     ignoreerrors:      Do not stop on download errors.
165     force_generic_extractor: Force downloader to use the generic extractor
166     nooverwrites:      Prevent overwriting files.
167     playliststart:     Playlist item to start at.
168     playlistend:       Playlist item to end at.
169     playlist_items:    Specific indices of playlist to download.
170     playlistreverse:   Download playlist items in reverse order.
171     playlistrandom:    Download playlist items in random order.
172     matchtitle:        Download only matching titles.
173     rejecttitle:       Reject downloads for matching titles.
174     logger:            Log messages to a logging.Logger instance.
175     logtostderr:       Log messages to stderr instead of stdout.
176     writedescription:  Write the video description to a .description file
177     writeinfojson:     Write the video description to a .info.json file
178     writeannotations:  Write the video annotations to a .annotations.xml file
179     writethumbnail:    Write the thumbnail image to a file
180     write_all_thumbnails:  Write all thumbnail formats to files
181     writesubtitles:    Write the video subtitles to a file
182     writeautomaticsub: Write the automatically generated subtitles to a file
183     allsubtitles:      Downloads all the subtitles of the video
184                        (requires writesubtitles or writeautomaticsub)
185     listsubtitles:     Lists all available subtitles for the video
186     subtitlesformat:   The format code for subtitles
187     subtitleslangs:    List of languages of the subtitles to download
188     keepvideo:         Keep the video file after post-processing
189     daterange:         A DateRange object, download only if the upload_date is in the range.
190     skip_download:     Skip the actual download of the video file
191     cachedir:          Location of the cache files in the filesystem.
192                        False to disable filesystem cache.
193     noplaylist:        Download single video instead of a playlist if in doubt.
194     age_limit:         An integer representing the user's age in years.
195                        Unsuitable videos for the given age are skipped.
196     min_views:         An integer representing the minimum view count the video
197                        must have in order to not be skipped.
198                        Videos without view count information are always
199                        downloaded. None for no limit.
200     max_views:         An integer representing the maximum view count.
201                        Videos that are more popular than that are not
202                        downloaded.
203                        Videos without view count information are always
204                        downloaded. None for no limit.
205     download_archive:  File name of a file where all downloads are recorded.
206                        Videos already present in the file are not downloaded
207                        again.
208     cookiefile:        File name where cookies should be read from and dumped to.
209     nocheckcertificate:Do not verify SSL certificates
210     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
211                        At the moment, this is only supported by YouTube.
212     proxy:             URL of the proxy server to use
213     geo_verification_proxy:  URL of the proxy to use for IP address verification
214                        on geo-restricted sites. (Experimental)
215     socket_timeout:    Time to wait for unresponsive hosts, in seconds
216     bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
218     debug_printtraffic:Print out sent and received HTTP traffic
219     include_ads:       Download ads as well
220     default_search:    Prepend this string if an input url is not valid.
221                        'auto' for elaborate guessing
222     encoding:          Use this encoding instead of the system-specified.
223     extract_flat:      Do not resolve URLs, return the immediate result.
224                        Pass in 'in_playlist' to only show this behavior for
225                        playlist items.
226     postprocessors:    A list of dictionaries, each with an entry
227                        * key:  The name of the postprocessor. See
228                                youtube_dl/postprocessor/__init__.py for a list.
229                        as well as any further keyword arguments for the
230                        postprocessor.
231     progress_hooks:    A list of functions that get called on download
232                        progress, with a dictionary with the entries
233                        * status: One of "downloading", "error", or "finished".
234                                  Check this first and ignore unknown values.
235
236                        If status is one of "downloading", or "finished", the
237                        following properties may also be present:
238                        * filename: The final filename (always present)
239                        * tmpfilename: The filename we're currently writing to
240                        * downloaded_bytes: Bytes on disk
241                        * total_bytes: Size of the whole file, None if unknown
242                        * total_bytes_estimate: Guess of the eventual file size,
243                                                None if unavailable.
244                        * elapsed: The number of seconds since download started.
245                        * eta: The estimated time in seconds, None if unknown
246                        * speed: The download speed in bytes/second, None if
247                                 unknown
248                        * fragment_index: The counter of the currently
249                                          downloaded video fragment.
250                        * fragment_count: The number of fragments (= individual
251                                          files that will be merged)
252
253                        Progress hooks are guaranteed to be called at least once
254                        (with status "finished") if the download is successful.
255     merge_output_format: Extension to use when merging formats.
256     fixup:             Automatically correct known faults of the file.
257                        One of:
258                        - "never": do nothing
259                        - "warn": only emit a warning
260                        - "detect_or_warn": check whether we can do anything
261                                            about it, warn otherwise (default)
262     source_address:    (Experimental) Client-side IP address to bind to.
263     call_home:         Boolean, true iff we are allowed to contact the
264                        youtube-dl servers for debugging.
265     sleep_interval:    Number of seconds to sleep before each download when
266                        used alone or a lower bound of a range for randomized
267                        sleep before each download (minimum possible number
268                        of seconds to sleep) when used along with
269                        max_sleep_interval.
270     max_sleep_interval:Upper bound of a range for randomized sleep before each
271                        download (maximum possible number of seconds to sleep).
272                        Must only be used along with sleep_interval.
273                        Actual sleep time will be a random float from range
274                        [sleep_interval; max_sleep_interval].
275     listformats:       Print an overview of available video formats and exit.
276     list_thumbnails:   Print a table of all thumbnails and exit.
277     match_filter:      A function that gets called with the info_dict of
278                        every video.
279                        If it returns a message, the video is ignored.
280                        If it returns None, the video is downloaded.
281                        match_filter_func in utils.py is one example for this.
282     no_color:          Do not emit color codes in output.
283     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
284                        HTTP header (experimental)
285     geo_bypass_country:
                       Two-letter ISO 3166-1 alpha-2 country code that will be used for
287                        explicit geographic restriction bypassing via faking
288                        X-Forwarded-For HTTP header (experimental)
289     geo_bypass_ip_block:
290                        IP range in CIDR notation that will be used similarly to
291                        geo_bypass_country (experimental)
292
293     The following options determine which downloader is picked:
294     external_downloader: Executable of the external downloader to call.
295                        None or unset for standard (built-in) downloader.
296     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
297                        if True, otherwise use ffmpeg/avconv if False, otherwise
298                        use downloader suggested by extractor if None.
299
300     The following parameters are not used by YoutubeDL itself, they are used by
301     the downloader (see youtube_dl/downloader/common.py):
302     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
303     noresizebuffer, retries, continuedl, noprogress, consoletitle,
304     xattr_set_filesize, external_downloader_args, hls_use_mpegts,
305     http_chunk_size.
306
307     The following options are used by the post processors:
308     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
309                        otherwise prefer avconv.
310     postprocessor_args: A list of additional command-line arguments for the
311                         postprocessor.
312
313     The following options are used by the Youtube extractor:
314     youtube_include_dash_manifest: If True (default), DASH manifests and related
315                         data will be downloaded and processed by extractor.
316                         You can reduce network I/O by disabling it if you don't
317                         care about DASH.
318     """
319
    # info_dict fields that hold numeric values.  prepare_filename() consults
    # this set to patch the output template for fields that are missing, so
    # that integer presentation types (e.g. %(view_count)d) do not break when
    # the placeholder string 'NA' would otherwise be substituted.
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))
330
    # Class-level defaults; every one of these is shadowed by an instance
    # attribute assigned in __init__.
    params = None  # options dictionary (see the class docstring)
    _ies = []  # registered InfoExtractors
    _pps = []  # registered PostProcessors
    _download_retcode = None  # return code accumulated across downloads
    _num_downloads = None  # count of files downloaded so far
    _screen_file = None  # stream used for normal (non-error) output
337
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params: dictionary of options (see the class docstring); None is
                treated as an empty dictionary.
        auto_init: when True, print the debug header and register the
                   default info extractors immediately.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Index trick: logtostderr False -> stdout, True -> stderr.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        def check_deprecated(param, option, suggestion):
            # Emit a one-time warning for a deprecated option; returns True
            # when the deprecated option was actually supplied.
            if self.params.get(param) is not None:
                self.report_warning(
                    '%s is deprecated. Use %s instead.' % (option, suggestion))
                return True
            return False

        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
            # Backward compatibility: honour the old option when the new one
            # was not given explicitly.
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')

        if params.get('bidi_workaround', False):
            # Spawn an external bidi filter (bidiv, falling back to fribidi)
            # connected through a pty; _bidi_workaround() pipes every output
            # line through it.
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    # bidiv is not installed -- try fribidi instead.
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # Unicode filesystem API will throw errors (#1474, #13027)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate the configured postprocessors: each dict's 'key' names
        # the class, the remaining entries are its keyword arguments.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
431
432     def warn_if_short_id(self, argv):
433         # short YouTube ID starting with dash?
434         idxs = [
435             i for i, a in enumerate(argv)
436             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
437         if idxs:
438             correct_argv = (
439                 ['youtube-dl'] +
440                 [a for i, a in enumerate(argv) if i not in idxs] +
441                 ['--'] + [argv[i] for i in idxs]
442             )
443             self.report_warning(
444                 'Long argument string detected. '
445                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
446                 args_to_str(correct_argv))
447
448     def add_info_extractor(self, ie):
449         """Add an InfoExtractor object to the end of the list."""
450         self._ies.append(ie)
451         if not isinstance(ie, type):
452             self._ies_instances[ie.ie_key()] = ie
453             ie.set_downloader(self)
454
455     def get_info_extractor(self, ie_key):
456         """
457         Get an instance of an IE with name ie_key, it will try to get one from
458         the _ies list, if there's no instance it will create a new one and add
459         it to the extractor list.
460         """
461         ie = self._ies_instances.get(ie_key)
462         if ie is None:
463             ie = get_info_extractor(ie_key)()
464             self.add_info_extractor(ie)
465         return ie
466
467     def add_default_info_extractors(self):
468         """
469         Add the InfoExtractors returned by gen_extractors to the end of the list
470         """
471         for ie in gen_extractor_classes():
472             self.add_info_extractor(ie)
473
474     def add_post_processor(self, pp):
475         """Add a PostProcessor object to the end of the chain."""
476         self._pps.append(pp)
477         pp.set_downloader(self)
478
479     def add_progress_hook(self, ph):
480         """Add the progress hook (currently only for the file downloader)"""
481         self._progress_hooks.append(ph)
482
483     def _bidi_workaround(self, message):
484         if not hasattr(self, '_output_channel'):
485             return message
486
487         assert hasattr(self, '_output_process')
488         assert isinstance(message, compat_str)
489         line_count = message.count('\n') + 1
490         self._output_process.stdin.write((message + '\n').encode('utf-8'))
491         self._output_process.stdin.flush()
492         res = ''.join(self._output_channel.readline().decode('utf-8')
493                       for _ in range(line_count))
494         return res[:-len('\n')]
495
496     def to_screen(self, message, skip_eol=False):
497         """Print message to stdout if not in quiet mode."""
498         return self.to_stdout(message, skip_eol, check_quiet=True)
499
500     def _write_string(self, s, out=None):
501         write_string(s, out=out, encoding=self.params.get('encoding'))
502
503     def to_stdout(self, message, skip_eol=False, check_quiet=False):
504         """Print message to stdout if not in quiet mode."""
505         if self.params.get('logger'):
506             self.params['logger'].debug(message)
507         elif not check_quiet or not self.params.get('quiet', False):
508             message = self._bidi_workaround(message)
509             terminator = ['\n', ''][skip_eol]
510             output = message + terminator
511
512             self._write_string(output, self._screen_file)
513
514     def to_stderr(self, message):
515         """Print message to stderr."""
516         assert isinstance(message, compat_str)
517         if self.params.get('logger'):
518             self.params['logger'].error(message)
519         else:
520             message = self._bidi_workaround(message)
521             output = message + '\n'
522             self._write_string(output, self._err_file)
523
524     def to_console_title(self, message):
525         if not self.params.get('consoletitle', False):
526             return
527         if compat_os_name == 'nt':
528             if ctypes.windll.kernel32.GetConsoleWindow():
529                 # c_wchar_p() might not be necessary if `message` is
530                 # already of type unicode()
531                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
532         elif 'TERM' in os.environ:
533             self._write_string('\033]0;%s\007' % message, self._screen_file)
534
535     def save_console_title(self):
536         if not self.params.get('consoletitle', False):
537             return
538         if self.params.get('simulate', False):
539             return
540         if compat_os_name != 'nt' and 'TERM' in os.environ:
541             # Save the title on stack
542             self._write_string('\033[22;0t', self._screen_file)
543
544     def restore_console_title(self):
545         if not self.params.get('consoletitle', False):
546             return
547         if self.params.get('simulate', False):
548             return
549         if compat_os_name != 'nt' and 'TERM' in os.environ:
550             # Restore the title from stack
551             self._write_string('\033[23;0t', self._screen_file)
552
553     def __enter__(self):
554         self.save_console_title()
555         return self
556
557     def __exit__(self, *args):
558         self.restore_console_title()
559
560         if self.params.get('cookiefile') is not None:
561             self.cookiejar.save()
562
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        # In verbose mode also dump a traceback; synthesize one from the
        # active exception (or the current stack) when none was supplied.
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # NOTE(review): an exc_info attribute on the active
                    # exception presumably points at the originally wrapped
                    # exception (e.g. from ExtractorError) -- confirm.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: dump the current call stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, preferring the wrapped exception's
            # exc_info over the wrapper's own when available.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # Errors are ignored: record failure in the process return code.
        self._download_retcode = 1
592
593     def report_warning(self, message):
594         '''
595         Print the message to stderr, it will be prefixed with 'WARNING:'
596         If stderr is a tty file the 'WARNING:' will be colored
597         '''
598         if self.params.get('logger') is not None:
599             self.params['logger'].warning(message)
600         else:
601             if self.params.get('no_warnings'):
602                 return
603             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
604                 _msg_header = '\033[0;33mWARNING:\033[0m'
605             else:
606                 _msg_header = 'WARNING:'
607             warning_message = '%s %s' % (_msg_header, message)
608             self.to_stderr(warning_message)
609
610     def report_error(self, message, tb=None):
611         '''
612         Do the same as trouble, but prefixes the message with 'ERROR:', colored
613         in red if stderr is a tty file.
614         '''
615         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
616             _msg_header = '\033[0;31mERROR:\033[0m'
617         else:
618             _msg_header = 'ERROR:'
619         error_message = '%s %s' % (_msg_header, message)
620         self.trouble(error_message, tb)
621
622     def report_file_already_downloaded(self, file_name):
623         """Report file has already been fully downloaded."""
624         try:
625             self.to_screen('[download] %s has already been downloaded' % file_name)
626         except UnicodeEncodeError:
627             self.to_screen('[download] The file has already been downloaded')
628
629     def prepare_filename(self, info_dict):
630         """Generate the output filename."""
631         try:
632             template_dict = dict(info_dict)
633
634             template_dict['epoch'] = int(time.time())
635             autonumber_size = self.params.get('autonumber_size')
636             if autonumber_size is None:
637                 autonumber_size = 5
638             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
639             if template_dict.get('resolution') is None:
640                 if template_dict.get('width') and template_dict.get('height'):
641                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
642                 elif template_dict.get('height'):
643                     template_dict['resolution'] = '%sp' % template_dict['height']
644                 elif template_dict.get('width'):
645                     template_dict['resolution'] = '%dx?' % template_dict['width']
646
647             sanitize = lambda k, v: sanitize_filename(
648                 compat_str(v),
649                 restricted=self.params.get('restrictfilenames'),
650                 is_id=(k == 'id' or k.endswith('_id')))
651             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
652                                  for k, v in template_dict.items()
653                                  if v is not None and not isinstance(v, (list, tuple, dict)))
654             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
655
656             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
657
658             # For fields playlist_index and autonumber convert all occurrences
659             # of %(field)s to %(field)0Nd for backward compatibility
660             field_size_compat_map = {
661                 'playlist_index': len(str(template_dict['n_entries'])),
662                 'autonumber': autonumber_size,
663             }
664             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
665             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
666             if mobj:
667                 outtmpl = re.sub(
668                     FIELD_SIZE_COMPAT_RE,
669                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
670                     outtmpl)
671
672             # Missing numeric fields used together with integer presentation types
673             # in format specification will break the argument substitution since
674             # string 'NA' is returned for missing fields. We will patch output
675             # template for missing fields to meet string presentation type.
676             for numeric_field in self._NUMERIC_FIELDS:
677                 if numeric_field not in template_dict:
678                     # As of [1] format syntax is:
679                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
680                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
681                     FORMAT_RE = r'''(?x)
682                         (?<!%)
683                         %
684                         \({0}\)  # mapping key
685                         (?:[#0\-+ ]+)?  # conversion flags (optional)
686                         (?:\d+)?  # minimum field width (optional)
687                         (?:\.\d+)?  # precision (optional)
688                         [hlL]?  # length modifier (optional)
689                         [diouxXeEfFgGcrs%]  # conversion type
690                     '''
691                     outtmpl = re.sub(
692                         FORMAT_RE.format(numeric_field),
693                         r'%({0})s'.format(numeric_field), outtmpl)
694
695             # expand_path translates '%%' into '%' and '$$' into '$'
696             # correspondingly that is not what we want since we need to keep
697             # '%%' intact for template dict substitution step. Working around
698             # with boundary-alike separator hack.
699             sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
700             outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
701
702             # outtmpl should be expand_path'ed before template dict substitution
703             # because meta fields may contain env variables we don't want to
704             # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
705             # title "Hello $PATH", we don't want `$PATH` to be expanded.
706             filename = expand_path(outtmpl).replace(sep, '') % template_dict
707
708             # Temporary fix for #4787
709             # 'Treat' all problem characters by passing filename through preferredencoding
710             # to workaround encoding issues with subprocess on python2 @ Windows
711             if sys.version_info < (3, 0) and sys.platform == 'win32':
712                 filename = encodeFilename(filename, True).decode(preferredencoding())
713             return sanitize_path(filename)
714         except ValueError as err:
715             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
716             return None
717
718     def _match_entry(self, info_dict, incomplete):
719         """ Returns None iff the file should be downloaded """
720
721         video_title = info_dict.get('title', info_dict.get('id', 'video'))
722         if 'title' in info_dict:
723             # This can happen when we're just evaluating the playlist
724             title = info_dict['title']
725             matchtitle = self.params.get('matchtitle', False)
726             if matchtitle:
727                 if not re.search(matchtitle, title, re.IGNORECASE):
728                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
729             rejecttitle = self.params.get('rejecttitle', False)
730             if rejecttitle:
731                 if re.search(rejecttitle, title, re.IGNORECASE):
732                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
733         date = info_dict.get('upload_date')
734         if date is not None:
735             dateRange = self.params.get('daterange', DateRange())
736             if date not in dateRange:
737                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
738         view_count = info_dict.get('view_count')
739         if view_count is not None:
740             min_views = self.params.get('min_views')
741             if min_views is not None and view_count < min_views:
742                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
743             max_views = self.params.get('max_views')
744             if max_views is not None and view_count > max_views:
745                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
746         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
747             return 'Skipping "%s" because it is age restricted' % video_title
748         if self.in_download_archive(info_dict):
749             return '%s has already been recorded in archive' % video_title
750
751         if not incomplete:
752             match_filter = self.params.get('match_filter')
753             if match_filter is not None:
754                 ret = match_filter(info_dict)
755                 if ret is not None:
756                     return ret
757
758         return None
759
760     @staticmethod
761     def add_extra_info(info_dict, extra_info):
762         '''Set the keys from extra_info in info dict if they are missing'''
763         for key, value in extra_info.items():
764             info_dict.setdefault(key, value)
765
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True, force_generic_extractor=False):
        '''
        Extract information for url, dispatching to the first suitable
        info extractor (IE).

        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result

        ie_key restricts extraction to the extractor with that key;
        force_generic_extractor forces the 'Generic' extractor when no
        ie_key is given. If process is False the raw IE result is returned
        without resolving playlists/url references.

        NOTE: extra_info is a shared mutable default; callers must not
        mutate it.
        '''

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        # Either a single, explicitly requested extractor, or the full
        # ordered list of registered extractors.
        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        for ie in ies:
            if not ie.suitable(url):
                continue

            # Re-fetch through get_info_extractor so we get an instance
            # that is bound to this YoutubeDL (params, cookies, ...).
            ie = self.get_info_extractor(ie.ie_key())
            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                self.add_default_extra_info(ie_result, ie, url)
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except GeoRestrictedError as e:
                # Report which countries the video is available in (when the
                # extractor knows) and suggest a proxy workaround.
                msg = e.msg
                if e.countries:
                    msg += '\nThis video is available in %s.' % ', '.join(
                        map(ISO3166Utils.short2full, e.countries))
                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
                self.report_error(msg)
                break
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
                break
            except MaxDownloadsReached:
                # Propagate: the caller uses this to stop the whole run.
                raise
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # for/else: no extractor accepted the URL at all.
            self.report_error('no suitable InfoExtractor for URL %s' % url)
827
828     def add_default_extra_info(self, ie_result, ie, url):
829         self.add_extra_info(ie_result, {
830             'extractor': ie.IE_NAME,
831             'webpage_url': url,
832             'webpage_url_basename': url_basename(url),
833             'extractor_key': ie.ie_key(),
834         })
835
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.

        Dispatches on ie_result['_type'] ('video' by default): plain videos
        go to process_video_result, 'url'/'url_transparent' references are
        re-extracted, and playlists are sliced and processed entry by entry
        (recursively).
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            extract_flat = self.params.get('extract_flat', False)
            # With extract_flat (--flat-playlist, or 'in_playlist' while we
            # are inside a playlist) the reference is NOT resolved further.
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # extract_info may return None when ignoreerrors is enabled and
            # extraction failed with an error, don't crash and return early
            # in this case
            if not info:
                return info

            # Non-None values from the embedding page win over the embedded
            # page, except for the identity fields deleted below.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Extracted info may not be a video result (i.e.
            # info.get('_type', 'video') != video) but rather an url or
            # url_transparent. In such cases outer metadata (from ie_result)
            # should be propagated to inner one (info). For this to happen
            # _type of info should be overridden with url_transparent. This
            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
            if new_result.get('_type') == 'url':
                new_result['_type'] = 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type in ('playlist', 'multi_video'):
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # --playlist-start is 1-based on the command line, 0-based here.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                def iter_playlistitems(format):
                    # Expand "1-3,7"-style specs into individual indices.
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = orderedSet(iter_playlistitems(playlistitems_str))

            ie_entries = ie_result['entries']

            def make_playlistitems_entries(list_ie_entries):
                # Select the requested 1-based indices, silently dropping
                # ones that fall outside the list.
                num_entries = len(list_ie_entries)
                return [
                    list_ie_entries[i - 1] for i in playlistitems
                    if -num_entries <= i - 1 < num_entries]

            def report_download(num_entries):
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, num_entries))

            # Entries may be a plain list, a lazily-paged PagedList, or any
            # other iterable; slice accordingly.
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = make_playlistitems_entries(ie_entries)
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                report_download(n_entries)
            else:  # iterable
                if playlistitems:
                    entries = make_playlistitems_entries(list(itertools.islice(
                        ie_entries, 0, max(playlistitems))))
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                report_download(n_entries)

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            if self.params.get('playlistrandom', False):
                random.shuffle(entries)

            # Geo-bypass faked IP (if any) is propagated to every entry.
            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # This __x_forwarded_for_ip thing is a bit ugly but requires
                # minimal changes
                if x_forwarded_for:
                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_uploader': ie_result.get('uploader'),
                    'playlist_uploader_id': ie_result.get('uploader_id'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # Honour --match-title and friends already at playlist stage
                # (incomplete=True: full metadata may not be known yet).
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                # Stamp each legacy entry with the parent's extractor/URL
                # metadata before recursing.
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
1030
1031     def _build_format_filter(self, filter_spec):
1032         " Returns a function to filter the formats according to the filter_spec "
1033
1034         OPERATORS = {
1035             '<': operator.lt,
1036             '<=': operator.le,
1037             '>': operator.gt,
1038             '>=': operator.ge,
1039             '=': operator.eq,
1040             '!=': operator.ne,
1041         }
1042         operator_rex = re.compile(r'''(?x)\s*
1043             (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)
1044             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1045             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1046             $
1047             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1048         m = operator_rex.search(filter_spec)
1049         if m:
1050             try:
1051                 comparison_value = int(m.group('value'))
1052             except ValueError:
1053                 comparison_value = parse_filesize(m.group('value'))
1054                 if comparison_value is None:
1055                     comparison_value = parse_filesize(m.group('value') + 'B')
1056                 if comparison_value is None:
1057                     raise ValueError(
1058                         'Invalid value %r in format specification %r' % (
1059                             m.group('value'), filter_spec))
1060             op = OPERATORS[m.group('op')]
1061
1062         if not m:
1063             STR_OPERATORS = {
1064                 '=': operator.eq,
1065                 '!=': operator.ne,
1066                 '^=': lambda attr, value: attr.startswith(value),
1067                 '$=': lambda attr, value: attr.endswith(value),
1068                 '*=': lambda attr, value: value in attr,
1069             }
1070             str_operator_rex = re.compile(r'''(?x)
1071                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1072                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1073                 \s*(?P<value>[a-zA-Z0-9._-]+)
1074                 \s*$
1075                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1076             m = str_operator_rex.search(filter_spec)
1077             if m:
1078                 comparison_value = m.group('value')
1079                 op = STR_OPERATORS[m.group('op')]
1080
1081         if not m:
1082             raise ValueError('Invalid filter specification %r' % filter_spec)
1083
1084         def _filter(f):
1085             actual_value = f.get(m.group('key'))
1086             if actual_value is None:
1087                 return m.group('none_inclusive')
1088             return op(actual_value, comparison_value)
1089         return _filter
1090
1091     def _default_format_spec(self, info_dict, download=True):
1092
1093         def can_merge():
1094             merger = FFmpegMergerPP(self)
1095             return merger.available and merger.can_merge()
1096
1097         def prefer_best():
1098             if self.params.get('simulate', False):
1099                 return False
1100             if not download:
1101                 return False
1102             if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
1103                 return True
1104             if info_dict.get('is_live'):
1105                 return True
1106             if not can_merge():
1107                 return True
1108             return False
1109
1110         req_format_list = ['bestvideo+bestaudio', 'best']
1111         if prefer_best():
1112             req_format_list.reverse()
1113         return '/'.join(req_format_list)
1114
1115     def build_format_selector(self, format_spec):
1116         def syntax_error(note, start):
1117             message = (
1118                 'Invalid format specification: '
1119                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1120             return SyntaxError(message)
1121
1122         PICKFIRST = 'PICKFIRST'
1123         MERGE = 'MERGE'
1124         SINGLE = 'SINGLE'
1125         GROUP = 'GROUP'
1126         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1127
1128         def _parse_filter(tokens):
1129             filter_parts = []
1130             for type, string, start, _, _ in tokens:
1131                 if type == tokenize.OP and string == ']':
1132                     return ''.join(filter_parts)
1133                 else:
1134                     filter_parts.append(string)
1135
1136         def _remove_unused_ops(tokens):
1137             # Remove operators that we don't use and join them with the surrounding strings
1138             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1139             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1140             last_string, last_start, last_end, last_line = None, None, None, None
1141             for type, string, start, end, line in tokens:
1142                 if type == tokenize.OP and string == '[':
1143                     if last_string:
1144                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1145                         last_string = None
1146                     yield type, string, start, end, line
1147                     # everything inside brackets will be handled by _parse_filter
1148                     for type, string, start, end, line in tokens:
1149                         yield type, string, start, end, line
1150                         if type == tokenize.OP and string == ']':
1151                             break
1152                 elif type == tokenize.OP and string in ALLOWED_OPS:
1153                     if last_string:
1154                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1155                         last_string = None
1156                     yield type, string, start, end, line
1157                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1158                     if not last_string:
1159                         last_string = string
1160                         last_start = start
1161                         last_end = end
1162                     else:
1163                         last_string += string
1164             if last_string:
1165                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1166
1167         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1168             selectors = []
1169             current_selector = None
1170             for type, string, start, _, _ in tokens:
1171                 # ENCODING is only defined in python 3.x
1172                 if type == getattr(tokenize, 'ENCODING', None):
1173                     continue
1174                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1175                     current_selector = FormatSelector(SINGLE, string, [])
1176                 elif type == tokenize.OP:
1177                     if string == ')':
1178                         if not inside_group:
1179                             # ')' will be handled by the parentheses group
1180                             tokens.restore_last_token()
1181                         break
1182                     elif inside_merge and string in ['/', ',']:
1183                         tokens.restore_last_token()
1184                         break
1185                     elif inside_choice and string == ',':
1186                         tokens.restore_last_token()
1187                         break
1188                     elif string == ',':
1189                         if not current_selector:
1190                             raise syntax_error('"," must follow a format selector', start)
1191                         selectors.append(current_selector)
1192                         current_selector = None
1193                     elif string == '/':
1194                         if not current_selector:
1195                             raise syntax_error('"/" must follow a format selector', start)
1196                         first_choice = current_selector
1197                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1198                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1199                     elif string == '[':
1200                         if not current_selector:
1201                             current_selector = FormatSelector(SINGLE, 'best', [])
1202                         format_filter = _parse_filter(tokens)
1203                         current_selector.filters.append(format_filter)
1204                     elif string == '(':
1205                         if current_selector:
1206                             raise syntax_error('Unexpected "("', start)
1207                         group = _parse_format_selection(tokens, inside_group=True)
1208                         current_selector = FormatSelector(GROUP, group, [])
1209                     elif string == '+':
1210                         video_selector = current_selector
1211                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1212                         if not video_selector or not audio_selector:
1213                             raise syntax_error('"+" must be between two format selectors', start)
1214                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1215                     else:
1216                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1217                 elif type == tokenize.ENDMARKER:
1218                     break
1219             if current_selector:
1220                 selectors.append(current_selector)
1221             return selectors
1222
1223         def _build_selector_function(selector):
1224             if isinstance(selector, list):
1225                 fs = [_build_selector_function(s) for s in selector]
1226
1227                 def selector_function(ctx):
1228                     for f in fs:
1229                         for format in f(ctx):
1230                             yield format
1231                 return selector_function
1232             elif selector.type == GROUP:
1233                 selector_function = _build_selector_function(selector.selector)
1234             elif selector.type == PICKFIRST:
1235                 fs = [_build_selector_function(s) for s in selector.selector]
1236
1237                 def selector_function(ctx):
1238                     for f in fs:
1239                         picked_formats = list(f(ctx))
1240                         if picked_formats:
1241                             return picked_formats
1242                     return []
1243             elif selector.type == SINGLE:
1244                 format_spec = selector.selector
1245
1246                 def selector_function(ctx):
1247                     formats = list(ctx['formats'])
1248                     if not formats:
1249                         return
1250                     if format_spec == 'all':
1251                         for f in formats:
1252                             yield f
1253                     elif format_spec in ['best', 'worst', None]:
1254                         format_idx = 0 if format_spec == 'worst' else -1
1255                         audiovideo_formats = [
1256                             f for f in formats
1257                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1258                         if audiovideo_formats:
1259                             yield audiovideo_formats[format_idx]
1260                         # for extractors with incomplete formats (audio only (soundcloud)
1261                         # or video only (imgur)) we will fallback to best/worst
1262                         # {video,audio}-only format
1263                         elif ctx['incomplete_formats']:
1264                             yield formats[format_idx]
1265                     elif format_spec == 'bestaudio':
1266                         audio_formats = [
1267                             f for f in formats
1268                             if f.get('vcodec') == 'none']
1269                         if audio_formats:
1270                             yield audio_formats[-1]
1271                     elif format_spec == 'worstaudio':
1272                         audio_formats = [
1273                             f for f in formats
1274                             if f.get('vcodec') == 'none']
1275                         if audio_formats:
1276                             yield audio_formats[0]
1277                     elif format_spec == 'bestvideo':
1278                         video_formats = [
1279                             f for f in formats
1280                             if f.get('acodec') == 'none']
1281                         if video_formats:
1282                             yield video_formats[-1]
1283                     elif format_spec == 'worstvideo':
1284                         video_formats = [
1285                             f for f in formats
1286                             if f.get('acodec') == 'none']
1287                         if video_formats:
1288                             yield video_formats[0]
1289                     else:
1290                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1291                         if format_spec in extensions:
1292                             filter_f = lambda f: f['ext'] == format_spec
1293                         else:
1294                             filter_f = lambda f: f['format_id'] == format_spec
1295                         matches = list(filter(filter_f, formats))
1296                         if matches:
1297                             yield matches[-1]
1298             elif selector.type == MERGE:
1299                 def _merge(formats_info):
1300                     format_1, format_2 = [f['format_id'] for f in formats_info]
1301                     # The first format must contain the video and the
1302                     # second the audio
1303                     if formats_info[0].get('vcodec') == 'none':
1304                         self.report_error('The first format must '
1305                                           'contain the video, try using '
1306                                           '"-f %s+%s"' % (format_2, format_1))
1307                         return
1308                     # Formats must be opposite (video+audio)
1309                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1310                         self.report_error(
1311                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1312                             % (format_1, format_2))
1313                         return
1314                     output_ext = (
1315                         formats_info[0]['ext']
1316                         if self.params.get('merge_output_format') is None
1317                         else self.params['merge_output_format'])
1318                     return {
1319                         'requested_formats': formats_info,
1320                         'format': '%s+%s' % (formats_info[0].get('format'),
1321                                              formats_info[1].get('format')),
1322                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1323                                                 formats_info[1].get('format_id')),
1324                         'width': formats_info[0].get('width'),
1325                         'height': formats_info[0].get('height'),
1326                         'resolution': formats_info[0].get('resolution'),
1327                         'fps': formats_info[0].get('fps'),
1328                         'vcodec': formats_info[0].get('vcodec'),
1329                         'vbr': formats_info[0].get('vbr'),
1330                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1331                         'acodec': formats_info[1].get('acodec'),
1332                         'abr': formats_info[1].get('abr'),
1333                         'ext': output_ext,
1334                     }
1335                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1336
1337                 def selector_function(ctx):
1338                     for pair in itertools.product(
1339                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1340                         yield _merge(pair)
1341
1342             filters = [self._build_format_filter(f) for f in selector.filters]
1343
1344             def final_selector(ctx):
1345                 ctx_copy = copy.deepcopy(ctx)
1346                 for _filter in filters:
1347                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1348                 return selector_function(ctx_copy)
1349             return final_selector
1350
1351         stream = io.BytesIO(format_spec.encode('utf-8'))
1352         try:
1353             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1354         except tokenize.TokenError:
1355             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1356
1357         class TokenIterator(object):
1358             def __init__(self, tokens):
1359                 self.tokens = tokens
1360                 self.counter = 0
1361
1362             def __iter__(self):
1363                 return self
1364
1365             def __next__(self):
1366                 if self.counter >= len(self.tokens):
1367                     raise StopIteration()
1368                 value = self.tokens[self.counter]
1369                 self.counter += 1
1370                 return value
1371
1372             next = __next__
1373
1374             def restore_last_token(self):
1375                 self.counter -= 1
1376
1377         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1378         return _build_selector_function(parsed_selector)
1379
1380     def _calc_headers(self, info_dict):
1381         res = std_headers.copy()
1382
1383         add_headers = info_dict.get('http_headers')
1384         if add_headers:
1385             res.update(add_headers)
1386
1387         cookies = self._calc_cookies(info_dict)
1388         if cookies:
1389             res['Cookie'] = cookies
1390
1391         if 'X-Forwarded-For' not in res:
1392             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1393             if x_forwarded_for_ip:
1394                 res['X-Forwarded-For'] = x_forwarded_for_ip
1395
1396         return res
1397
    def _calc_cookies(self, info_dict):
        # Compute the Cookie header value that the cookiejar would send for
        # this video's URL, by letting the jar annotate a throwaway request.
        # Returns None when no cookie applies (get_header on a missing header).
        pr = sanitized_Request(info_dict['url'])
        self.cookiejar.add_cookie_header(pr)
        return pr.get_header('Cookie')
1402
    def process_video_result(self, info_dict, download=True):
        """Sanitize and complete a single-video extractor result, select the
        requested format(s) and, when download is True, hand each selected
        format to process_info().

        info_dict is modified in place (thumbnails, subtitles, formats and
        several derived fields are normalized) and the same dict, updated
        with the best matching format, is returned.

        Raises ExtractorError when 'id'/'title' are missing, when no format
        has a usable URL, or when no format satisfies the format spec.
        """
        assert info_dict.get('_type', 'video') == 'video'

        # Fail fast on results missing the two mandatory fields.
        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        def report_force_conversion(field, field_not, conversion):
            # Warn that an extractor produced a field of the wrong type.
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a non-string value to compat_str (with a warning).
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce every known numeric field to int (with a warning).
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Normalize thumbnails: promote a lone 'thumbnail' into the
        # 'thumbnails' list, sort worst-to-best by preference/size, sanitize
        # URLs and fill in ids and 'WxH' resolution strings.
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '', t.get('url')))
            for i, t in enumerate(thumbnails):
                t['url'] = sanitize_url(t['url'])
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # Last entry is the best one after the sort above
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
            # see http://bugs.python.org/issue1646728)
            try:
                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
            except (ValueError, OverflowError, OSError):
                pass

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle URLs and derive missing extensions from the URL.
        subtitles = info_dict.get('subtitles')
        if subtitles:
            for _, subtitle in subtitles.items():
                for subtitle_format in subtitle:
                    if subtitle_format.get('url'):
                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                    if subtitle_format.get('ext') is None:
                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles,
            info_dict.get('automatic_captions'))

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        def is_wellformed(f):
            # A format without a URL cannot be downloaded - drop it.
            url = f.get('url')
            if not url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
                return False
            if isinstance(url, bytes):
                sanitize_string_field(f, 'url')
            return True

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats))

        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if not format.get('format_id'):
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        # Fill in derived per-format fields: human-readable 'format' string,
        # extension, protocol and the HTTP headers for external consumers.
        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            req_format = self._default_format_spec(info_dict, download=download)
            if self.params.get('verbose'):
                self.to_stdout('[debug] Default format spec: %s' % req_format)

        format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/rg3/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/rg3/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
            # all formats are audio-only
            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1634
1635     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1636         """Select the requested subtitles and their format"""
1637         available_subs = {}
1638         if normal_subtitles and self.params.get('writesubtitles'):
1639             available_subs.update(normal_subtitles)
1640         if automatic_captions and self.params.get('writeautomaticsub'):
1641             for lang, cap_info in automatic_captions.items():
1642                 if lang not in available_subs:
1643                     available_subs[lang] = cap_info
1644
1645         if (not self.params.get('writesubtitles') and not
1646                 self.params.get('writeautomaticsub') or not
1647                 available_subs):
1648             return None
1649
1650         if self.params.get('allsubtitles', False):
1651             requested_langs = available_subs.keys()
1652         else:
1653             if self.params.get('subtitleslangs', False):
1654                 requested_langs = self.params.get('subtitleslangs')
1655             elif 'en' in available_subs:
1656                 requested_langs = ['en']
1657             else:
1658                 requested_langs = [list(available_subs.keys())[0]]
1659
1660         formats_query = self.params.get('subtitlesformat', 'best')
1661         formats_preference = formats_query.split('/') if formats_query else []
1662         subs = {}
1663         for lang in requested_langs:
1664             formats = available_subs.get(lang)
1665             if formats is None:
1666                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1667                 continue
1668             for ext in formats_preference:
1669                 if ext == 'best':
1670                     f = formats[-1]
1671                     break
1672                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1673                 if matches:
1674                     f = matches[-1]
1675                     break
1676             else:
1677                 f = formats[-1]
1678                 self.report_warning(
1679                     'No subtitle format found matching "%s" for language %s, '
1680                     'using %s' % (formats_query, lang, f['ext']))
1681             subs[lang] = f
1682         return subs
1683
1684     def process_info(self, info_dict):
1685         """Process a single resolved IE result."""
1686
1687         assert info_dict.get('_type', 'video') == 'video'
1688
1689         max_downloads = self.params.get('max_downloads')
1690         if max_downloads is not None:
1691             if self._num_downloads >= int(max_downloads):
1692                 raise MaxDownloadsReached()
1693
1694         info_dict['fulltitle'] = info_dict['title']
1695         if len(info_dict['title']) > 200:
1696             info_dict['title'] = info_dict['title'][:197] + '...'
1697
1698         if 'format' not in info_dict:
1699             info_dict['format'] = info_dict['ext']
1700
1701         reason = self._match_entry(info_dict, incomplete=False)
1702         if reason is not None:
1703             self.to_screen('[download] ' + reason)
1704             return
1705
1706         self._num_downloads += 1
1707
1708         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1709
1710         # Forced printings
1711         if self.params.get('forcetitle', False):
1712             self.to_stdout(info_dict['fulltitle'])
1713         if self.params.get('forceid', False):
1714             self.to_stdout(info_dict['id'])
1715         if self.params.get('forceurl', False):
1716             if info_dict.get('requested_formats') is not None:
1717                 for f in info_dict['requested_formats']:
1718                     self.to_stdout(f['url'] + f.get('play_path', ''))
1719             else:
1720                 # For RTMP URLs, also include the playpath
1721                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1722         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1723             self.to_stdout(info_dict['thumbnail'])
1724         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1725             self.to_stdout(info_dict['description'])
1726         if self.params.get('forcefilename', False) and filename is not None:
1727             self.to_stdout(filename)
1728         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1729             self.to_stdout(formatSeconds(info_dict['duration']))
1730         if self.params.get('forceformat', False):
1731             self.to_stdout(info_dict['format'])
1732         if self.params.get('forcejson', False):
1733             self.to_stdout(json.dumps(info_dict))
1734
1735         # Do nothing else if in simulate mode
1736         if self.params.get('simulate', False):
1737             return
1738
1739         if filename is None:
1740             return
1741
1742         def ensure_dir_exists(path):
1743             try:
1744                 dn = os.path.dirname(path)
1745                 if dn and not os.path.exists(dn):
1746                     os.makedirs(dn)
1747                 return True
1748             except (OSError, IOError) as err:
1749                 self.report_error('unable to create directory ' + error_to_compat_str(err))
1750                 return False
1751
1752         if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
1753             return
1754
1755         if self.params.get('writedescription', False):
1756             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1757             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1758                 self.to_screen('[info] Video description is already present')
1759             elif info_dict.get('description') is None:
1760                 self.report_warning('There\'s no description to write.')
1761             else:
1762                 try:
1763                     self.to_screen('[info] Writing video description to: ' + descfn)
1764                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1765                         descfile.write(info_dict['description'])
1766                 except (OSError, IOError):
1767                     self.report_error('Cannot write description file ' + descfn)
1768                     return
1769
1770         if self.params.get('writeannotations', False):
1771             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1772             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1773                 self.to_screen('[info] Video annotations are already present')
1774             else:
1775                 try:
1776                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1777                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1778                         annofile.write(info_dict['annotations'])
1779                 except (KeyError, TypeError):
1780                     self.report_warning('There are no annotations to write.')
1781                 except (OSError, IOError):
1782                     self.report_error('Cannot write annotations file: ' + annofn)
1783                     return
1784
1785         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1786                                        self.params.get('writeautomaticsub')])
1787
1788         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1789             # subtitles download errors are already managed as troubles in relevant IE
1790             # that way it will silently go on when used with unsupporting IE
1791             subtitles = info_dict['requested_subtitles']
1792             ie = self.get_info_extractor(info_dict['extractor_key'])
1793             for sub_lang, sub_info in subtitles.items():
1794                 sub_format = sub_info['ext']
1795                 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1796                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1797                     self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
1798                 else:
1799                     self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1800                     if sub_info.get('data') is not None:
1801                         try:
1802                             # Use newline='' to prevent conversion of newline characters
1803                             # See https://github.com/rg3/youtube-dl/issues/10268
1804                             with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1805                                 subfile.write(sub_info['data'])
1806                         except (OSError, IOError):
1807                             self.report_error('Cannot write subtitles file ' + sub_filename)
1808                             return
1809                     else:
1810                         try:
1811                             sub_data = ie._request_webpage(
1812                                 sub_info['url'], info_dict['id'], note=False).read()
1813                             with io.open(encodeFilename(sub_filename), 'wb') as subfile:
1814                                 subfile.write(sub_data)
1815                         except (ExtractorError, IOError, OSError, ValueError) as err:
1816                             self.report_warning('Unable to download subtitle for "%s": %s' %
1817                                                 (sub_lang, error_to_compat_str(err)))
1818                             continue
1819
1820         if self.params.get('writeinfojson', False):
1821             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1822             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1823                 self.to_screen('[info] Video description metadata is already present')
1824             else:
1825                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1826                 try:
1827                     write_json_file(self.filter_requested_info(info_dict), infofn)
1828                 except (OSError, IOError):
1829                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1830                     return
1831
1832         self._write_thumbnails(info_dict, filename)
1833
1834         if not self.params.get('skip_download', False):
1835             try:
1836                 def dl(name, info):
1837                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1838                     for ph in self._progress_hooks:
1839                         fd.add_progress_hook(ph)
1840                     if self.params.get('verbose'):
1841                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1842                     return fd.download(name, info)
1843
1844                 if info_dict.get('requested_formats') is not None:
1845                     downloaded = []
1846                     success = True
1847                     merger = FFmpegMergerPP(self)
1848                     if not merger.available:
1849                         postprocessors = []
1850                         self.report_warning('You have requested multiple '
1851                                             'formats but ffmpeg or avconv are not installed.'
1852                                             ' The formats won\'t be merged.')
1853                     else:
1854                         postprocessors = [merger]
1855
1856                     def compatible_formats(formats):
1857                         video, audio = formats
1858                         # Check extension
1859                         video_ext, audio_ext = video.get('ext'), audio.get('ext')
1860                         if video_ext and audio_ext:
1861                             COMPATIBLE_EXTS = (
1862                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1863                                 ('webm')
1864                             )
1865                             for exts in COMPATIBLE_EXTS:
1866                                 if video_ext in exts and audio_ext in exts:
1867                                     return True
1868                         # TODO: Check acodec/vcodec
1869                         return False
1870
1871                     filename_real_ext = os.path.splitext(filename)[1][1:]
1872                     filename_wo_ext = (
1873                         os.path.splitext(filename)[0]
1874                         if filename_real_ext == info_dict['ext']
1875                         else filename)
1876                     requested_formats = info_dict['requested_formats']
1877                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1878                         info_dict['ext'] = 'mkv'
1879                         self.report_warning(
1880                             'Requested formats are incompatible for merge and will be merged into mkv.')
1881                     # Ensure filename always has a correct extension for successful merge
1882                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1883                     if os.path.exists(encodeFilename(filename)):
1884                         self.to_screen(
1885                             '[download] %s has already been downloaded and '
1886                             'merged' % filename)
1887                     else:
1888                         for f in requested_formats:
1889                             new_info = dict(info_dict)
1890                             new_info.update(f)
1891                             fname = prepend_extension(
1892                                 self.prepare_filename(new_info),
1893                                 'f%s' % f['format_id'], new_info['ext'])
1894                             if not ensure_dir_exists(fname):
1895                                 return
1896                             downloaded.append(fname)
1897                             partial_success = dl(fname, new_info)
1898                             success = success and partial_success
1899                         info_dict['__postprocessors'] = postprocessors
1900                         info_dict['__files_to_merge'] = downloaded
1901                 else:
1902                     # Just a single file
1903                     success = dl(filename, info_dict)
1904             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1905                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1906                 return
1907             except (OSError, IOError) as err:
1908                 raise UnavailableVideoError(err)
1909             except (ContentTooShortError, ) as err:
1910                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1911                 return
1912
1913             if success and filename != '-':
1914                 # Fixup content
1915                 fixup_policy = self.params.get('fixup')
1916                 if fixup_policy is None:
1917                     fixup_policy = 'detect_or_warn'
1918
1919                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1920
1921                 stretched_ratio = info_dict.get('stretched_ratio')
1922                 if stretched_ratio is not None and stretched_ratio != 1:
1923                     if fixup_policy == 'warn':
1924                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1925                             info_dict['id'], stretched_ratio))
1926                     elif fixup_policy == 'detect_or_warn':
1927                         stretched_pp = FFmpegFixupStretchedPP(self)
1928                         if stretched_pp.available:
1929                             info_dict.setdefault('__postprocessors', [])
1930                             info_dict['__postprocessors'].append(stretched_pp)
1931                         else:
1932                             self.report_warning(
1933                                 '%s: Non-uniform pixel ratio (%s). %s'
1934                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1935                     else:
1936                         assert fixup_policy in ('ignore', 'never')
1937
1938                 if (info_dict.get('requested_formats') is None and
1939                         info_dict.get('container') == 'm4a_dash'):
1940                     if fixup_policy == 'warn':
1941                         self.report_warning(
1942                             '%s: writing DASH m4a. '
1943                             'Only some players support this container.'
1944                             % info_dict['id'])
1945                     elif fixup_policy == 'detect_or_warn':
1946                         fixup_pp = FFmpegFixupM4aPP(self)
1947                         if fixup_pp.available:
1948                             info_dict.setdefault('__postprocessors', [])
1949                             info_dict['__postprocessors'].append(fixup_pp)
1950                         else:
1951                             self.report_warning(
1952                                 '%s: writing DASH m4a. '
1953                                 'Only some players support this container. %s'
1954                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1955                     else:
1956                         assert fixup_policy in ('ignore', 'never')
1957
1958                 if (info_dict.get('protocol') == 'm3u8_native' or
1959                         info_dict.get('protocol') == 'm3u8' and
1960                         self.params.get('hls_prefer_native')):
1961                     if fixup_policy == 'warn':
1962                         self.report_warning('%s: malformed AAC bitstream detected.' % (
1963                             info_dict['id']))
1964                     elif fixup_policy == 'detect_or_warn':
1965                         fixup_pp = FFmpegFixupM3u8PP(self)
1966                         if fixup_pp.available:
1967                             info_dict.setdefault('__postprocessors', [])
1968                             info_dict['__postprocessors'].append(fixup_pp)
1969                         else:
1970                             self.report_warning(
1971                                 '%s: malformed AAC bitstream detected. %s'
1972                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1973                     else:
1974                         assert fixup_policy in ('ignore', 'never')
1975
1976                 try:
1977                     self.post_process(filename, info_dict)
1978                 except (PostProcessingError) as err:
1979                     self.report_error('postprocessing: %s' % str(err))
1980                     return
1981                 self.record_download_archive(info_dict)
1982
1983     def download(self, url_list):
1984         """Download a given list of URLs."""
1985         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1986         if (len(url_list) > 1 and
1987                 outtmpl != '-' and
1988                 '%' not in outtmpl and
1989                 self.params.get('max_downloads') != 1):
1990             raise SameFileError(outtmpl)
1991
1992         for url in url_list:
1993             try:
1994                 # It also downloads the videos
1995                 res = self.extract_info(
1996                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1997             except UnavailableVideoError:
1998                 self.report_error('unable to download video')
1999             except MaxDownloadsReached:
2000                 self.to_screen('[info] Maximum number of downloaded files reached.')
2001                 raise
2002             else:
2003                 if self.params.get('dump_single_json', False):
2004                     self.to_stdout(json.dumps(res))
2005
2006         return self._download_retcode
2007
2008     def download_with_info_file(self, info_filename):
2009         with contextlib.closing(fileinput.FileInput(
2010                 [info_filename], mode='r',
2011                 openhook=fileinput.hook_encoded('utf-8'))) as f:
2012             # FileInput doesn't have a read method, we can't call json.load
2013             info = self.filter_requested_info(json.loads('\n'.join(f)))
2014         try:
2015             self.process_ie_result(info, download=True)
2016         except DownloadError:
2017             webpage_url = info.get('webpage_url')
2018             if webpage_url is not None:
2019                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2020                 return self.download([webpage_url])
2021             else:
2022                 raise
2023         return self._download_retcode
2024
2025     @staticmethod
2026     def filter_requested_info(info_dict):
2027         return dict(
2028             (k, v) for k, v in info_dict.items()
2029             if k not in ['requested_formats', 'requested_subtitles'])
2030
2031     def post_process(self, filename, ie_info):
2032         """Run all the postprocessors on the given file."""
2033         info = dict(ie_info)
2034         info['filepath'] = filename
2035         pps_chain = []
2036         if ie_info.get('__postprocessors') is not None:
2037             pps_chain.extend(ie_info['__postprocessors'])
2038         pps_chain.extend(self._pps)
2039         for pp in pps_chain:
2040             files_to_delete = []
2041             try:
2042                 files_to_delete, info = pp.run(info)
2043             except PostProcessingError as e:
2044                 self.report_error(e.msg)
2045             if files_to_delete and not self.params.get('keepvideo', False):
2046                 for old_filename in files_to_delete:
2047                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2048                     try:
2049                         os.remove(encodeFilename(old_filename))
2050                     except (IOError, OSError):
2051                         self.report_warning('Unable to remove downloaded original file')
2052
2053     def _make_archive_id(self, info_dict):
2054         # Future-proof against any change in case
2055         # and backwards compatibility with prior versions
2056         extractor = info_dict.get('extractor_key')
2057         if extractor is None:
2058             if 'id' in info_dict:
2059                 extractor = info_dict.get('ie_key')  # key in a playlist
2060         if extractor is None:
2061             return None  # Incomplete video information
2062         return extractor.lower() + ' ' + info_dict['id']
2063
2064     def in_download_archive(self, info_dict):
2065         fn = self.params.get('download_archive')
2066         if fn is None:
2067             return False
2068
2069         vid_id = self._make_archive_id(info_dict)
2070         if vid_id is None:
2071             return False  # Incomplete video information
2072
2073         try:
2074             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
2075                 for line in archive_file:
2076                     if line.strip() == vid_id:
2077                         return True
2078         except IOError as ioe:
2079             if ioe.errno != errno.ENOENT:
2080                 raise
2081         return False
2082
2083     def record_download_archive(self, info_dict):
2084         fn = self.params.get('download_archive')
2085         if fn is None:
2086             return
2087         vid_id = self._make_archive_id(info_dict)
2088         assert vid_id
2089         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2090             archive_file.write(vid_id + '\n')
2091
2092     @staticmethod
2093     def format_resolution(format, default='unknown'):
2094         if format.get('vcodec') == 'none':
2095             return 'audio only'
2096         if format.get('resolution') is not None:
2097             return format['resolution']
2098         if format.get('height') is not None:
2099             if format.get('width') is not None:
2100                 res = '%sx%s' % (format['width'], format['height'])
2101             else:
2102                 res = '%sp' % format['height']
2103         elif format.get('width') is not None:
2104             res = '%dx?' % format['width']
2105         else:
2106             res = default
2107         return res
2108
    def _format_note(self, fdict):
        """Build the human-readable 'note' column shown by --list-formats.

        Pieces (bitrates, container, codecs, fps, filesize, ...) are
        appended in a fixed order, mostly separated by ', ' once the
        string is non-empty.
        """
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None and
                fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            # '@' glues the video codec to the vbr value appended below
            if fdict.get('vbr') is not None:
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # No usable vcodec but separate video/audio bitrates are known
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
2164
2165     def list_formats(self, info_dict):
2166         formats = info_dict.get('formats', [info_dict])
2167         table = [
2168             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2169             for f in formats
2170             if f.get('preference') is None or f['preference'] >= -1000]
2171         if len(formats) > 1:
2172             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2173
2174         header_line = ['format code', 'extension', 'resolution', 'note']
2175         self.to_screen(
2176             '[info] Available formats for %s:\n%s' %
2177             (info_dict['id'], render_table(header_line, table)))
2178
2179     def list_thumbnails(self, info_dict):
2180         thumbnails = info_dict.get('thumbnails')
2181         if not thumbnails:
2182             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2183             return
2184
2185         self.to_screen(
2186             '[info] Thumbnails for %s:' % info_dict['id'])
2187         self.to_screen(render_table(
2188             ['ID', 'width', 'height', 'URL'],
2189             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2190
2191     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2192         if not subtitles:
2193             self.to_screen('%s has no %s' % (video_id, name))
2194             return
2195         self.to_screen(
2196             'Available %s for %s:' % (name, video_id))
2197         self.to_screen(render_table(
2198             ['Language', 'formats'],
2199             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2200                 for lang, formats in subtitles.items()]))
2201
2202     def urlopen(self, req):
2203         """ Start an HTTP download """
2204         if isinstance(req, compat_basestring):
2205             req = sanitized_Request(req)
2206         return self._opener.open(req, timeout=self._socket_timeout)
2207
    def print_debug_header(self):
        """Print verbose debugging information about the environment.

        No-op unless the 'verbose' param is set. Reports encodings, the
        youtube-dl version, the git revision (when run from a checkout),
        Python/platform details, helper-program versions, the proxy map and,
        with --call-home, the public IP address and latest released version.
        """
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # Report the encodings in play; helps diagnose mojibake/IO bugs
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        # Best-effort: report the git revision when running from a checkout
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            try:
                # sys.exc_clear() exists on Python 2 only; clear the stored
                # exception so it does not leak into later tracebacks
                sys.exc_clear()
            except Exception:
                pass

        def python_implementation():
            # Append the PyPy version triple when available
            impl_name = platform.python_implementation()
            if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
                return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
            return impl_name

        self._write_string('[debug] Python version %s (%s) - %s\n' % (
            platform.python_version(), python_implementation(),
            platform_name()))

        # Versions of external helper programs (ffmpeg/avconv, rtmpdump, phantomjs)
        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_versions['phantomjs'] = PhantomJSwrapper._version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Collect the proxies configured on any of the opener's handlers
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # Opt-in: query yt-dl.org for our public IP and the newest release
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2283
    def _setup_opener(self):
        """Build the urllib opener (self._opener) used for all HTTP requests.

        Wires together cookie handling, per-request proxies, the HTTPS
        handler, data: URL support and a disabled file: handler, honouring
        the 'socket_timeout', 'cookiefile', 'proxy' and 'debug_printtraffic'
        params. Also sets self._socket_timeout and self.cookiejar.
        """
        timeout_val = self.params.get('socket_timeout')
        # Default socket timeout: 600 seconds
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # No cookie file configured: keep cookies in memory only
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            opts_cookiefile = expand_path(opts_cookiefile)
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Load existing cookies only if the file is readable
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An explicit empty --proxy disables proxying entirely
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            # Fall back to environment/system proxy settings
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2336
2337     def encode(self, s):
2338         if isinstance(s, bytes):
2339             return s  # Already encoded
2340
2341         try:
2342             return s.encode(self.get_encoding())
2343         except UnicodeEncodeError as err:
2344             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2345             raise
2346
2347     def get_encoding(self):
2348         encoding = self.params.get('encoding')
2349         if encoding is None:
2350             encoding = preferredencoding()
2351         return encoding
2352
2353     def _write_thumbnails(self, info_dict, filename):
2354         if self.params.get('writethumbnail', False):
2355             thumbnails = info_dict.get('thumbnails')
2356             if thumbnails:
2357                 thumbnails = [thumbnails[-1]]
2358         elif self.params.get('write_all_thumbnails', False):
2359             thumbnails = info_dict.get('thumbnails')
2360         else:
2361             return
2362
2363         if not thumbnails:
2364             # No thumbnails present, so return immediately
2365             return
2366
2367         for t in thumbnails:
2368             thumb_ext = determine_ext(t['url'], 'jpg')
2369             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2370             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2371             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2372
2373             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2374                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2375                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2376             else:
2377                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2378                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2379                 try:
2380                     uf = self.urlopen(t['url'])
2381                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2382                         shutil.copyfileobj(uf, thumbf)
2383                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2384                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2385                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2386                     self.report_warning('Unable to download thumbnail "%s": %s' %
2387                                         (t['url'], error_to_compat_str(err)))