[YoutubeDL] Sanitize outtmpl as it may contain forbidden characters
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import datetime
9 import errno
10 import fileinput
11 import io
12 import itertools
13 import json
14 import locale
15 import operator
16 import os
17 import platform
18 import re
19 import shutil
20 import subprocess
21 import socket
22 import sys
23 import time
24 import traceback
25
26 if os.name == 'nt':
27     import ctypes
28
29 from .compat import (
30     compat_basestring,
31     compat_cookiejar,
32     compat_expanduser,
33     compat_get_terminal_size,
34     compat_http_client,
35     compat_kwargs,
36     compat_str,
37     compat_urllib_error,
38     compat_urllib_request,
39 )
40 from .utils import (
41     escape_url,
42     ContentTooShortError,
43     date_from_str,
44     DateRange,
45     DEFAULT_OUTTMPL,
46     determine_ext,
47     DownloadError,
48     encodeFilename,
49     ExtractorError,
50     format_bytes,
51     formatSeconds,
52     locked_file,
53     make_HTTPS_handler,
54     MaxDownloadsReached,
55     PagedList,
56     parse_filesize,
57     PerRequestProxyHandler,
58     PostProcessingError,
59     platform_name,
60     preferredencoding,
61     render_table,
62     SameFileError,
63     sanitize_filename,
64     std_headers,
65     subtitles_filename,
66     takewhile_inclusive,
67     UnavailableVideoError,
68     url_basename,
69     version_tuple,
70     write_json_file,
71     write_string,
72     YoutubeDLHandler,
73     prepend_extension,
74     args_to_str,
75     age_restricted,
76 )
77 from .cache import Cache
78 from .extractor import get_info_extractor, gen_extractors
79 from .downloader import get_suitable_downloader
80 from .downloader.rtmp import rtmpdump_version
81 from .postprocessor import (
82     FFmpegFixupM4aPP,
83     FFmpegFixupStretchedPP,
84     FFmpegMergerPP,
85     FFmpegPostProcessor,
86     get_postprocessor,
87 )
88 from .version import __version__
89
90
class YoutubeDL(object):
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, a task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forceid:           Force printing ID.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    forceduration:     Force printing duration.
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    simulate:          Do not download the video files.
    format:            Video format code. See options.py for more information.
    format_limit:      Highest quality format to try.
    outtmpl:           Template for output names.
    restrictfilenames: Do not allow "&" and spaces in file names
    ignoreerrors:      Do not stop on download errors.
    nooverwrites:      Prevent overwriting files.
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    playlist_items:    Specific indices of playlist to download.
    playlistreverse:   Download playlist items in reverse order.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video description to a .info.json file
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    write_all_thumbnails:  Write all thumbnail formats to files
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatic subtitles to a file
    allsubtitles:      Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    cookiefile:        File name where cookies should be read from and dumped to.
    nocheckcertificate:Do not verify SSL certificates
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
    proxy:             URL of the proxy server to use
    cn_verification_proxy:  URL of the proxy to use for IP address verification
                       on Chinese sites. (Experimental)
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic:Print out sent and received HTTP traffic
    include_ads:       Download ads as well
    default_search:    Prepend this string if an input url is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.
    extract_flat:      Do not resolve URLs, return the immediate result.
                       Pass in 'in_playlist' to only show this behavior for
                       playlist items.
    postprocessors:    A list of dictionaries, each with an entry
                       * key:  The name of the postprocessor. See
                               youtube_dl/postprocessor/__init__.py for a list.
                       as well as any further keyword arguments for the
                       postprocessor.
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                                 Check this first and ignore unknown values.

                       If status is one of "downloading", or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                                               None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                                unknown
                       * fragment_index: The counter of the currently
                                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful.
    merge_output_format: Extension to use when merging formats.
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                                           about it, warn otherwise (default)
    source_address:    (Experimental) Client-side IP address to bind to.
    call_home:         Boolean, true iff we are allowed to contact the
                       youtube-dl servers for debugging.
    sleep_interval:    Number of seconds to sleep before each download.
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called with the info_dict of
                       every video.
                       If it returns a message, the video is ignored.
                       If it returns None, the video is downloaded.
                       match_filter_func in utils.py is one example for this.
    no_color:          Do not emit color codes in output.

    The following options determine which downloader is picked:
    external_downloader: Executable of the external downloader to call.
                       None or unset for standard (built-in) downloader.
    hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv.

    The following parameters are not used by YoutubeDL itself, they are used by
    the downloader (see youtube_dl/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
    noresizebuffer, retries, continuedl, noprogress, consoletitle,
    xattr_set_filesize, external_downloader_args.

    The following options are used by the post processors:
    prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
                       otherwise prefer avconv.
    exec_cmd:          Arbitrary command to run after downloading
    """

    # Class-level defaults that document the instance attributes; every
    # instance re-initializes all of these in __init__.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
272
273     def __init__(self, params=None, auto_init=True):
274         """Create a FileDownloader object with the given options."""
275         if params is None:
276             params = {}
277         self._ies = []
278         self._ies_instances = {}
279         self._pps = []
280         self._progress_hooks = []
281         self._download_retcode = 0
282         self._num_downloads = 0
283         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
284         self._err_file = sys.stderr
285         self.params = params
286         self.cache = Cache(self)
287
288         if params.get('bidi_workaround', False):
289             try:
290                 import pty
291                 master, slave = pty.openpty()
292                 width = compat_get_terminal_size().columns
293                 if width is None:
294                     width_args = []
295                 else:
296                     width_args = ['-w', str(width)]
297                 sp_kwargs = dict(
298                     stdin=subprocess.PIPE,
299                     stdout=slave,
300                     stderr=self._err_file)
301                 try:
302                     self._output_process = subprocess.Popen(
303                         ['bidiv'] + width_args, **sp_kwargs
304                     )
305                 except OSError:
306                     self._output_process = subprocess.Popen(
307                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
308                 self._output_channel = os.fdopen(master, 'rb')
309             except OSError as ose:
310                 if ose.errno == 2:
311                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
312                 else:
313                     raise
314
315         if (sys.version_info >= (3,) and sys.platform != 'win32' and
316                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
317                 not params.get('restrictfilenames', False)):
318             # On Python 3, the Unicode filesystem API will throw errors (#1474)
319             self.report_warning(
320                 'Assuming --restrict-filenames since file system encoding '
321                 'cannot encode all characters. '
322                 'Set the LC_ALL environment variable to fix this.')
323             self.params['restrictfilenames'] = True
324
325         if '%(stitle)s' in self.params.get('outtmpl', ''):
326             self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
327
328         self._setup_opener()
329
330         if auto_init:
331             self.print_debug_header()
332             self.add_default_info_extractors()
333
334         for pp_def_raw in self.params.get('postprocessors', []):
335             pp_class = get_postprocessor(pp_def_raw['key'])
336             pp_def = dict(pp_def_raw)
337             del pp_def['key']
338             pp = pp_class(self, **compat_kwargs(pp_def))
339             self.add_post_processor(pp)
340
341         for ph in self.params.get('progress_hooks', []):
342             self.add_progress_hook(ph)
343
344     def warn_if_short_id(self, argv):
345         # short YouTube ID starting with dash?
346         idxs = [
347             i for i, a in enumerate(argv)
348             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
349         if idxs:
350             correct_argv = (
351                 ['youtube-dl'] +
352                 [a for i, a in enumerate(argv) if i not in idxs] +
353                 ['--'] + [argv[i] for i in idxs]
354             )
355             self.report_warning(
356                 'Long argument string detected. '
357                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
358                 args_to_str(correct_argv))
359
360     def add_info_extractor(self, ie):
361         """Add an InfoExtractor object to the end of the list."""
362         self._ies.append(ie)
363         self._ies_instances[ie.ie_key()] = ie
364         ie.set_downloader(self)
365
366     def get_info_extractor(self, ie_key):
367         """
368         Get an instance of an IE with name ie_key, it will try to get one from
369         the _ies list, if there's no instance it will create a new one and add
370         it to the extractor list.
371         """
372         ie = self._ies_instances.get(ie_key)
373         if ie is None:
374             ie = get_info_extractor(ie_key)()
375             self.add_info_extractor(ie)
376         return ie
377
378     def add_default_info_extractors(self):
379         """
380         Add the InfoExtractors returned by gen_extractors to the end of the list
381         """
382         for ie in gen_extractors():
383             self.add_info_extractor(ie)
384
385     def add_post_processor(self, pp):
386         """Add a PostProcessor object to the end of the chain."""
387         self._pps.append(pp)
388         pp.set_downloader(self)
389
390     def add_progress_hook(self, ph):
391         """Add the progress hook (currently only for the file downloader)"""
392         self._progress_hooks.append(ph)
393
394     def _bidi_workaround(self, message):
395         if not hasattr(self, '_output_channel'):
396             return message
397
398         assert hasattr(self, '_output_process')
399         assert isinstance(message, compat_str)
400         line_count = message.count('\n') + 1
401         self._output_process.stdin.write((message + '\n').encode('utf-8'))
402         self._output_process.stdin.flush()
403         res = ''.join(self._output_channel.readline().decode('utf-8')
404                       for _ in range(line_count))
405         return res[:-len('\n')]
406
407     def to_screen(self, message, skip_eol=False):
408         """Print message to stdout if not in quiet mode."""
409         return self.to_stdout(message, skip_eol, check_quiet=True)
410
411     def _write_string(self, s, out=None):
412         write_string(s, out=out, encoding=self.params.get('encoding'))
413
414     def to_stdout(self, message, skip_eol=False, check_quiet=False):
415         """Print message to stdout if not in quiet mode."""
416         if self.params.get('logger'):
417             self.params['logger'].debug(message)
418         elif not check_quiet or not self.params.get('quiet', False):
419             message = self._bidi_workaround(message)
420             terminator = ['\n', ''][skip_eol]
421             output = message + terminator
422
423             self._write_string(output, self._screen_file)
424
425     def to_stderr(self, message):
426         """Print message to stderr."""
427         assert isinstance(message, compat_str)
428         if self.params.get('logger'):
429             self.params['logger'].error(message)
430         else:
431             message = self._bidi_workaround(message)
432             output = message + '\n'
433             self._write_string(output, self._err_file)
434
435     def to_console_title(self, message):
436         if not self.params.get('consoletitle', False):
437             return
438         if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
439             # c_wchar_p() might not be necessary if `message` is
440             # already of type unicode()
441             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
442         elif 'TERM' in os.environ:
443             self._write_string('\033]0;%s\007' % message, self._screen_file)
444
445     def save_console_title(self):
446         if not self.params.get('consoletitle', False):
447             return
448         if 'TERM' in os.environ:
449             # Save the title on stack
450             self._write_string('\033[22;0t', self._screen_file)
451
452     def restore_console_title(self):
453         if not self.params.get('consoletitle', False):
454             return
455         if 'TERM' in os.environ:
456             # Restore the title from stack
457             self._write_string('\033[23;0t', self._screen_file)
458
459     def __enter__(self):
460         self.save_console_title()
461         return self
462
463     def __exit__(self, *args):
464         self.restore_console_title()
465
466         if self.params.get('cookiefile') is not None:
467             self.cookiejar.save()
468
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Prefer the traceback of the wrapped exception when the
                    # active exception carries its own exc_info (e.g. a
                    # DownloadError raised below).
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += compat_str(traceback.format_exc())
                else:
                    # Not inside an exception handler: report the current stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, preserving the original exc_info when
            # the active exception wraps one.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # Errors are being ignored: record the failure in the exit code.
        self._download_retcode = 1
498
499     def report_warning(self, message):
500         '''
501         Print the message to stderr, it will be prefixed with 'WARNING:'
502         If stderr is a tty file the 'WARNING:' will be colored
503         '''
504         if self.params.get('logger') is not None:
505             self.params['logger'].warning(message)
506         else:
507             if self.params.get('no_warnings'):
508                 return
509             if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
510                 _msg_header = '\033[0;33mWARNING:\033[0m'
511             else:
512                 _msg_header = 'WARNING:'
513             warning_message = '%s %s' % (_msg_header, message)
514             self.to_stderr(warning_message)
515
516     def report_error(self, message, tb=None):
517         '''
518         Do the same as trouble, but prefixes the message with 'ERROR:', colored
519         in red if stderr is a tty file.
520         '''
521         if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
522             _msg_header = '\033[0;31mERROR:\033[0m'
523         else:
524             _msg_header = 'ERROR:'
525         error_message = '%s %s' % (_msg_header, message)
526         self.trouble(error_message, tb)
527
528     def report_file_already_downloaded(self, file_name):
529         """Report file has already been fully downloaded."""
530         try:
531             self.to_screen('[download] %s has already been downloaded' % file_name)
532         except UnicodeEncodeError:
533             self.to_screen('[download] The file has already been downloaded')
534
535     def prepare_filename(self, info_dict):
536         """Generate the output filename."""
537         try:
538             template_dict = dict(info_dict)
539
540             template_dict['epoch'] = int(time.time())
541             autonumber_size = self.params.get('autonumber_size')
542             if autonumber_size is None:
543                 autonumber_size = 5
544             autonumber_templ = '%0' + str(autonumber_size) + 'd'
545             template_dict['autonumber'] = autonumber_templ % self._num_downloads
546             if template_dict.get('playlist_index') is not None:
547                 template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
548             if template_dict.get('resolution') is None:
549                 if template_dict.get('width') and template_dict.get('height'):
550                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
551                 elif template_dict.get('height'):
552                     template_dict['resolution'] = '%sp' % template_dict['height']
553                 elif template_dict.get('width'):
554                     template_dict['resolution'] = '?x%d' % template_dict['width']
555
556             restrict_filenames = self.params.get('restrictfilenames')
557
558             sanitize = lambda k, v: sanitize_filename(
559                 compat_str(v),
560                 restricted=restrict_filenames,
561                 is_id=(k == 'id'))
562             template_dict = dict((k, sanitize(k, v))
563                                  for k, v in template_dict.items()
564                                  if v is not None)
565             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
566
567             outtmpl = sanitize_filename(
568                 self.params.get('outtmpl', DEFAULT_OUTTMPL),
569                 restricted=restrict_filenames)
570             tmpl = compat_expanduser(outtmpl)
571             filename = tmpl % template_dict
572             # Temporary fix for #4787
573             # 'Treat' all problem characters by passing filename through preferredencoding
574             # to workaround encoding issues with subprocess on python2 @ Windows
575             if sys.version_info < (3, 0) and sys.platform == 'win32':
576                 filename = encodeFilename(filename, True).decode(preferredencoding())
577             return filename
578         except ValueError as err:
579             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
580             return None
581
582     def _match_entry(self, info_dict, incomplete):
583         """ Returns None iff the file should be downloaded """
584
585         video_title = info_dict.get('title', info_dict.get('id', 'video'))
586         if 'title' in info_dict:
587             # This can happen when we're just evaluating the playlist
588             title = info_dict['title']
589             matchtitle = self.params.get('matchtitle', False)
590             if matchtitle:
591                 if not re.search(matchtitle, title, re.IGNORECASE):
592                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
593             rejecttitle = self.params.get('rejecttitle', False)
594             if rejecttitle:
595                 if re.search(rejecttitle, title, re.IGNORECASE):
596                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
597         date = info_dict.get('upload_date', None)
598         if date is not None:
599             dateRange = self.params.get('daterange', DateRange())
600             if date not in dateRange:
601                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
602         view_count = info_dict.get('view_count', None)
603         if view_count is not None:
604             min_views = self.params.get('min_views')
605             if min_views is not None and view_count < min_views:
606                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
607             max_views = self.params.get('max_views')
608             if max_views is not None and view_count > max_views:
609                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
610         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
611             return 'Skipping "%s" because it is age restricted' % video_title
612         if self.in_download_archive(info_dict):
613             return '%s has already been recorded in archive' % video_title
614
615         if not incomplete:
616             match_filter = self.params.get('match_filter')
617             if match_filter is not None:
618                 ret = match_filter(info_dict)
619                 if ret is not None:
620                     return ret
621
622         return None
623
624     @staticmethod
625     def add_extra_info(info_dict, extra_info):
626         '''Set the keys from extra_info in info dict if they are missing'''
627         for key, value in extra_info.items():
628             info_dict.setdefault(key, value)
629
630     def extract_info(self, url, download=True, ie_key=None, extra_info={},
631                      process=True):
632         '''
633         Returns a list with a dictionary for each video we find.
634         If 'download', also downloads the videos.
635         extra_info is a dict containing the extra values to add to each result
636          '''
637
638         if ie_key:
639             ies = [self.get_info_extractor(ie_key)]
640         else:
641             ies = self._ies
642
643         for ie in ies:
644             if not ie.suitable(url):
645                 continue
646
647             if not ie.working():
648                 self.report_warning('The program functionality for this site has been marked as broken, '
649                                     'and will probably not work.')
650
651             try:
652                 ie_result = ie.extract(url)
653                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
654                     break
655                 if isinstance(ie_result, list):
656                     # Backwards compatibility: old IE result format
657                     ie_result = {
658                         '_type': 'compat_list',
659                         'entries': ie_result,
660                     }
661                 self.add_default_extra_info(ie_result, ie, url)
662                 if process:
663                     return self.process_ie_result(ie_result, download, extra_info)
664                 else:
665                     return ie_result
666             except ExtractorError as de:  # An error we somewhat expected
667                 self.report_error(compat_str(de), de.format_traceback())
668                 break
669             except MaxDownloadsReached:
670                 raise
671             except Exception as e:
672                 if self.params.get('ignoreerrors', False):
673                     self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
674                     break
675                 else:
676                     raise
677         else:
678             self.report_error('no suitable InfoExtractor for URL %s' % url)
679
680     def add_default_extra_info(self, ie_result, ie, url):
681         self.add_extra_info(ie_result, {
682             'extractor': ie.IE_NAME,
683             'webpage_url': url,
684             'webpage_url_basename': url_basename(url),
685             'extractor_key': ie.ie_key(),
686         })
687
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie (may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        # NOTE: extra_info has a mutable default, but it is only read and
        # passed on here, never mutated, so the shared default is benign.

        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            # With extract_flat, URL results inside a playlist (or any URL
            # result when extract_flat is True) are returned unresolved,
            # optionally dumped as JSON.
            extract_flat = self.params.get('extract_flat', False)
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # Non-None fields set by the wrapping extractor override the
            # embedded result, except '_type' and 'url' which must come from
            # the resolved target.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            assert new_result.get('_type') != 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist' or result_type == 'multi_video':
            # We process each entry in the playlist
            playlist = ie_result.get('title', None) or ie_result.get('id', None)
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # Convert the 1-based --playlist-start option to a 0-based slice index.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend', None)
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items', None)
            playlistitems = None
            if playlistitems_str is not None:
                # --playlist-items: comma-separated 1-based indices and
                # 'start-end' ranges, e.g. '1,3,5-7'.  This is a generator and
                # is consumed at most once by exactly one branch below.
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            # Entries may be a concrete list, a lazily-fetched PagedList, or
            # any other iterable; each case selects the requested subset.
            ie_entries = ie_result['entries']
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = [ie_entries[i - 1] for i in playlistitems]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Downloading %d videos" %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                if playlistitems:
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Downloading %d videos" %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # NOTE(review): with --playlist-items, playlist_index below is
                # position-in-selection + playliststart, which may not equal
                # the entry's original index in the playlist — confirm intended.
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # Skip entries rejected by the configured filters.
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            # Backfill the standard metadata on each legacy entry, then
            # resolve the entries recursively.
            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
849
850     def _apply_format_filter(self, format_spec, available_formats):
851         " Returns a tuple of the remaining format_spec and filtered formats "
852
853         OPERATORS = {
854             '<': operator.lt,
855             '<=': operator.le,
856             '>': operator.gt,
857             '>=': operator.ge,
858             '=': operator.eq,
859             '!=': operator.ne,
860         }
861         operator_rex = re.compile(r'''(?x)\s*\[
862             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
863             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
864             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
865             \]$
866             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
867         m = operator_rex.search(format_spec)
868         if m:
869             try:
870                 comparison_value = int(m.group('value'))
871             except ValueError:
872                 comparison_value = parse_filesize(m.group('value'))
873                 if comparison_value is None:
874                     comparison_value = parse_filesize(m.group('value') + 'B')
875                 if comparison_value is None:
876                     raise ValueError(
877                         'Invalid value %r in format specification %r' % (
878                             m.group('value'), format_spec))
879             op = OPERATORS[m.group('op')]
880
881         if not m:
882             STR_OPERATORS = {
883                 '=': operator.eq,
884                 '!=': operator.ne,
885             }
886             str_operator_rex = re.compile(r'''(?x)\s*\[
887                 \s*(?P<key>ext|acodec|vcodec|container|protocol)
888                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
889                 \s*(?P<value>[a-zA-Z0-9_-]+)
890                 \s*\]$
891                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
892             m = str_operator_rex.search(format_spec)
893             if m:
894                 comparison_value = m.group('value')
895                 op = STR_OPERATORS[m.group('op')]
896
897         if not m:
898             raise ValueError('Invalid format specification %r' % format_spec)
899
900         def _filter(f):
901             actual_value = f.get(m.group('key'))
902             if actual_value is None:
903                 return m.group('none_inclusive')
904             return op(actual_value, comparison_value)
905         new_formats = [f for f in available_formats if _filter(f)]
906
907         new_format_spec = format_spec[:-len(m.group(0))]
908         if not new_format_spec:
909             new_format_spec = 'best'
910
911         return (new_format_spec, new_formats)
912
913     def select_format(self, format_spec, available_formats):
914         while format_spec.endswith(']'):
915             format_spec, available_formats = self._apply_format_filter(
916                 format_spec, available_formats)
917         if not available_formats:
918             return None
919
920         if format_spec == 'best' or format_spec is None:
921             return available_formats[-1]
922         elif format_spec == 'worst':
923             return available_formats[0]
924         elif format_spec == 'bestaudio':
925             audio_formats = [
926                 f for f in available_formats
927                 if f.get('vcodec') == 'none']
928             if audio_formats:
929                 return audio_formats[-1]
930         elif format_spec == 'worstaudio':
931             audio_formats = [
932                 f for f in available_formats
933                 if f.get('vcodec') == 'none']
934             if audio_formats:
935                 return audio_formats[0]
936         elif format_spec == 'bestvideo':
937             video_formats = [
938                 f for f in available_formats
939                 if f.get('acodec') == 'none']
940             if video_formats:
941                 return video_formats[-1]
942         elif format_spec == 'worstvideo':
943             video_formats = [
944                 f for f in available_formats
945                 if f.get('acodec') == 'none']
946             if video_formats:
947                 return video_formats[0]
948         else:
949             extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
950             if format_spec in extensions:
951                 filter_f = lambda f: f['ext'] == format_spec
952             else:
953                 filter_f = lambda f: f['format_id'] == format_spec
954             matches = list(filter(filter_f, available_formats))
955             if matches:
956                 return matches[-1]
957         return None
958
959     def _calc_headers(self, info_dict):
960         res = std_headers.copy()
961
962         add_headers = info_dict.get('http_headers')
963         if add_headers:
964             res.update(add_headers)
965
966         cookies = self._calc_cookies(info_dict)
967         if cookies:
968             res['Cookie'] = cookies
969
970         return res
971
    def _calc_cookies(self, info_dict):
        # Build a throwaway request for the media URL and let the cookiejar
        # add its Cookie header, so downloads send the same cookies the
        # jar holds for that URL; returns None when no cookie matches.
        pr = compat_urllib_request.Request(info_dict['url'])
        self.cookiejar.add_cookie_header(pr)
        return pr.get_header('Cookie')
976
    def process_video_result(self, info_dict, download=True):
        """Normalize a single video result (thumbnails, dates, subtitles),
        select the requested format(s) and, if 'download', pass each chosen
        format to process_info().  Returns the (mutated) info_dict."""
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Promote a lone 'thumbnail' entry to the 'thumbnails' list form.
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Sorted ascending, so [-1] below is the preferred thumbnail.
            thumbnails.sort(key=lambda t: (
                t.get('preference'), t.get('width'), t.get('height'),
                t.get('id'), t.get('url')))
            for i, t in enumerate(thumbnails):
                if 'width' in t and 'height' in t:
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if thumbnails and 'thumbnail' not in info_dict:
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around negative timestamps in Windows
            # (see http://bugs.python.org/issue1646728)
            if info_dict['timestamp'] < 0 and os.name == 'nt':
                info_dict['timestamp'] = 0
            upload_date = datetime.datetime.utcfromtimestamp(
                info_dict['timestamp'])
            info_dict['upload_date'] = upload_date.strftime('%Y%m%d')

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], info_dict.get('subtitles'),
            info_dict.get('automatic_captions'))

        # These extractors handle format selection themselves
        if info_dict['extractor'] in ['Youku']:
            if download:
                self.process_info(info_dict)
            return info_dict

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            if 'url' not in format:
                raise ExtractorError('Missing "url" key in result (index %d)' % i)

            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if 'ext' not in format:
                format['ext'] = determine_ext(format['url']).lower()
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)

        # Truncate the list at format_limit (takewhile_inclusive presumably
        # keeps the matching entry as well -- hence the name).
        format_limit = self.params.get('format_limit', None)
        if format_limit:
            formats = list(takewhile_inclusive(
                lambda f: f['format_id'] != format_limit, formats
            ))

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' field if the original info_dict listed them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return
        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        # Format selection: comma-separated groups, each a '/'-separated
        # preference list; 'A+B' requests a video format merged with an
        # audio format.
        req_format = self.params.get('format')
        if req_format is None:
            req_format = 'best'
        formats_to_download = []
        # The -1 is for supporting YoutubeIE
        if req_format in ('-1', 'all'):
            formats_to_download = formats
        else:
            for rfstr in req_format.split(','):
                # We can accept formats requested in the format: 34/5/best, we pick
                # the first that is available, starting from left
                req_formats = rfstr.split('/')
                for rf in req_formats:
                    if re.match(r'.+?\+.+?', rf) is not None:
                        # Two formats have been requested like '137+139'
                        format_1, format_2 = rf.split('+')
                        formats_info = (self.select_format(format_1, formats),
                                        self.select_format(format_2, formats))
                        if all(formats_info):
                            # The first format must contain the video and the
                            # second the audio
                            if formats_info[0].get('vcodec') == 'none':
                                self.report_error('The first format must '
                                                  'contain the video, try using '
                                                  '"-f %s+%s"' % (format_2, format_1))
                                return
                            # Merged container defaults to the video format's
                            # ext unless --merge-output-format overrides it.
                            output_ext = (
                                formats_info[0]['ext']
                                if self.params.get('merge_output_format') is None
                                else self.params['merge_output_format'])
                            # Video properties come from the first format,
                            # audio properties from the second.
                            selected_format = {
                                'requested_formats': formats_info,
                                'format': '%s+%s' % (formats_info[0].get('format'),
                                                     formats_info[1].get('format')),
                                'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                        formats_info[1].get('format_id')),
                                'width': formats_info[0].get('width'),
                                'height': formats_info[0].get('height'),
                                'resolution': formats_info[0].get('resolution'),
                                'fps': formats_info[0].get('fps'),
                                'vcodec': formats_info[0].get('vcodec'),
                                'vbr': formats_info[0].get('vbr'),
                                'stretched_ratio': formats_info[0].get('stretched_ratio'),
                                'acodec': formats_info[1].get('acodec'),
                                'abr': formats_info[1].get('abr'),
                                'ext': output_ext,
                            }
                        else:
                            selected_format = None
                    else:
                        selected_format = self.select_format(rf, formats)
                    if selected_format is not None:
                        formats_to_download.append(selected_format)
                        break
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1156
    def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
        """Select the requested subtitles and their format.

        Returns a dict mapping language -> chosen subtitle-format dict, or
        None when subtitles were not requested or none are available.
        """
        available_subs = {}
        if normal_subtitles and self.params.get('writesubtitles'):
            available_subs.update(normal_subtitles)
        if automatic_captions and self.params.get('writeautomaticsub'):
            # Automatic captions never shadow real subtitles for a language.
            for lang, cap_info in automatic_captions.items():
                if lang not in available_subs:
                    available_subs[lang] = cap_info

        # Precedence note: this reads as
        # (neither subtitle kind requested) OR (nothing available) -> None.
        if (not self.params.get('writesubtitles') and not
                self.params.get('writeautomaticsub') or not
                available_subs):
            return None

        if self.params.get('allsubtitles', False):
            requested_langs = available_subs.keys()
        else:
            if self.params.get('subtitleslangs', False):
                requested_langs = self.params.get('subtitleslangs')
            elif 'en' in available_subs:
                requested_langs = ['en']
            else:
                # Fall back to an arbitrary available language.
                requested_langs = [list(available_subs.keys())[0]]

        formats_query = self.params.get('subtitlesformat', 'best')
        formats_preference = formats_query.split('/') if formats_query else []
        subs = {}
        for lang in requested_langs:
            formats = available_subs.get(lang)
            if formats is None:
                self.report_warning('%s subtitles not available for %s' % (lang, video_id))
                continue
            for ext in formats_preference:
                if ext == 'best':
                    f = formats[-1]
                    break
                matches = list(filter(lambda f: f['ext'] == ext, formats))
                if matches:
                    f = matches[-1]
                    break
            else:
                # for/else: no preferred ext matched -- use the last format.
                f = formats[-1]
                self.report_warning(
                    'No subtitle format found matching "%s" for language %s, '
                    'using %s' % (formats_query, lang, f['ext']))
            subs[lang] = f
        return subs
1205
1206     def process_info(self, info_dict):
1207         """Process a single resolved IE result."""
1208
1209         assert info_dict.get('_type', 'video') == 'video'
1210
1211         max_downloads = self.params.get('max_downloads')
1212         if max_downloads is not None:
1213             if self._num_downloads >= int(max_downloads):
1214                 raise MaxDownloadsReached()
1215
1216         info_dict['fulltitle'] = info_dict['title']
1217         if len(info_dict['title']) > 200:
1218             info_dict['title'] = info_dict['title'][:197] + '...'
1219
1220         # Keep for backwards compatibility
1221         info_dict['stitle'] = info_dict['title']
1222
1223         if 'format' not in info_dict:
1224             info_dict['format'] = info_dict['ext']
1225
1226         reason = self._match_entry(info_dict, incomplete=False)
1227         if reason is not None:
1228             self.to_screen('[download] ' + reason)
1229             return
1230
1231         self._num_downloads += 1
1232
1233         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1234
1235         # Forced printings
1236         if self.params.get('forcetitle', False):
1237             self.to_stdout(info_dict['fulltitle'])
1238         if self.params.get('forceid', False):
1239             self.to_stdout(info_dict['id'])
1240         if self.params.get('forceurl', False):
1241             if info_dict.get('requested_formats') is not None:
1242                 for f in info_dict['requested_formats']:
1243                     self.to_stdout(f['url'] + f.get('play_path', ''))
1244             else:
1245                 # For RTMP URLs, also include the playpath
1246                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1247         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1248             self.to_stdout(info_dict['thumbnail'])
1249         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1250             self.to_stdout(info_dict['description'])
1251         if self.params.get('forcefilename', False) and filename is not None:
1252             self.to_stdout(filename)
1253         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1254             self.to_stdout(formatSeconds(info_dict['duration']))
1255         if self.params.get('forceformat', False):
1256             self.to_stdout(info_dict['format'])
1257         if self.params.get('forcejson', False):
1258             self.to_stdout(json.dumps(info_dict))
1259
1260         # Do nothing else if in simulate mode
1261         if self.params.get('simulate', False):
1262             return
1263
1264         if filename is None:
1265             return
1266
1267         try:
1268             dn = os.path.dirname(encodeFilename(filename))
1269             if dn and not os.path.exists(dn):
1270                 os.makedirs(dn)
1271         except (OSError, IOError) as err:
1272             self.report_error('unable to create directory ' + compat_str(err))
1273             return
1274
1275         if self.params.get('writedescription', False):
1276             descfn = filename + '.description'
1277             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1278                 self.to_screen('[info] Video description is already present')
1279             elif info_dict.get('description') is None:
1280                 self.report_warning('There\'s no description to write.')
1281             else:
1282                 try:
1283                     self.to_screen('[info] Writing video description to: ' + descfn)
1284                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1285                         descfile.write(info_dict['description'])
1286                 except (OSError, IOError):
1287                     self.report_error('Cannot write description file ' + descfn)
1288                     return
1289
1290         if self.params.get('writeannotations', False):
1291             annofn = filename + '.annotations.xml'
1292             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1293                 self.to_screen('[info] Video annotations are already present')
1294             else:
1295                 try:
1296                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1297                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1298                         annofile.write(info_dict['annotations'])
1299                 except (KeyError, TypeError):
1300                     self.report_warning('There are no annotations to write.')
1301                 except (OSError, IOError):
1302                     self.report_error('Cannot write annotations file: ' + annofn)
1303                     return
1304
1305         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1306                                        self.params.get('writeautomaticsub')])
1307
1308         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1309             # subtitles download errors are already managed as troubles in relevant IE
1310             # that way it will silently go on when used with unsupporting IE
1311             subtitles = info_dict['requested_subtitles']
1312             ie = self.get_info_extractor(info_dict['extractor_key'])
1313             for sub_lang, sub_info in subtitles.items():
1314                 sub_format = sub_info['ext']
1315                 if sub_info.get('data') is not None:
1316                     sub_data = sub_info['data']
1317                 else:
1318                     try:
1319                         sub_data = ie._download_webpage(
1320                             sub_info['url'], info_dict['id'], note=False)
1321                     except ExtractorError as err:
1322                         self.report_warning('Unable to download subtitle for "%s": %s' %
1323                                             (sub_lang, compat_str(err.cause)))
1324                         continue
1325                 try:
1326                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1327                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1328                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1329                     else:
1330                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1331                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
1332                             subfile.write(sub_data)
1333                 except (OSError, IOError):
1334                     self.report_error('Cannot write subtitles file ' + sub_filename)
1335                     return
1336
1337         if self.params.get('writeinfojson', False):
1338             infofn = os.path.splitext(filename)[0] + '.info.json'
1339             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1340                 self.to_screen('[info] Video description metadata is already present')
1341             else:
1342                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1343                 try:
1344                     write_json_file(info_dict, infofn)
1345                 except (OSError, IOError):
1346                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1347                     return
1348
1349         self._write_thumbnails(info_dict, filename)
1350
1351         if not self.params.get('skip_download', False):
1352             try:
1353                 def dl(name, info):
1354                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1355                     for ph in self._progress_hooks:
1356                         fd.add_progress_hook(ph)
1357                     if self.params.get('verbose'):
1358                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1359                     return fd.download(name, info)
1360
1361                 if info_dict.get('requested_formats') is not None:
1362                     downloaded = []
1363                     success = True
1364                     merger = FFmpegMergerPP(self, not self.params.get('keepvideo'))
1365                     if not merger.available:
1366                         postprocessors = []
1367                         self.report_warning('You have requested multiple '
1368                                             'formats but ffmpeg or avconv are not installed.'
1369                                             ' The formats won\'t be merged')
1370                     else:
1371                         postprocessors = [merger]
1372                     for f in info_dict['requested_formats']:
1373                         new_info = dict(info_dict)
1374                         new_info.update(f)
1375                         fname = self.prepare_filename(new_info)
1376                         fname = prepend_extension(fname, 'f%s' % f['format_id'])
1377                         downloaded.append(fname)
1378                         partial_success = dl(fname, new_info)
1379                         success = success and partial_success
1380                     info_dict['__postprocessors'] = postprocessors
1381                     info_dict['__files_to_merge'] = downloaded
1382                 else:
1383                     # Just a single file
1384                     success = dl(filename, info_dict)
1385             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1386                 self.report_error('unable to download video data: %s' % str(err))
1387                 return
1388             except (OSError, IOError) as err:
1389                 raise UnavailableVideoError(err)
1390             except (ContentTooShortError, ) as err:
1391                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1392                 return
1393
1394             if success:
1395                 # Fixup content
1396                 fixup_policy = self.params.get('fixup')
1397                 if fixup_policy is None:
1398                     fixup_policy = 'detect_or_warn'
1399
1400                 stretched_ratio = info_dict.get('stretched_ratio')
1401                 if stretched_ratio is not None and stretched_ratio != 1:
1402                     if fixup_policy == 'warn':
1403                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1404                             info_dict['id'], stretched_ratio))
1405                     elif fixup_policy == 'detect_or_warn':
1406                         stretched_pp = FFmpegFixupStretchedPP(self)
1407                         if stretched_pp.available:
1408                             info_dict.setdefault('__postprocessors', [])
1409                             info_dict['__postprocessors'].append(stretched_pp)
1410                         else:
1411                             self.report_warning(
1412                                 '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
1413                                     info_dict['id'], stretched_ratio))
1414                     else:
1415                         assert fixup_policy in ('ignore', 'never')
1416
1417                 if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
1418                     if fixup_policy == 'warn':
1419                         self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
1420                             info_dict['id']))
1421                     elif fixup_policy == 'detect_or_warn':
1422                         fixup_pp = FFmpegFixupM4aPP(self)
1423                         if fixup_pp.available:
1424                             info_dict.setdefault('__postprocessors', [])
1425                             info_dict['__postprocessors'].append(fixup_pp)
1426                         else:
1427                             self.report_warning(
1428                                 '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
1429                                     info_dict['id']))
1430                     else:
1431                         assert fixup_policy in ('ignore', 'never')
1432
1433                 try:
1434                     self.post_process(filename, info_dict)
1435                 except (PostProcessingError) as err:
1436                     self.report_error('postprocessing: %s' % str(err))
1437                     return
1438                 self.record_download_archive(info_dict)
1439
1440     def download(self, url_list):
1441         """Download a given list of URLs."""
1442         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1443         if (len(url_list) > 1 and
1444                 '%' not in outtmpl and
1445                 self.params.get('max_downloads') != 1):
1446             raise SameFileError(outtmpl)
1447
1448         for url in url_list:
1449             try:
1450                 # It also downloads the videos
1451                 res = self.extract_info(url)
1452             except UnavailableVideoError:
1453                 self.report_error('unable to download video')
1454             except MaxDownloadsReached:
1455                 self.to_screen('[info] Maximum number of downloaded files reached.')
1456                 raise
1457             else:
1458                 if self.params.get('dump_single_json', False):
1459                     self.to_stdout(json.dumps(res))
1460
1461         return self._download_retcode
1462
1463     def download_with_info_file(self, info_filename):
1464         with contextlib.closing(fileinput.FileInput(
1465                 [info_filename], mode='r',
1466                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1467             # FileInput doesn't have a read method, we can't call json.load
1468             info = json.loads('\n'.join(f))
1469         try:
1470             self.process_ie_result(info, download=True)
1471         except DownloadError:
1472             webpage_url = info.get('webpage_url')
1473             if webpage_url is not None:
1474                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1475                 return self.download([webpage_url])
1476             else:
1477                 raise
1478         return self._download_retcode
1479
    def post_process(self, filename, ie_info):
        """Run all the postprocessors on the given file."""
        # Work on a shallow copy so the caller's dict is not rebound
        # (postprocessors may still mutate shared nested values).
        info = dict(ie_info)
        info['filepath'] = filename
        pps_chain = []
        # Per-download postprocessors (e.g. merge/fixup helpers attached via
        # '__postprocessors') run before the globally registered self._pps.
        if ie_info.get('__postprocessors') is not None:
            pps_chain.extend(ie_info['__postprocessors'])
        pps_chain.extend(self._pps)
        for pp in pps_chain:
            # Tri-state per-PP decision about the original file:
            # None = no opinion, truthy = keep, False = eligible for deletion.
            keep_video = None
            old_filename = info['filepath']
            try:
                keep_video_wish, info = pp.run(info)
                if keep_video_wish is not None:
                    if keep_video_wish:
                        keep_video = keep_video_wish
                    elif keep_video is None:
                        # No clear decision yet, let IE decide
                        keep_video = keep_video_wish
            except PostProcessingError as e:
                self.report_error(e.msg)
            # Only delete when a PP explicitly said False and -k wasn't given.
            if keep_video is False and not self.params.get('keepvideo', False):
                try:
                    self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
                    os.remove(encodeFilename(old_filename))
                except (IOError, OSError):
                    self.report_warning('Unable to remove downloaded video file')
1507
1508     def _make_archive_id(self, info_dict):
1509         # Future-proof against any change in case
1510         # and backwards compatibility with prior versions
1511         extractor = info_dict.get('extractor_key')
1512         if extractor is None:
1513             if 'id' in info_dict:
1514                 extractor = info_dict.get('ie_key')  # key in a playlist
1515         if extractor is None:
1516             return None  # Incomplete video information
1517         return extractor.lower() + ' ' + info_dict['id']
1518
1519     def in_download_archive(self, info_dict):
1520         fn = self.params.get('download_archive')
1521         if fn is None:
1522             return False
1523
1524         vid_id = self._make_archive_id(info_dict)
1525         if vid_id is None:
1526             return False  # Incomplete video information
1527
1528         try:
1529             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1530                 for line in archive_file:
1531                     if line.strip() == vid_id:
1532                         return True
1533         except IOError as ioe:
1534             if ioe.errno != errno.ENOENT:
1535                 raise
1536         return False
1537
1538     def record_download_archive(self, info_dict):
1539         fn = self.params.get('download_archive')
1540         if fn is None:
1541             return
1542         vid_id = self._make_archive_id(info_dict)
1543         assert vid_id
1544         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1545             archive_file.write(vid_id + '\n')
1546
1547     @staticmethod
1548     def format_resolution(format, default='unknown'):
1549         if format.get('vcodec') == 'none':
1550             return 'audio only'
1551         if format.get('resolution') is not None:
1552             return format['resolution']
1553         if format.get('height') is not None:
1554             if format.get('width') is not None:
1555                 res = '%sx%s' % (format['width'], format['height'])
1556             else:
1557                 res = '%sp' % format['height']
1558         elif format.get('width') is not None:
1559             res = '?x%d' % format['width']
1560         else:
1561             res = default
1562         return res
1563
1564     def _format_note(self, fdict):
1565         res = ''
1566         if fdict.get('ext') in ['f4f', 'f4m']:
1567             res += '(unsupported) '
1568         if fdict.get('format_note') is not None:
1569             res += fdict['format_note'] + ' '
1570         if fdict.get('tbr') is not None:
1571             res += '%4dk ' % fdict['tbr']
1572         if fdict.get('container') is not None:
1573             if res:
1574                 res += ', '
1575             res += '%s container' % fdict['container']
1576         if (fdict.get('vcodec') is not None and
1577                 fdict.get('vcodec') != 'none'):
1578             if res:
1579                 res += ', '
1580             res += fdict['vcodec']
1581             if fdict.get('vbr') is not None:
1582                 res += '@'
1583         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1584             res += 'video@'
1585         if fdict.get('vbr') is not None:
1586             res += '%4dk' % fdict['vbr']
1587         if fdict.get('fps') is not None:
1588             res += ', %sfps' % fdict['fps']
1589         if fdict.get('acodec') is not None:
1590             if res:
1591                 res += ', '
1592             if fdict['acodec'] == 'none':
1593                 res += 'video only'
1594             else:
1595                 res += '%-5s' % fdict['acodec']
1596         elif fdict.get('abr') is not None:
1597             if res:
1598                 res += ', '
1599             res += 'audio'
1600         if fdict.get('abr') is not None:
1601             res += '@%3dk' % fdict['abr']
1602         if fdict.get('asr') is not None:
1603             res += ' (%5dHz)' % fdict['asr']
1604         if fdict.get('filesize') is not None:
1605             if res:
1606                 res += ', '
1607             res += format_bytes(fdict['filesize'])
1608         elif fdict.get('filesize_approx') is not None:
1609             if res:
1610                 res += ', '
1611             res += '~' + format_bytes(fdict['filesize_approx'])
1612         return res
1613
1614     def list_formats(self, info_dict):
1615         formats = info_dict.get('formats', [info_dict])
1616         table = [
1617             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
1618             for f in formats
1619             if f.get('preference') is None or f['preference'] >= -1000]
1620         if len(formats) > 1:
1621             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
1622
1623         header_line = ['format code', 'extension', 'resolution', 'note']
1624         self.to_screen(
1625             '[info] Available formats for %s:\n%s' %
1626             (info_dict['id'], render_table(header_line, table)))
1627
1628     def list_thumbnails(self, info_dict):
1629         thumbnails = info_dict.get('thumbnails')
1630         if not thumbnails:
1631             tn_url = info_dict.get('thumbnail')
1632             if tn_url:
1633                 thumbnails = [{'id': '0', 'url': tn_url}]
1634             else:
1635                 self.to_screen(
1636                     '[info] No thumbnails present for %s' % info_dict['id'])
1637                 return
1638
1639         self.to_screen(
1640             '[info] Thumbnails for %s:' % info_dict['id'])
1641         self.to_screen(render_table(
1642             ['ID', 'width', 'height', 'URL'],
1643             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1644
1645     def list_subtitles(self, video_id, subtitles, name='subtitles'):
1646         if not subtitles:
1647             self.to_screen('%s has no %s' % (video_id, name))
1648             return
1649         self.to_screen(
1650             'Available %s for %s:' % (name, video_id))
1651         self.to_screen(render_table(
1652             ['Language', 'formats'],
1653             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
1654                 for lang, formats in subtitles.items()]))
1655
1656     def urlopen(self, req):
1657         """ Start an HTTP download """
1658
1659         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1660         # always respected by websites, some tend to give out URLs with non percent-encoded
1661         # non-ASCII characters (see telemb.py, ard.py [#3412])
1662         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1663         # To work around aforementioned issue we will replace request's original URL with
1664         # percent-encoded one
1665         req_is_string = isinstance(req, compat_basestring)
1666         url = req if req_is_string else req.get_full_url()
1667         url_escaped = escape_url(url)
1668
1669         # Substitute URL if any change after escaping
1670         if url != url_escaped:
1671             if req_is_string:
1672                 req = url_escaped
1673             else:
1674                 req = compat_urllib_request.Request(
1675                     url_escaped, data=req.data, headers=req.headers,
1676                     origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1677
1678         return self._opener.open(req, timeout=self._socket_timeout)
1679
1680     def print_debug_header(self):
1681         if not self.params.get('verbose'):
1682             return
1683
1684         if type('') is not compat_str:
1685             # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
1686             self.report_warning(
1687                 'Your Python is broken! Update to a newer and supported version')
1688
1689         stdout_encoding = getattr(
1690             sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
1691         encoding_str = (
1692             '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
1693                 locale.getpreferredencoding(),
1694                 sys.getfilesystemencoding(),
1695                 stdout_encoding,
1696                 self.get_encoding()))
1697         write_string(encoding_str, encoding=None)
1698
1699         self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
1700         try:
1701             sp = subprocess.Popen(
1702                 ['git', 'rev-parse', '--short', 'HEAD'],
1703                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1704                 cwd=os.path.dirname(os.path.abspath(__file__)))
1705             out, err = sp.communicate()
1706             out = out.decode().strip()
1707             if re.match('[0-9a-f]+', out):
1708                 self._write_string('[debug] Git HEAD: ' + out + '\n')
1709         except:
1710             try:
1711                 sys.exc_clear()
1712             except:
1713                 pass
1714         self._write_string('[debug] Python version %s - %s\n' % (
1715             platform.python_version(), platform_name()))
1716
1717         exe_versions = FFmpegPostProcessor.get_versions(self)
1718         exe_versions['rtmpdump'] = rtmpdump_version()
1719         exe_str = ', '.join(
1720             '%s %s' % (exe, v)
1721             for exe, v in sorted(exe_versions.items())
1722             if v
1723         )
1724         if not exe_str:
1725             exe_str = 'none'
1726         self._write_string('[debug] exe versions: %s\n' % exe_str)
1727
1728         proxy_map = {}
1729         for handler in self._opener.handlers:
1730             if hasattr(handler, 'proxies'):
1731                 proxy_map.update(handler.proxies)
1732         self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1733
1734         if self.params.get('call_home', False):
1735             ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
1736             self._write_string('[debug] Public IP address: %s\n' % ipaddr)
1737             latest_version = self.urlopen(
1738                 'https://yt-dl.org/latest/version').read().decode('utf-8')
1739             if version_tuple(latest_version) > version_tuple(__version__):
1740                 self.report_warning(
1741                     'You are using an outdated version (newest version: %s)! '
1742                     'See https://yt-dl.org/update if you need help updating.' %
1743                     latest_version)
1744
1745     def _setup_opener(self):
1746         timeout_val = self.params.get('socket_timeout')
1747         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
1748
1749         opts_cookiefile = self.params.get('cookiefile')
1750         opts_proxy = self.params.get('proxy')
1751
1752         if opts_cookiefile is None:
1753             self.cookiejar = compat_cookiejar.CookieJar()
1754         else:
1755             self.cookiejar = compat_cookiejar.MozillaCookieJar(
1756                 opts_cookiefile)
1757             if os.access(opts_cookiefile, os.R_OK):
1758                 self.cookiejar.load()
1759
1760         cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1761             self.cookiejar)
1762         if opts_proxy is not None:
1763             if opts_proxy == '':
1764                 proxies = {}
1765             else:
1766                 proxies = {'http': opts_proxy, 'https': opts_proxy}
1767         else:
1768             proxies = compat_urllib_request.getproxies()
1769             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1770             if 'http' in proxies and 'https' not in proxies:
1771                 proxies['https'] = proxies['http']
1772         proxy_handler = PerRequestProxyHandler(proxies)
1773
1774         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1775         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
1776         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
1777         opener = compat_urllib_request.build_opener(
1778             proxy_handler, https_handler, cookie_processor, ydlh)
1779
1780         # Delete the default user-agent header, which would otherwise apply in
1781         # cases where our custom HTTP handler doesn't come into play
1782         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1783         opener.addheaders = []
1784         self._opener = opener
1785
1786     def encode(self, s):
1787         if isinstance(s, bytes):
1788             return s  # Already encoded
1789
1790         try:
1791             return s.encode(self.get_encoding())
1792         except UnicodeEncodeError as err:
1793             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
1794             raise
1795
1796     def get_encoding(self):
1797         encoding = self.params.get('encoding')
1798         if encoding is None:
1799             encoding = preferredencoding()
1800         return encoding
1801
1802     def _write_thumbnails(self, info_dict, filename):
1803         if self.params.get('writethumbnail', False):
1804             thumbnails = info_dict.get('thumbnails')
1805             if thumbnails:
1806                 thumbnails = [thumbnails[-1]]
1807         elif self.params.get('write_all_thumbnails', False):
1808             thumbnails = info_dict.get('thumbnails')
1809         else:
1810             return
1811
1812         if not thumbnails:
1813             # No thumbnails present, so return immediately
1814             return
1815
1816         for t in thumbnails:
1817             thumb_ext = determine_ext(t['url'], 'jpg')
1818             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
1819             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
1820             thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
1821
1822             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
1823                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
1824                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
1825             else:
1826                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
1827                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
1828                 try:
1829                     uf = self.urlopen(t['url'])
1830                     with open(thumb_filename, 'wb') as thumbf:
1831                         shutil.copyfileobj(uf, thumbf)
1832                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
1833                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
1834                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1835                     self.report_warning('Unable to download thumbnail "%s": %s' %
1836                                         (t['url'], compat_str(err)))