Add --max-sleep-interval (Closes #9930)
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27
28 from .compat import (
29     compat_basestring,
30     compat_cookiejar,
31     compat_expanduser,
32     compat_get_terminal_size,
33     compat_http_client,
34     compat_kwargs,
35     compat_os_name,
36     compat_str,
37     compat_tokenize_tokenize,
38     compat_urllib_error,
39     compat_urllib_request,
40     compat_urllib_request_DataHandler,
41 )
42 from .utils import (
43     age_restricted,
44     args_to_str,
45     ContentTooShortError,
46     date_from_str,
47     DateRange,
48     DEFAULT_OUTTMPL,
49     determine_ext,
50     determine_protocol,
51     DownloadError,
52     encode_compat_str,
53     encodeFilename,
54     error_to_compat_str,
55     ExtractorError,
56     format_bytes,
57     formatSeconds,
58     locked_file,
59     make_HTTPS_handler,
60     MaxDownloadsReached,
61     PagedList,
62     parse_filesize,
63     PerRequestProxyHandler,
64     platform_name,
65     PostProcessingError,
66     preferredencoding,
67     prepend_extension,
68     register_socks_protocols,
69     render_table,
70     replace_extension,
71     SameFileError,
72     sanitize_filename,
73     sanitize_path,
74     sanitize_url,
75     sanitized_Request,
76     std_headers,
77     subtitles_filename,
78     UnavailableVideoError,
79     url_basename,
80     version_tuple,
81     write_json_file,
82     write_string,
83     YoutubeDLCookieProcessor,
84     YoutubeDLHandler,
85 )
86 from .cache import Cache
87 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
88 from .downloader import get_suitable_downloader
89 from .downloader.rtmp import rtmpdump_version
90 from .postprocessor import (
91     FFmpegFixupM3u8PP,
92     FFmpegFixupM4aPP,
93     FFmpegFixupStretchedPP,
94     FFmpegMergerPP,
95     FFmpegPostProcessor,
96     get_postprocessor,
97 )
98 from .version import __version__
99
100 if compat_os_name == 'nt':
101     import ctypes
102
103
104 class YoutubeDL(object):
105     """YoutubeDL class.
106
107     YoutubeDL objects are the ones responsible of downloading the
108     actual video file and writing it to disk if the user has requested
109     it, among some other tasks. In most cases there should be one per
110     program. As, given a video URL, the downloader doesn't know how to
111     extract all the needed information, task that InfoExtractors do, it
112     has to pass the URL to one of them.
113
114     For this, YoutubeDL objects have a method that allows
115     InfoExtractors to be registered in a given order. When it is passed
116     a URL, the YoutubeDL object handles it to the first InfoExtractor it
117     finds that reports being able to handle it. The InfoExtractor extracts
118     all the information about the video or videos the URL refers to, and
119     YoutubeDL process the extracted information, possibly using a File
120     Downloader to download the video.
121
122     YoutubeDL objects accept a lot of parameters. In order not to saturate
123     the object constructor with arguments, it receives a dictionary of
124     options instead. These options are available through the params
125     attribute for the InfoExtractors to use. The YoutubeDL also
126     registers itself as the downloader in charge for the InfoExtractors
127     that are added to it, so this is a "mutual registration".
128
129     Available options:
130
131     username:          Username for authentication purposes.
132     password:          Password for authentication purposes.
133     videopassword:     Password for accessing a video.
134     usenetrc:          Use netrc for authentication instead.
135     verbose:           Print additional info to stdout.
136     quiet:             Do not print messages to stdout.
137     no_warnings:       Do not print out anything for warnings.
138     forceurl:          Force printing final URL.
139     forcetitle:        Force printing title.
140     forceid:           Force printing ID.
141     forcethumbnail:    Force printing thumbnail URL.
142     forcedescription:  Force printing description.
143     forcefilename:     Force printing final filename.
144     forceduration:     Force printing duration.
145     forcejson:         Force printing info_dict as JSON.
146     dump_single_json:  Force printing the info_dict of the whole playlist
147                        (or video) as a single JSON line.
148     simulate:          Do not download the video files.
149     format:            Video format code. See options.py for more information.
150     outtmpl:           Template for output names.
151     restrictfilenames: Do not allow "&" and spaces in file names
152     ignoreerrors:      Do not stop on download errors.
153     force_generic_extractor: Force downloader to use the generic extractor
154     nooverwrites:      Prevent overwriting files.
155     playliststart:     Playlist item to start at.
156     playlistend:       Playlist item to end at.
157     playlist_items:    Specific indices of playlist to download.
158     playlistreverse:   Download playlist items in reverse order.
159     matchtitle:        Download only matching titles.
160     rejecttitle:       Reject downloads for matching titles.
161     logger:            Log messages to a logging.Logger instance.
162     logtostderr:       Log messages to stderr instead of stdout.
163     writedescription:  Write the video description to a .description file
164     writeinfojson:     Write the video description to a .info.json file
165     writeannotations:  Write the video annotations to a .annotations.xml file
166     writethumbnail:    Write the thumbnail image to a file
167     write_all_thumbnails:  Write all thumbnail formats to files
168     writesubtitles:    Write the video subtitles to a file
169     writeautomaticsub: Write the automatically generated subtitles to a file
170     allsubtitles:      Downloads all the subtitles of the video
171                        (requires writesubtitles or writeautomaticsub)
172     listsubtitles:     Lists all available subtitles for the video
173     subtitlesformat:   The format code for subtitles
174     subtitleslangs:    List of languages of the subtitles to download
175     keepvideo:         Keep the video file after post-processing
176     daterange:         A DateRange object, download only if the upload_date is in the range.
177     skip_download:     Skip the actual download of the video file
178     cachedir:          Location of the cache files in the filesystem.
179                        False to disable filesystem cache.
180     noplaylist:        Download single video instead of a playlist if in doubt.
181     age_limit:         An integer representing the user's age in years.
182                        Unsuitable videos for the given age are skipped.
183     min_views:         An integer representing the minimum view count the video
184                        must have in order to not be skipped.
185                        Videos without view count information are always
186                        downloaded. None for no limit.
187     max_views:         An integer representing the maximum view count.
188                        Videos that are more popular than that are not
189                        downloaded.
190                        Videos without view count information are always
191                        downloaded. None for no limit.
192     download_archive:  File name of a file where all downloads are recorded.
193                        Videos already present in the file are not downloaded
194                        again.
195     cookiefile:        File name where cookies should be read from and dumped to.
196     nocheckcertificate:Do not verify SSL certificates
197     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
198                        At the moment, this is only supported by YouTube.
199     proxy:             URL of the proxy server to use
200     geo_verification_proxy:  URL of the proxy to use for IP address verification
201                        on geo-restricted sites. (Experimental)
202     socket_timeout:    Time to wait for unresponsive hosts, in seconds
203     bidi_workaround:   Work around buggy terminals without bidirectional text
204                        support, using fridibi
205     debug_printtraffic:Print out sent and received HTTP traffic
206     include_ads:       Download ads as well
207     default_search:    Prepend this string if an input url is not valid.
208                        'auto' for elaborate guessing
209     encoding:          Use this encoding instead of the system-specified.
210     extract_flat:      Do not resolve URLs, return the immediate result.
211                        Pass in 'in_playlist' to only show this behavior for
212                        playlist items.
213     postprocessors:    A list of dictionaries, each with an entry
214                        * key:  The name of the postprocessor. See
215                                youtube_dl/postprocessor/__init__.py for a list.
216                        as well as any further keyword arguments for the
217                        postprocessor.
218     progress_hooks:    A list of functions that get called on download
219                        progress, with a dictionary with the entries
220                        * status: One of "downloading", "error", or "finished".
221                                  Check this first and ignore unknown values.
222
223                        If status is one of "downloading", or "finished", the
224                        following properties may also be present:
225                        * filename: The final filename (always present)
226                        * tmpfilename: The filename we're currently writing to
227                        * downloaded_bytes: Bytes on disk
228                        * total_bytes: Size of the whole file, None if unknown
229                        * total_bytes_estimate: Guess of the eventual file size,
230                                                None if unavailable.
231                        * elapsed: The number of seconds since download started.
232                        * eta: The estimated time in seconds, None if unknown
233                        * speed: The download speed in bytes/second, None if
234                                 unknown
235                        * fragment_index: The counter of the currently
236                                          downloaded video fragment.
237                        * fragment_count: The number of fragments (= individual
238                                          files that will be merged)
239
240                        Progress hooks are guaranteed to be called at least once
241                        (with status "finished") if the download is successful.
242     merge_output_format: Extension to use when merging formats.
243     fixup:             Automatically correct known faults of the file.
244                        One of:
245                        - "never": do nothing
246                        - "warn": only emit a warning
247                        - "detect_or_warn": check whether we can do anything
248                                            about it, warn otherwise (default)
249     source_address:    (Experimental) Client-side IP address to bind to.
250     call_home:         Boolean, true iff we are allowed to contact the
251                        youtube-dl servers for debugging.
252     sleep_interval:    Minimum number of seconds to sleep before each download.
253                        Sleep will be for a random interval if --max-sleep-interval is also passed.
254     max_sleep_interval:Max number of seconds to sleep before each download.
255                        Sleep will be for a random interval if passed along with --min-sleep-interval
256                        or --sleep-interval, otherwise ignored.
257     listformats:       Print an overview of available video formats and exit.
258     list_thumbnails:   Print a table of all thumbnails and exit.
259     match_filter:      A function that gets called with the info_dict of
260                        every video.
261                        If it returns a message, the video is ignored.
262                        If it returns None, the video is downloaded.
263                        match_filter_func in utils.py is one example for this.
264     no_color:          Do not emit color codes in output.
265
266     The following options determine which downloader is picked:
267     external_downloader: Executable of the external downloader to call.
268                        None or unset for standard (built-in) downloader.
269     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
270                        if True, otherwise use ffmpeg/avconv if False, otherwise
271                        use downloader suggested by extractor if None.
272
273     The following parameters are not used by YoutubeDL itself, they are used by
274     the downloader (see youtube_dl/downloader/common.py):
275     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
276     noresizebuffer, retries, continuedl, noprogress, consoletitle,
277     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
278
279     The following options are used by the post processors:
280     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
281                        otherwise prefer avconv.
282     postprocessor_args: A list of additional command-line arguments for the
283                         postprocessor.
284     """
285
286     params = None
287     _ies = []
288     _pps = []
289     _download_retcode = None
290     _num_downloads = None
291     _screen_file = None
292
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params:    Option dictionary (see the class docstring for the
                   available keys); an empty dict is used when omitted.
        auto_init: If True, print the debug header and register the default
                   info extractors immediately.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Screen output goes to stderr instead of stdout when logtostderr is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        # --cn-verification-proxy was renamed; map it onto the new option
        # unless the user already supplied --geo-verification-proxy.
        if self.params.get('cn_verification_proxy') is not None:
            self.report_warning('--cn-verification-proxy is deprecated. Use --geo-verification-proxy instead.')
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        # Spawn a bidi conversion helper process and keep both ends of the
        # pty so later output can be piped through it (_bidi_workaround).
        if params.get('bidi_workaround', False):
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    # Prefer bidiv; fall back to fribidi if it is not installed.
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.version_info >= (3,) and sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # On Python 3, the Unicode filesystem API will throw errors (#1474)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        # Install the urllib opener (proxies, cookies, custom handlers).
        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate the configured post-processors; each dict carries the
        # PP name under 'key' plus its constructor keyword arguments.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
376
377     def warn_if_short_id(self, argv):
378         # short YouTube ID starting with dash?
379         idxs = [
380             i for i, a in enumerate(argv)
381             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
382         if idxs:
383             correct_argv = (
384                 ['youtube-dl'] +
385                 [a for i, a in enumerate(argv) if i not in idxs] +
386                 ['--'] + [argv[i] for i in idxs]
387             )
388             self.report_warning(
389                 'Long argument string detected. '
390                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
391                 args_to_str(correct_argv))
392
393     def add_info_extractor(self, ie):
394         """Add an InfoExtractor object to the end of the list."""
395         self._ies.append(ie)
396         if not isinstance(ie, type):
397             self._ies_instances[ie.ie_key()] = ie
398             ie.set_downloader(self)
399
400     def get_info_extractor(self, ie_key):
401         """
402         Get an instance of an IE with name ie_key, it will try to get one from
403         the _ies list, if there's no instance it will create a new one and add
404         it to the extractor list.
405         """
406         ie = self._ies_instances.get(ie_key)
407         if ie is None:
408             ie = get_info_extractor(ie_key)()
409             self.add_info_extractor(ie)
410         return ie
411
412     def add_default_info_extractors(self):
413         """
414         Add the InfoExtractors returned by gen_extractors to the end of the list
415         """
416         for ie in gen_extractor_classes():
417             self.add_info_extractor(ie)
418
419     def add_post_processor(self, pp):
420         """Add a PostProcessor object to the end of the chain."""
421         self._pps.append(pp)
422         pp.set_downloader(self)
423
424     def add_progress_hook(self, ph):
425         """Add the progress hook (currently only for the file downloader)"""
426         self._progress_hooks.append(ph)
427
428     def _bidi_workaround(self, message):
429         if not hasattr(self, '_output_channel'):
430             return message
431
432         assert hasattr(self, '_output_process')
433         assert isinstance(message, compat_str)
434         line_count = message.count('\n') + 1
435         self._output_process.stdin.write((message + '\n').encode('utf-8'))
436         self._output_process.stdin.flush()
437         res = ''.join(self._output_channel.readline().decode('utf-8')
438                       for _ in range(line_count))
439         return res[:-len('\n')]
440
441     def to_screen(self, message, skip_eol=False):
442         """Print message to stdout if not in quiet mode."""
443         return self.to_stdout(message, skip_eol, check_quiet=True)
444
445     def _write_string(self, s, out=None):
446         write_string(s, out=out, encoding=self.params.get('encoding'))
447
448     def to_stdout(self, message, skip_eol=False, check_quiet=False):
449         """Print message to stdout if not in quiet mode."""
450         if self.params.get('logger'):
451             self.params['logger'].debug(message)
452         elif not check_quiet or not self.params.get('quiet', False):
453             message = self._bidi_workaround(message)
454             terminator = ['\n', ''][skip_eol]
455             output = message + terminator
456
457             self._write_string(output, self._screen_file)
458
459     def to_stderr(self, message):
460         """Print message to stderr."""
461         assert isinstance(message, compat_str)
462         if self.params.get('logger'):
463             self.params['logger'].error(message)
464         else:
465             message = self._bidi_workaround(message)
466             output = message + '\n'
467             self._write_string(output, self._err_file)
468
469     def to_console_title(self, message):
470         if not self.params.get('consoletitle', False):
471             return
472         if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
473             # c_wchar_p() might not be necessary if `message` is
474             # already of type unicode()
475             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
476         elif 'TERM' in os.environ:
477             self._write_string('\033]0;%s\007' % message, self._screen_file)
478
479     def save_console_title(self):
480         if not self.params.get('consoletitle', False):
481             return
482         if 'TERM' in os.environ:
483             # Save the title on stack
484             self._write_string('\033[22;0t', self._screen_file)
485
486     def restore_console_title(self):
487         if not self.params.get('consoletitle', False):
488             return
489         if 'TERM' in os.environ:
490             # Restore the title from stack
491             self._write_string('\033[23;0t', self._screen_file)
492
493     def __enter__(self):
494         self.save_console_title()
495         return self
496
497     def __exit__(self, *args):
498         self.restore_console_title()
499
500         if self.params.get('cookiefile') is not None:
501             self.cookiejar.save()
502
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Wrapped exceptions (e.g. ExtractorError) carry their
                    # original cause in .exc_info; show that traceback first.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an exception handler: dump the current stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, preferring the wrapped exception's
            # exc_info so the original cause is preserved for the caller.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # ignoreerrors: record the failure in the process return code instead.
        self._download_retcode = 1
532
533     def report_warning(self, message):
534         '''
535         Print the message to stderr, it will be prefixed with 'WARNING:'
536         If stderr is a tty file the 'WARNING:' will be colored
537         '''
538         if self.params.get('logger') is not None:
539             self.params['logger'].warning(message)
540         else:
541             if self.params.get('no_warnings'):
542                 return
543             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
544                 _msg_header = '\033[0;33mWARNING:\033[0m'
545             else:
546                 _msg_header = 'WARNING:'
547             warning_message = '%s %s' % (_msg_header, message)
548             self.to_stderr(warning_message)
549
550     def report_error(self, message, tb=None):
551         '''
552         Do the same as trouble, but prefixes the message with 'ERROR:', colored
553         in red if stderr is a tty file.
554         '''
555         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
556             _msg_header = '\033[0;31mERROR:\033[0m'
557         else:
558             _msg_header = 'ERROR:'
559         error_message = '%s %s' % (_msg_header, message)
560         self.trouble(error_message, tb)
561
562     def report_file_already_downloaded(self, file_name):
563         """Report file has already been fully downloaded."""
564         try:
565             self.to_screen('[download] %s has already been downloaded' % file_name)
566         except UnicodeEncodeError:
567             self.to_screen('[download] The file has already been downloaded')
568
    def prepare_filename(self, info_dict):
        """Generate the output filename.

        Expands the configured output template (outtmpl) with a sanitized
        copy of info_dict; returns the resulting path, or None when the
        template is invalid.
        """
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            # Zero-padded counter of downloads done by this instance.
            autonumber_templ = '%0' + str(autonumber_size) + 'd'
            template_dict['autonumber'] = autonumber_templ % self._num_downloads
            if template_dict.get('playlist_index') is not None:
                # Pad the playlist index to the width of the playlist length.
                template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
            if template_dict.get('resolution') is None:
                # Derive a printable resolution from width/height when absent.
                if template_dict.get('width') and template_dict.get('height'):
                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
                elif template_dict.get('height'):
                    template_dict['resolution'] = '%sp' % template_dict['height']
                elif template_dict.get('width'):
                    template_dict['resolution'] = '%dx?' % template_dict['width']

            # Every scalar value is filename-sanitized; the video id gets
            # looser rules (is_id=True). Lists/tuples/dicts are dropped.
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id'))
            template_dict = dict((k, sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None and not isinstance(v, (list, tuple, dict)))
            # Fields missing from the info dict expand to the literal 'NA'.
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)

            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
            tmpl = compat_expanduser(outtmpl)
            filename = tmpl % template_dict
            # Temporary fix for #4787
            # 'Treat' all problem characters by passing filename through preferredencoding
            # to workaround encoding issues with subprocess on python2 @ Windows
            if sys.version_info < (3, 0) and sys.platform == 'win32':
                filename = encodeFilename(filename, True).decode(preferredencoding())
            return sanitize_path(filename)
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
611
    def _match_entry(self, info_dict, incomplete):
        """ Returns None iff the file should be downloaded """
        # Each check below returns a human-readable skip reason when the
        # video should NOT be downloaded; falling through returns None.

        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        if 'title' in info_dict:
            # This can happen when we're just evaluating the playlist
            title = info_dict['title']
            matchtitle = self.params.get('matchtitle', False)
            if matchtitle:
                if not re.search(matchtitle, title, re.IGNORECASE):
                    return '"' + title + '" title did not match pattern "' + matchtitle + '"'
            rejecttitle = self.params.get('rejecttitle', False)
            if rejecttitle:
                if re.search(rejecttitle, title, re.IGNORECASE):
                    return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
        date = info_dict.get('upload_date')
        if date is not None:
            # The default DateRange() accepts every date.
            dateRange = self.params.get('daterange', DateRange())
            if date not in dateRange:
                return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
        view_count = info_dict.get('view_count')
        if view_count is not None:
            # Videos without view-count info skip both bounds checks.
            min_views = self.params.get('min_views')
            if min_views is not None and view_count < min_views:
                return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
            max_views = self.params.get('max_views')
            if max_views is not None and view_count > max_views:
                return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
        if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
            return 'Skipping "%s" because it is age restricted' % video_title
        if self.in_download_archive(info_dict):
            return '%s has already been recorded in archive' % video_title

        # The user-supplied match_filter only runs on complete metadata;
        # incomplete entries (e.g. flat playlist items) would mislead it.
        if not incomplete:
            match_filter = self.params.get('match_filter')
            if match_filter is not None:
                ret = match_filter(info_dict)
                if ret is not None:
                    return ret

        return None
653
654     @staticmethod
655     def add_extra_info(info_dict, extra_info):
656         '''Set the keys from extra_info in info dict if they are missing'''
657         for key, value in extra_info.items():
658             info_dict.setdefault(key, value)
659
660     def extract_info(self, url, download=True, ie_key=None, extra_info={},
661                      process=True, force_generic_extractor=False):
662         '''
663         Returns a list with a dictionary for each video we find.
664         If 'download', also downloads the videos.
665         extra_info is a dict containing the extra values to add to each result
666         '''
667
668         if not ie_key and force_generic_extractor:
669             ie_key = 'Generic'
670
671         if ie_key:
672             ies = [self.get_info_extractor(ie_key)]
673         else:
674             ies = self._ies
675
676         for ie in ies:
677             if not ie.suitable(url):
678                 continue
679
680             ie = self.get_info_extractor(ie.ie_key())
681             if not ie.working():
682                 self.report_warning('The program functionality for this site has been marked as broken, '
683                                     'and will probably not work.')
684
685             try:
686                 ie_result = ie.extract(url)
687                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
688                     break
689                 if isinstance(ie_result, list):
690                     # Backwards compatibility: old IE result format
691                     ie_result = {
692                         '_type': 'compat_list',
693                         'entries': ie_result,
694                     }
695                 self.add_default_extra_info(ie_result, ie, url)
696                 if process:
697                     return self.process_ie_result(ie_result, download, extra_info)
698                 else:
699                     return ie_result
700             except ExtractorError as e:  # An error we somewhat expected
701                 self.report_error(compat_str(e), e.format_traceback())
702                 break
703             except MaxDownloadsReached:
704                 raise
705             except Exception as e:
706                 if self.params.get('ignoreerrors', False):
707                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
708                     break
709                 else:
710                     raise
711         else:
712             self.report_error('no suitable InfoExtractor for URL %s' % url)
713
714     def add_default_extra_info(self, ie_result, ie, url):
715         self.add_extra_info(ie_result, {
716             'extractor': ie.IE_NAME,
717             'webpage_url': url,
718             'webpage_url_basename': url_basename(url),
719             'extractor_key': ie.ie_key(),
720         })
721
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.

        Dispatches on ie_result['_type']:
        'video' (default), 'url', 'url_transparent', 'playlist',
        'multi_video' and the legacy 'compat_list'.
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            extract_flat = self.params.get('extract_flat', False)
            # With --flat-playlist (extract_flat) do not follow the nested URL:
            # either always (True) or only while inside a playlist ('in_playlist').
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # Non-None fields from the embedding page take precedence over the
            # extracted info, except for the routing fields below.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # A nested url_transparent chain would recurse here forever if the
            # inner result were also url_transparent; the extractor contract
            # forbids that.
            assert new_result.get('_type') != 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist' or result_type == 'multi_video':
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # playliststart is 1-based on the command line; convert to 0-based.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                # --playlist-items accepts a comma-separated mix of 1-based
                # indices and inclusive ranges, e.g. "1,3,5-7".
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            ie_entries = ie_result['entries']
            # Three cases: a plain list, a lazily-paged PagedList, or any
            # other iterable (typically a generator).
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    # Negative/out-of-range indices are silently dropped;
                    # i - 1 converts the 1-based user index to 0-based.
                    entries = [
                        ie_entries[i - 1] for i in playlistitems
                        if -n_all_entries <= i - 1 < n_all_entries]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        # getslice fetches exactly one entry per requested index
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                if playlistitems:
                    # Generators are not indexable: materialize first.
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    # NOTE(review): when --playlist-items is used, the index
                    # is still offset by playliststart rather than the actual
                    # position in the original playlist — verify intent.
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: only the filters that don't need full video
                # metadata (title/date matching etc.) are applied here.
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            # Legacy format: a bare list of entries; warn extractor authors.
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
886
887     def _build_format_filter(self, filter_spec):
888         " Returns a function to filter the formats according to the filter_spec "
889
890         OPERATORS = {
891             '<': operator.lt,
892             '<=': operator.le,
893             '>': operator.gt,
894             '>=': operator.ge,
895             '=': operator.eq,
896             '!=': operator.ne,
897         }
898         operator_rex = re.compile(r'''(?x)\s*
899             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
900             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
901             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
902             $
903             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
904         m = operator_rex.search(filter_spec)
905         if m:
906             try:
907                 comparison_value = int(m.group('value'))
908             except ValueError:
909                 comparison_value = parse_filesize(m.group('value'))
910                 if comparison_value is None:
911                     comparison_value = parse_filesize(m.group('value') + 'B')
912                 if comparison_value is None:
913                     raise ValueError(
914                         'Invalid value %r in format specification %r' % (
915                             m.group('value'), filter_spec))
916             op = OPERATORS[m.group('op')]
917
918         if not m:
919             STR_OPERATORS = {
920                 '=': operator.eq,
921                 '!=': operator.ne,
922                 '^=': lambda attr, value: attr.startswith(value),
923                 '$=': lambda attr, value: attr.endswith(value),
924                 '*=': lambda attr, value: value in attr,
925             }
926             str_operator_rex = re.compile(r'''(?x)
927                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
928                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
929                 \s*(?P<value>[a-zA-Z0-9._-]+)
930                 \s*$
931                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
932             m = str_operator_rex.search(filter_spec)
933             if m:
934                 comparison_value = m.group('value')
935                 op = STR_OPERATORS[m.group('op')]
936
937         if not m:
938             raise ValueError('Invalid filter specification %r' % filter_spec)
939
940         def _filter(f):
941             actual_value = f.get(m.group('key'))
942             if actual_value is None:
943                 return m.group('none_inclusive')
944             return op(actual_value, comparison_value)
945         return _filter
946
    def build_format_selector(self, format_spec):
        """Compile a -f format specification string into a selector function.

        The returned callable takes a ctx dict ({'formats': [...],
        'incomplete_formats': ...}) and yields the chosen format dicts.
        The spec is tokenized with the stdlib tokenizer, parsed by a small
        recursive-descent parser and then compiled into nested closures.
        Raises SyntaxError on an invalid specification.
        """
        def syntax_error(note, start):
            # start is a tokenizer (row, col) pair; the caret points at col.
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Node types of the parsed selector tree.
        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        def _parse_filter(tokens):
            # Consume tokens up to the closing ']' and return the raw filter
            # string (parsed later by _build_format_filter).
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        # flush the pending joined name before the bracket
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Accumulate adjacent NAME/NUMBER/other-OP tokens into one
                    # NAME token (e.g. 'mp4-baseline-16x9').
                    # NOTE(review): last_end is not advanced while joining, so
                    # the merged token keeps the first fragment's end position
                    # — positions are only used for error carets, verify.
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parse of a comma-separated selector list; the
            # inside_* flags tell which delimiters end the current context.
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        # fallback: pick the first alternative that yields formats
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        # a bare leading filter implies 'best'
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        # video+audio merge
                        video_selector = current_selector
                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
                        if not video_selector or not audio_selector:
                            raise syntax_error('"+" must be between two format selectors', start)
                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _build_selector_function(selector):
            # Compile a selector tree node into a function ctx -> formats.
            if isinstance(selector, list):
                # A list of selectors (comma-separated): chain their outputs.
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(ctx):
                    for f in fs:
                        for format in f(ctx):
                            yield format
                return selector_function
            elif selector.type == GROUP:
                selector_function = _build_selector_function(selector.selector)
            elif selector.type == PICKFIRST:
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(ctx):
                    # first alternative that yields at least one format wins
                    for f in fs:
                        picked_formats = list(f(ctx))
                        if picked_formats:
                            return picked_formats
                    return []
            elif selector.type == SINGLE:
                format_spec = selector.selector

                def selector_function(ctx):
                    formats = list(ctx['formats'])
                    if not formats:
                        return
                    if format_spec == 'all':
                        for f in formats:
                            yield f
                    elif format_spec in ['best', 'worst', None]:
                        # formats are assumed sorted worst-first, so best == -1
                        format_idx = 0 if format_spec == 'worst' else -1
                        audiovideo_formats = [
                            f for f in formats
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            yield audiovideo_formats[format_idx]
                        # for extractors with incomplete formats (audio only (soundcloud)
                        # or video only (imgur)) we will fallback to best/worst
                        # {video,audio}-only format
                        elif ctx['incomplete_formats']:
                            yield formats[format_idx]
                    elif format_spec == 'bestaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[-1]
                    elif format_spec == 'worstaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[0]
                    elif format_spec == 'bestvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[-1]
                    elif format_spec == 'worstvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[0]
                    else:
                        # otherwise the spec is an extension or a format_id;
                        # the last match is yielded (assumed-best ordering)
                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                        if format_spec in extensions:
                            filter_f = lambda f: f['ext'] == format_spec
                        else:
                            filter_f = lambda f: f['format_id'] == format_spec
                        matches = list(filter(filter_f, formats))
                        if matches:
                            yield matches[-1]
            elif selector.type == MERGE:
                def _merge(formats_info):
                    format_1, format_2 = [f['format_id'] for f in formats_info]
                    # The first format must contain the video and the
                    # second the audio
                    if formats_info[0].get('vcodec') == 'none':
                        self.report_error('The first format must '
                                          'contain the video, try using '
                                          '"-f %s+%s"' % (format_2, format_1))
                        return
                    # Formats must be opposite (video+audio)
                    if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
                        self.report_error(
                            'Both formats %s and %s are video-only, you must specify "-f video+audio"'
                            % (format_1, format_2))
                        return
                    output_ext = (
                        formats_info[0]['ext']
                        if self.params.get('merge_output_format') is None
                        else self.params['merge_output_format'])
                    # synthesize a combined format dict: video attributes from
                    # the first format, audio attributes from the second
                    return {
                        'requested_formats': formats_info,
                        'format': '%s+%s' % (formats_info[0].get('format'),
                                             formats_info[1].get('format')),
                        'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                formats_info[1].get('format_id')),
                        'width': formats_info[0].get('width'),
                        'height': formats_info[0].get('height'),
                        'resolution': formats_info[0].get('resolution'),
                        'fps': formats_info[0].get('fps'),
                        'vcodec': formats_info[0].get('vcodec'),
                        'vbr': formats_info[0].get('vbr'),
                        'stretched_ratio': formats_info[0].get('stretched_ratio'),
                        'acodec': formats_info[1].get('acodec'),
                        'abr': formats_info[1].get('abr'),
                        'ext': output_ext,
                    }
                video_selector, audio_selector = map(_build_selector_function, selector.selector)

                def selector_function(ctx):
                    # deepcopy so each sub-selector sees an unconsumed ctx
                    for pair in itertools.product(
                            video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
                        yield _merge(pair)

            # Apply the node's [...] filters before delegating to the selector.
            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(ctx):
                ctx_copy = copy.deepcopy(ctx)
                for _filter in filters:
                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
                return selector_function(ctx_copy)
            return final_selector

        # Tokenize the spec with the stdlib tokenizer (bytes stream required
        # by the py2/py3 compat wrapper).
        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Iterator over the token list with one-token push-back support,
            # needed by the parser to "unread" a delimiter.
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            next = __next__  # py2 iterator protocol

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
1211
1212     def _calc_headers(self, info_dict):
1213         res = std_headers.copy()
1214
1215         add_headers = info_dict.get('http_headers')
1216         if add_headers:
1217             res.update(add_headers)
1218
1219         cookies = self._calc_cookies(info_dict)
1220         if cookies:
1221             res['Cookie'] = cookies
1222
1223         return res
1224
1225     def _calc_cookies(self, info_dict):
1226         pr = sanitized_Request(info_dict['url'])
1227         self.cookiejar.add_cookie_header(pr)
1228         return pr.get_header('Cookie')
1229
1230     def process_video_result(self, info_dict, download=True):
1231         assert info_dict.get('_type', 'video') == 'video'
1232
1233         if 'id' not in info_dict:
1234             raise ExtractorError('Missing "id" field in extractor result')
1235         if 'title' not in info_dict:
1236             raise ExtractorError('Missing "title" field in extractor result')
1237
1238         if not isinstance(info_dict['id'], compat_str):
1239             self.report_warning('"id" field is not a string - forcing string conversion')
1240             info_dict['id'] = compat_str(info_dict['id'])
1241
1242         if 'playlist' not in info_dict:
1243             # It isn't part of a playlist
1244             info_dict['playlist'] = None
1245             info_dict['playlist_index'] = None
1246
1247         thumbnails = info_dict.get('thumbnails')
1248         if thumbnails is None:
1249             thumbnail = info_dict.get('thumbnail')
1250             if thumbnail:
1251                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1252         if thumbnails:
1253             thumbnails.sort(key=lambda t: (
1254                 t.get('preference'), t.get('width'), t.get('height'),
1255                 t.get('id'), t.get('url')))
1256             for i, t in enumerate(thumbnails):
1257                 t['url'] = sanitize_url(t['url'])
1258                 if t.get('width') and t.get('height'):
1259                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
1260                 if t.get('id') is None:
1261                     t['id'] = '%d' % i
1262
1263         if self.params.get('list_thumbnails'):
1264             self.list_thumbnails(info_dict)
1265             return
1266
1267         thumbnail = info_dict.get('thumbnail')
1268         if thumbnail:
1269             info_dict['thumbnail'] = sanitize_url(thumbnail)
1270         elif thumbnails:
1271             info_dict['thumbnail'] = thumbnails[-1]['url']
1272
1273         if 'display_id' not in info_dict and 'id' in info_dict:
1274             info_dict['display_id'] = info_dict['id']
1275
1276         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1277             # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1278             # see http://bugs.python.org/issue1646728)
1279             try:
1280                 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1281                 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1282             except (ValueError, OverflowError, OSError):
1283                 pass
1284
1285         # Auto generate title fields corresponding to the *_number fields when missing
1286         # in order to always have clean titles. This is very common for TV series.
1287         for field in ('chapter', 'season', 'episode'):
1288             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1289                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1290
1291         subtitles = info_dict.get('subtitles')
1292         if subtitles:
1293             for _, subtitle in subtitles.items():
1294                 for subtitle_format in subtitle:
1295                     if subtitle_format.get('url'):
1296                         subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1297                     if 'ext' not in subtitle_format:
1298                         subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1299
1300         if self.params.get('listsubtitles', False):
1301             if 'automatic_captions' in info_dict:
1302                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1303             self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1304             return
1305         info_dict['requested_subtitles'] = self.process_subtitles(
1306             info_dict['id'], subtitles,
1307             info_dict.get('automatic_captions'))
1308
1309         # We now pick which formats have to be downloaded
1310         if info_dict.get('formats') is None:
1311             # There's only one format available
1312             formats = [info_dict]
1313         else:
1314             formats = info_dict['formats']
1315
1316         if not formats:
1317             raise ExtractorError('No video formats found!')
1318
1319         formats_dict = {}
1320
1321         # We check that all the formats have the format and format_id fields
1322         for i, format in enumerate(formats):
1323             if 'url' not in format:
1324                 raise ExtractorError('Missing "url" key in result (index %d)' % i)
1325
1326             format['url'] = sanitize_url(format['url'])
1327
1328             if format.get('format_id') is None:
1329                 format['format_id'] = compat_str(i)
1330             else:
1331                 # Sanitize format_id from characters used in format selector expression
1332                 format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id'])
1333             format_id = format['format_id']
1334             if format_id not in formats_dict:
1335                 formats_dict[format_id] = []
1336             formats_dict[format_id].append(format)
1337
1338         # Make sure all formats have unique format_id
1339         for format_id, ambiguous_formats in formats_dict.items():
1340             if len(ambiguous_formats) > 1:
1341                 for i, format in enumerate(ambiguous_formats):
1342                     format['format_id'] = '%s-%d' % (format_id, i)
1343
1344         for i, format in enumerate(formats):
1345             if format.get('format') is None:
1346                 format['format'] = '{id} - {res}{note}'.format(
1347                     id=format['format_id'],
1348                     res=self.format_resolution(format),
1349                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1350                 )
1351             # Automatically determine file extension if missing
1352             if 'ext' not in format:
1353                 format['ext'] = determine_ext(format['url']).lower()
1354             # Automatically determine protocol if missing (useful for format
1355             # selection purposes)
1356             if 'protocol' not in format:
1357                 format['protocol'] = determine_protocol(format)
1358             # Add HTTP headers, so that external programs can use them from the
1359             # json output
1360             full_format_info = info_dict.copy()
1361             full_format_info.update(format)
1362             format['http_headers'] = self._calc_headers(full_format_info)
1363
1364         # TODO Central sorting goes here
1365
1366         if formats[0] is not info_dict:
1367             # only set the 'formats' fields if the original info_dict list them
1368             # otherwise we end up with a circular reference, the first (and unique)
1369             # element in the 'formats' field in info_dict is info_dict itself,
1370             # which can't be exported to json
1371             info_dict['formats'] = formats
1372         if self.params.get('listformats'):
1373             self.list_formats(info_dict)
1374             return
1375
1376         req_format = self.params.get('format')
1377         if req_format is None:
1378             req_format_list = []
1379             if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
1380                     not info_dict.get('is_live')):
1381                 merger = FFmpegMergerPP(self)
1382                 if merger.available and merger.can_merge():
1383                     req_format_list.append('bestvideo+bestaudio')
1384             req_format_list.append('best')
1385             req_format = '/'.join(req_format_list)
1386         format_selector = self.build_format_selector(req_format)
1387
1388         # While in format selection we may need to have an access to the original
1389         # format set in order to calculate some metrics or do some processing.
1390         # For now we need to be able to guess whether original formats provided
1391         # by extractor are incomplete or not (i.e. whether extractor provides only
1392         # video-only or audio-only formats) for proper formats selection for
1393         # extractors with such incomplete formats (see
1394         # https://github.com/rg3/youtube-dl/pull/5556).
1395         # Since formats may be filtered during format selection and may not match
1396         # the original formats the results may be incorrect. Thus original formats
1397         # or pre-calculated metrics should be passed to format selection routines
1398         # as well.
1399         # We will pass a context object containing all necessary additional data
1400         # instead of just formats.
1401         # This fixes incorrect format selection issue (see
1402         # https://github.com/rg3/youtube-dl/issues/10083).
1403         incomplete_formats = (
1404             # All formats are video-only or
1405             all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
1406             # all formats are audio-only
1407             all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
1408
1409         ctx = {
1410             'formats': formats,
1411             'incomplete_formats': incomplete_formats,
1412         }
1413
1414         formats_to_download = list(format_selector(ctx))
1415         if not formats_to_download:
1416             raise ExtractorError('requested format not available',
1417                                  expected=True)
1418
1419         if download:
1420             if len(formats_to_download) > 1:
1421                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1422             for format in formats_to_download:
1423                 new_info = dict(info_dict)
1424                 new_info.update(format)
1425                 self.process_info(new_info)
1426         # We update the info dict with the best quality format (backwards compatibility)
1427         info_dict.update(formats_to_download[-1])
1428         return info_dict
1429
1430     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1431         """Select the requested subtitles and their format"""
1432         available_subs = {}
1433         if normal_subtitles and self.params.get('writesubtitles'):
1434             available_subs.update(normal_subtitles)
1435         if automatic_captions and self.params.get('writeautomaticsub'):
1436             for lang, cap_info in automatic_captions.items():
1437                 if lang not in available_subs:
1438                     available_subs[lang] = cap_info
1439
1440         if (not self.params.get('writesubtitles') and not
1441                 self.params.get('writeautomaticsub') or not
1442                 available_subs):
1443             return None
1444
1445         if self.params.get('allsubtitles', False):
1446             requested_langs = available_subs.keys()
1447         else:
1448             if self.params.get('subtitleslangs', False):
1449                 requested_langs = self.params.get('subtitleslangs')
1450             elif 'en' in available_subs:
1451                 requested_langs = ['en']
1452             else:
1453                 requested_langs = [list(available_subs.keys())[0]]
1454
1455         formats_query = self.params.get('subtitlesformat', 'best')
1456         formats_preference = formats_query.split('/') if formats_query else []
1457         subs = {}
1458         for lang in requested_langs:
1459             formats = available_subs.get(lang)
1460             if formats is None:
1461                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1462                 continue
1463             for ext in formats_preference:
1464                 if ext == 'best':
1465                     f = formats[-1]
1466                     break
1467                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1468                 if matches:
1469                     f = matches[-1]
1470                     break
1471             else:
1472                 f = formats[-1]
1473                 self.report_warning(
1474                     'No subtitle format found matching "%s" for language %s, '
1475                     'using %s' % (formats_query, lang, f['ext']))
1476             subs[lang] = f
1477         return subs
1478
    def process_info(self, info_dict):
        """Process a single resolved IE result.

        Checks download limits and match filters, performs forced stdout
        printings, writes requested side files (description, annotations,
        subtitles, info JSON, thumbnails), downloads the media itself
        (merging multiple requested formats when needed), then runs fixup
        postprocessors and records the download in the archive.
        """

        assert info_dict.get('_type', 'video') == 'video'

        # Honour --max-downloads before doing any work on this entry
        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads >= int(max_downloads):
                raise MaxDownloadsReached()

        # Keep the untruncated title; the working title is capped at
        # 200 characters to keep filenames manageable
        info_dict['fulltitle'] = info_dict['title']
        if len(info_dict['title']) > 200:
            info_dict['title'] = info_dict['title'][:197] + '...'

        if 'format' not in info_dict:
            info_dict['format'] = info_dict['ext']

        # Skip the video if it is rejected by --match-filter and friends
        reason = self._match_entry(info_dict, incomplete=False)
        if reason is not None:
            self.to_screen('[download] ' + reason)
            return

        self._num_downloads += 1

        info_dict['_filename'] = filename = self.prepare_filename(info_dict)

        # Forced printings (--get-title, --get-url, --dump-json, ...)
        if self.params.get('forcetitle', False):
            self.to_stdout(info_dict['fulltitle'])
        if self.params.get('forceid', False):
            self.to_stdout(info_dict['id'])
        if self.params.get('forceurl', False):
            if info_dict.get('requested_formats') is not None:
                for f in info_dict['requested_formats']:
                    self.to_stdout(f['url'] + f.get('play_path', ''))
            else:
                # For RTMP URLs, also include the playpath
                self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
        if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
            self.to_stdout(info_dict['thumbnail'])
        if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
            self.to_stdout(info_dict['description'])
        if self.params.get('forcefilename', False) and filename is not None:
            self.to_stdout(filename)
        if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
            self.to_stdout(formatSeconds(info_dict['duration']))
        if self.params.get('forceformat', False):
            self.to_stdout(info_dict['format'])
        if self.params.get('forcejson', False):
            self.to_stdout(json.dumps(info_dict))

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        if filename is None:
            return

        # Create the target directory if it does not exist yet
        try:
            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
            if dn and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError) as err:
            self.report_error('unable to create directory ' + error_to_compat_str(err))
            return

        # --write-description
        if self.params.get('writedescription', False):
            descfn = replace_extension(filename, 'description', info_dict.get('ext'))
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
                self.to_screen('[info] Video description is already present')
            elif info_dict.get('description') is None:
                self.report_warning('There\'s no description to write.')
            else:
                try:
                    self.to_screen('[info] Writing video description to: ' + descfn)
                    with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                        descfile.write(info_dict['description'])
                except (OSError, IOError):
                    self.report_error('Cannot write description file ' + descfn)
                    return

        # --write-annotations
        if self.params.get('writeannotations', False):
            annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
                self.to_screen('[info] Video annotations are already present')
            else:
                try:
                    self.to_screen('[info] Writing video annotations to: ' + annofn)
                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
                        annofile.write(info_dict['annotations'])
                except (KeyError, TypeError):
                    # 'annotations' missing or None
                    self.report_warning('There are no annotations to write.')
                except (OSError, IOError):
                    self.report_error('Cannot write annotations file: ' + annofn)
                    return

        subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                       self.params.get('writeautomaticsub')])

        if subtitles_are_requested and info_dict.get('requested_subtitles'):
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            subtitles = info_dict['requested_subtitles']
            ie = self.get_info_extractor(info_dict['extractor_key'])
            for sub_lang, sub_info in subtitles.items():
                sub_format = sub_info['ext']
                if sub_info.get('data') is not None:
                    # Subtitle content already embedded in the info dict
                    sub_data = sub_info['data']
                else:
                    try:
                        sub_data = ie._download_webpage(
                            sub_info['url'], info_dict['id'], note=False)
                    except ExtractorError as err:
                        self.report_warning('Unable to download subtitle for "%s": %s' %
                                            (sub_lang, error_to_compat_str(err.cause)))
                        continue
                try:
                    sub_filename = subtitles_filename(filename, sub_lang, sub_format)
                    if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
                        # NOTE(review): "already_present" looks like a typo
                        # for "already present" in this user-facing message
                        self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
                    else:
                        self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
                            subfile.write(sub_data)
                except (OSError, IOError):
                    self.report_error('Cannot write subtitles file ' + sub_filename)
                    return

        # --write-info-json
        if self.params.get('writeinfojson', False):
            infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
                self.to_screen('[info] Video description metadata is already present')
            else:
                self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
                try:
                    write_json_file(self.filter_requested_info(info_dict), infofn)
                except (OSError, IOError):
                    self.report_error('Cannot write metadata to JSON file ' + infofn)
                    return

        self._write_thumbnails(info_dict, filename)

        if not self.params.get('skip_download', False):
            try:
                def dl(name, info):
                    # Pick and invoke a suitable FileDownloader for this format
                    fd = get_suitable_downloader(info, self.params)(self, self.params)
                    for ph in self._progress_hooks:
                        fd.add_progress_hook(ph)
                    if self.params.get('verbose'):
                        self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
                    return fd.download(name, info)

                if info_dict.get('requested_formats') is not None:
                    # Multiple formats were requested (e.g. bestvideo+bestaudio):
                    # download each separately and merge afterwards
                    downloaded = []
                    success = True
                    merger = FFmpegMergerPP(self)
                    if not merger.available:
                        postprocessors = []
                        self.report_warning('You have requested multiple '
                                            'formats but ffmpeg or avconv are not installed.'
                                            ' The formats won\'t be merged.')
                    else:
                        postprocessors = [merger]

                    def compatible_formats(formats):
                        # Whether the (video, audio) pair can be merged into
                        # the originally requested container without remuxing
                        video, audio = formats
                        # Check extension
                        # NOTE(review): the names below look swapped
                        # (video_ext reads audio's ext and vice versa);
                        # harmless for this symmetric membership check,
                        # but worth confirming.
                        video_ext, audio_ext = audio.get('ext'), video.get('ext')
                        if video_ext and audio_ext:
                            COMPATIBLE_EXTS = (
                                ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
                                # NOTE(review): ('webm') is a plain string, not
                                # a 1-tuple; 'in' then does substring matching,
                                # which still only matches 'webm' in practice.
                                ('webm')
                            )
                            for exts in COMPATIBLE_EXTS:
                                if video_ext in exts and audio_ext in exts:
                                    return True
                        # TODO: Check acodec/vcodec
                        return False

                    filename_real_ext = os.path.splitext(filename)[1][1:]
                    filename_wo_ext = (
                        os.path.splitext(filename)[0]
                        if filename_real_ext == info_dict['ext']
                        else filename)
                    requested_formats = info_dict['requested_formats']
                    if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
                        info_dict['ext'] = 'mkv'
                        self.report_warning(
                            'Requested formats are incompatible for merge and will be merged into mkv.')
                    # Ensure filename always has a correct extension for successful merge
                    filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
                    if os.path.exists(encodeFilename(filename)):
                        self.to_screen(
                            '[download] %s has already been downloaded and '
                            'merged' % filename)
                    else:
                        # Download every requested format into an 'f<id>'-prefixed
                        # temporary file; the merger combines them afterwards
                        for f in requested_formats:
                            new_info = dict(info_dict)
                            new_info.update(f)
                            fname = self.prepare_filename(new_info)
                            fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
                            downloaded.append(fname)
                            partial_success = dl(fname, new_info)
                            success = success and partial_success
                        info_dict['__postprocessors'] = postprocessors
                        info_dict['__files_to_merge'] = downloaded
                else:
                    # Just a single file
                    success = dl(filename, info_dict)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self.report_error('unable to download video data: %s' % error_to_compat_str(err))
                return
            except (OSError, IOError) as err:
                raise UnavailableVideoError(err)
            except (ContentTooShortError, ) as err:
                self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

            if success and filename != '-':
                # Fixup content
                fixup_policy = self.params.get('fixup')
                if fixup_policy is None:
                    fixup_policy = 'detect_or_warn'

                INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'

                # Fix non-uniform pixel aspect ratio if the extractor flagged it
                stretched_ratio = info_dict.get('stretched_ratio')
                if stretched_ratio is not None and stretched_ratio != 1:
                    if fixup_policy == 'warn':
                        self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
                            info_dict['id'], stretched_ratio))
                    elif fixup_policy == 'detect_or_warn':
                        stretched_pp = FFmpegFixupStretchedPP(self)
                        if stretched_pp.available:
                            info_dict.setdefault('__postprocessors', [])
                            info_dict['__postprocessors'].append(stretched_pp)
                        else:
                            self.report_warning(
                                '%s: Non-uniform pixel ratio (%s). %s'
                                % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
                    else:
                        assert fixup_policy in ('ignore', 'never')

                # Fix DASH m4a container (single-format downloads only)
                if (info_dict.get('requested_formats') is None and
                        info_dict.get('container') == 'm4a_dash'):
                    if fixup_policy == 'warn':
                        self.report_warning(
                            '%s: writing DASH m4a. '
                            'Only some players support this container.'
                            % info_dict['id'])
                    elif fixup_policy == 'detect_or_warn':
                        fixup_pp = FFmpegFixupM4aPP(self)
                        if fixup_pp.available:
                            info_dict.setdefault('__postprocessors', [])
                            info_dict['__postprocessors'].append(fixup_pp)
                        else:
                            self.report_warning(
                                '%s: writing DASH m4a. '
                                'Only some players support this container. %s'
                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                    else:
                        assert fixup_policy in ('ignore', 'never')

                # Fix malformed AAC bitstreams produced by HLS downloads
                if (info_dict.get('protocol') == 'm3u8_native' or
                        info_dict.get('protocol') == 'm3u8' and
                        self.params.get('hls_prefer_native')):
                    if fixup_policy == 'warn':
                        self.report_warning('%s: malformated aac bitstream.' % (
                            info_dict['id']))
                    elif fixup_policy == 'detect_or_warn':
                        fixup_pp = FFmpegFixupM3u8PP(self)
                        if fixup_pp.available:
                            info_dict.setdefault('__postprocessors', [])
                            info_dict['__postprocessors'].append(fixup_pp)
                        else:
                            self.report_warning(
                                '%s: malformated aac bitstream. %s'
                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                    else:
                        assert fixup_policy in ('ignore', 'never')

                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError) as err:
                    self.report_error('postprocessing: %s' % str(err))
                    return
                self.record_download_archive(info_dict)
1766
1767     def download(self, url_list):
1768         """Download a given list of URLs."""
1769         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1770         if (len(url_list) > 1 and
1771                 '%' not in outtmpl and
1772                 self.params.get('max_downloads') != 1):
1773             raise SameFileError(outtmpl)
1774
1775         for url in url_list:
1776             try:
1777                 # It also downloads the videos
1778                 res = self.extract_info(
1779                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1780             except UnavailableVideoError:
1781                 self.report_error('unable to download video')
1782             except MaxDownloadsReached:
1783                 self.to_screen('[info] Maximum number of downloaded files reached.')
1784                 raise
1785             else:
1786                 if self.params.get('dump_single_json', False):
1787                     self.to_stdout(json.dumps(res))
1788
1789         return self._download_retcode
1790
1791     def download_with_info_file(self, info_filename):
1792         with contextlib.closing(fileinput.FileInput(
1793                 [info_filename], mode='r',
1794                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1795             # FileInput doesn't have a read method, we can't call json.load
1796             info = self.filter_requested_info(json.loads('\n'.join(f)))
1797         try:
1798             self.process_ie_result(info, download=True)
1799         except DownloadError:
1800             webpage_url = info.get('webpage_url')
1801             if webpage_url is not None:
1802                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1803                 return self.download([webpage_url])
1804             else:
1805                 raise
1806         return self._download_retcode
1807
1808     @staticmethod
1809     def filter_requested_info(info_dict):
1810         return dict(
1811             (k, v) for k, v in info_dict.items()
1812             if k not in ['requested_formats', 'requested_subtitles'])
1813
1814     def post_process(self, filename, ie_info):
1815         """Run all the postprocessors on the given file."""
1816         info = dict(ie_info)
1817         info['filepath'] = filename
1818         pps_chain = []
1819         if ie_info.get('__postprocessors') is not None:
1820             pps_chain.extend(ie_info['__postprocessors'])
1821         pps_chain.extend(self._pps)
1822         for pp in pps_chain:
1823             files_to_delete = []
1824             try:
1825                 files_to_delete, info = pp.run(info)
1826             except PostProcessingError as e:
1827                 self.report_error(e.msg)
1828             if files_to_delete and not self.params.get('keepvideo', False):
1829                 for old_filename in files_to_delete:
1830                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1831                     try:
1832                         os.remove(encodeFilename(old_filename))
1833                     except (IOError, OSError):
1834                         self.report_warning('Unable to remove downloaded original file')
1835
1836     def _make_archive_id(self, info_dict):
1837         # Future-proof against any change in case
1838         # and backwards compatibility with prior versions
1839         extractor = info_dict.get('extractor_key')
1840         if extractor is None:
1841             if 'id' in info_dict:
1842                 extractor = info_dict.get('ie_key')  # key in a playlist
1843         if extractor is None:
1844             return None  # Incomplete video information
1845         return extractor.lower() + ' ' + info_dict['id']
1846
1847     def in_download_archive(self, info_dict):
1848         fn = self.params.get('download_archive')
1849         if fn is None:
1850             return False
1851
1852         vid_id = self._make_archive_id(info_dict)
1853         if vid_id is None:
1854             return False  # Incomplete video information
1855
1856         try:
1857             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1858                 for line in archive_file:
1859                     if line.strip() == vid_id:
1860                         return True
1861         except IOError as ioe:
1862             if ioe.errno != errno.ENOENT:
1863                 raise
1864         return False
1865
1866     def record_download_archive(self, info_dict):
1867         fn = self.params.get('download_archive')
1868         if fn is None:
1869             return
1870         vid_id = self._make_archive_id(info_dict)
1871         assert vid_id
1872         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1873             archive_file.write(vid_id + '\n')
1874
1875     @staticmethod
1876     def format_resolution(format, default='unknown'):
1877         if format.get('vcodec') == 'none':
1878             return 'audio only'
1879         if format.get('resolution') is not None:
1880             return format['resolution']
1881         if format.get('height') is not None:
1882             if format.get('width') is not None:
1883                 res = '%sx%s' % (format['width'], format['height'])
1884             else:
1885                 res = '%sp' % format['height']
1886         elif format.get('width') is not None:
1887             res = '%dx?' % format['width']
1888         else:
1889             res = default
1890         return res
1891
1892     def _format_note(self, fdict):
1893         res = ''
1894         if fdict.get('ext') in ['f4f', 'f4m']:
1895             res += '(unsupported) '
1896         if fdict.get('language'):
1897             if res:
1898                 res += ' '
1899             res += '[%s] ' % fdict['language']
1900         if fdict.get('format_note') is not None:
1901             res += fdict['format_note'] + ' '
1902         if fdict.get('tbr') is not None:
1903             res += '%4dk ' % fdict['tbr']
1904         if fdict.get('container') is not None:
1905             if res:
1906                 res += ', '
1907             res += '%s container' % fdict['container']
1908         if (fdict.get('vcodec') is not None and
1909                 fdict.get('vcodec') != 'none'):
1910             if res:
1911                 res += ', '
1912             res += fdict['vcodec']
1913             if fdict.get('vbr') is not None:
1914                 res += '@'
1915         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1916             res += 'video@'
1917         if fdict.get('vbr') is not None:
1918             res += '%4dk' % fdict['vbr']
1919         if fdict.get('fps') is not None:
1920             if res:
1921                 res += ', '
1922             res += '%sfps' % fdict['fps']
1923         if fdict.get('acodec') is not None:
1924             if res:
1925                 res += ', '
1926             if fdict['acodec'] == 'none':
1927                 res += 'video only'
1928             else:
1929                 res += '%-5s' % fdict['acodec']
1930         elif fdict.get('abr') is not None:
1931             if res:
1932                 res += ', '
1933             res += 'audio'
1934         if fdict.get('abr') is not None:
1935             res += '@%3dk' % fdict['abr']
1936         if fdict.get('asr') is not None:
1937             res += ' (%5dHz)' % fdict['asr']
1938         if fdict.get('filesize') is not None:
1939             if res:
1940                 res += ', '
1941             res += format_bytes(fdict['filesize'])
1942         elif fdict.get('filesize_approx') is not None:
1943             if res:
1944                 res += ', '
1945             res += '~' + format_bytes(fdict['filesize_approx'])
1946         return res
1947
1948     def list_formats(self, info_dict):
1949         formats = info_dict.get('formats', [info_dict])
1950         table = [
1951             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
1952             for f in formats
1953             if f.get('preference') is None or f['preference'] >= -1000]
1954         if len(formats) > 1:
1955             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
1956
1957         header_line = ['format code', 'extension', 'resolution', 'note']
1958         self.to_screen(
1959             '[info] Available formats for %s:\n%s' %
1960             (info_dict['id'], render_table(header_line, table)))
1961
1962     def list_thumbnails(self, info_dict):
1963         thumbnails = info_dict.get('thumbnails')
1964         if not thumbnails:
1965             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
1966             return
1967
1968         self.to_screen(
1969             '[info] Thumbnails for %s:' % info_dict['id'])
1970         self.to_screen(render_table(
1971             ['ID', 'width', 'height', 'URL'],
1972             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1973
1974     def list_subtitles(self, video_id, subtitles, name='subtitles'):
1975         if not subtitles:
1976             self.to_screen('%s has no %s' % (video_id, name))
1977             return
1978         self.to_screen(
1979             'Available %s for %s:' % (name, video_id))
1980         self.to_screen(render_table(
1981             ['Language', 'formats'],
1982             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
1983                 for lang, formats in subtitles.items()]))
1984
1985     def urlopen(self, req):
1986         """ Start an HTTP download """
1987         if isinstance(req, compat_basestring):
1988             req = sanitized_Request(req)
1989         return self._opener.open(req, timeout=self._socket_timeout)
1990
    def print_debug_header(self):
        """Write diagnostic information (interpreter, encodings, versions,
        proxies) to the debug output. Active only with the 'verbose' option;
        with 'call_home' it additionally contacts yt-dl.org."""
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # sys.stdout may have been replaced by an object lacking .encoding
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        # Best-effort report of the git revision when running from a checkout;
        # any failure (no git, not a repository) is deliberately swallowed
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            try:
                # Python 2 only; clears the current exception state
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        # External program versions (ffmpeg/avconv family plus rtmpdump)
        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Merge the proxy settings from every opener handler that has any
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # NOTE: these are live network requests to yt-dl.org
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2057
    def _setup_opener(self):
        """Build the urllib opener used for all HTTP(S) traffic.

        Configures, from self.params: the socket timeout, the cookie jar
        (optionally persisted via 'cookiefile'), per-request proxies, the
        custom HTTPS/data handlers, and a FileHandler that rejects file://
        URLs. Stores the result in self._opener.
        """
        timeout_val = self.params.get('socket_timeout')
        # Default socket timeout is 600 seconds
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # In-memory cookies only
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            opts_cookiefile = compat_expanduser(opts_cookiefile)
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Load only when the cookie file already exists and is readable
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An explicit empty --proxy disables proxying entirely
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            # Fall back to the environment's proxy settings
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2110
2111     def encode(self, s):
2112         if isinstance(s, bytes):
2113             return s  # Already encoded
2114
2115         try:
2116             return s.encode(self.get_encoding())
2117         except UnicodeEncodeError as err:
2118             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2119             raise
2120
2121     def get_encoding(self):
2122         encoding = self.params.get('encoding')
2123         if encoding is None:
2124             encoding = preferredencoding()
2125         return encoding
2126
    def _write_thumbnails(self, info_dict, filename):
        """Download thumbnail image(s) next to the media file.

        With 'writethumbnail' only the last listed thumbnail is written;
        with 'write_all_thumbnails' every thumbnail is.  The path of each
        written image is stored in t['filename'].  Download errors are
        reported as warnings instead of being raised.
        """
        if self.params.get('writethumbnail', False):
            thumbnails = info_dict.get('thumbnails')
            if thumbnails:
                # Keep only the last entry (list order presumed worst-to-best
                # by the extractors — confirm against callers)
                thumbnails = [thumbnails[-1]]
        elif self.params.get('write_all_thumbnails', False):
            thumbnails = info_dict.get('thumbnails')
        else:
            return

        if not thumbnails:
            # No thumbnails present, so return immediately
            return

        for t in thumbnails:
            thumb_ext = determine_ext(t['url'], 'jpg')
            # Disambiguate filenames/messages only when writing several images
            suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
            thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
            t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext

            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                self.to_screen('[%s] %s: Thumbnail %sis already present' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
            else:
                self.to_screen('[%s] %s: Downloading thumbnail %s...' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
                try:
                    uf = self.urlopen(t['url'])
                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                    self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
                                   (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self.report_warning('Unable to download thumbnail "%s": %s' %
                                        (t['url'], error_to_compat_str(err)))