[YoutubeDL] Fix format selection with filters (Closes #10083)
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27
28 from .compat import (
29     compat_basestring,
30     compat_cookiejar,
31     compat_expanduser,
32     compat_get_terminal_size,
33     compat_http_client,
34     compat_kwargs,
35     compat_os_name,
36     compat_str,
37     compat_tokenize_tokenize,
38     compat_urllib_error,
39     compat_urllib_request,
40     compat_urllib_request_DataHandler,
41 )
42 from .utils import (
43     age_restricted,
44     args_to_str,
45     ContentTooShortError,
46     date_from_str,
47     DateRange,
48     DEFAULT_OUTTMPL,
49     determine_ext,
50     determine_protocol,
51     DownloadError,
52     encode_compat_str,
53     encodeFilename,
54     error_to_compat_str,
55     ExtractorError,
56     format_bytes,
57     formatSeconds,
58     locked_file,
59     make_HTTPS_handler,
60     MaxDownloadsReached,
61     PagedList,
62     parse_filesize,
63     PerRequestProxyHandler,
64     platform_name,
65     PostProcessingError,
66     preferredencoding,
67     prepend_extension,
68     register_socks_protocols,
69     render_table,
70     replace_extension,
71     SameFileError,
72     sanitize_filename,
73     sanitize_path,
74     sanitize_url,
75     sanitized_Request,
76     std_headers,
77     subtitles_filename,
78     UnavailableVideoError,
79     url_basename,
80     version_tuple,
81     write_json_file,
82     write_string,
83     YoutubeDLCookieProcessor,
84     YoutubeDLHandler,
85 )
86 from .cache import Cache
87 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
88 from .downloader import get_suitable_downloader
89 from .downloader.rtmp import rtmpdump_version
90 from .postprocessor import (
91     FFmpegFixupM3u8PP,
92     FFmpegFixupM4aPP,
93     FFmpegFixupStretchedPP,
94     FFmpegMergerPP,
95     FFmpegPostProcessor,
96     get_postprocessor,
97 )
98 from .version import __version__
99
100 if compat_os_name == 'nt':
101     import ctypes
102
103
104 class YoutubeDL(object):
105     """YoutubeDL class.
106
107     YoutubeDL objects are the ones responsible of downloading the
108     actual video file and writing it to disk if the user has requested
109     it, among some other tasks. In most cases there should be one per
110     program. As, given a video URL, the downloader doesn't know how to
111     extract all the needed information, task that InfoExtractors do, it
112     has to pass the URL to one of them.
113
114     For this, YoutubeDL objects have a method that allows
115     InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
117     finds that reports being able to handle it. The InfoExtractor extracts
118     all the information about the video or videos the URL refers to, and
119     YoutubeDL process the extracted information, possibly using a File
120     Downloader to download the video.
121
122     YoutubeDL objects accept a lot of parameters. In order not to saturate
123     the object constructor with arguments, it receives a dictionary of
124     options instead. These options are available through the params
125     attribute for the InfoExtractors to use. The YoutubeDL also
126     registers itself as the downloader in charge for the InfoExtractors
127     that are added to it, so this is a "mutual registration".
128
129     Available options:
130
131     username:          Username for authentication purposes.
132     password:          Password for authentication purposes.
133     videopassword:     Password for accessing a video.
134     usenetrc:          Use netrc for authentication instead.
135     verbose:           Print additional info to stdout.
136     quiet:             Do not print messages to stdout.
137     no_warnings:       Do not print out anything for warnings.
138     forceurl:          Force printing final URL.
139     forcetitle:        Force printing title.
140     forceid:           Force printing ID.
141     forcethumbnail:    Force printing thumbnail URL.
142     forcedescription:  Force printing description.
143     forcefilename:     Force printing final filename.
144     forceduration:     Force printing duration.
145     forcejson:         Force printing info_dict as JSON.
146     dump_single_json:  Force printing the info_dict of the whole playlist
147                        (or video) as a single JSON line.
148     simulate:          Do not download the video files.
149     format:            Video format code. See options.py for more information.
150     outtmpl:           Template for output names.
151     restrictfilenames: Do not allow "&" and spaces in file names
152     ignoreerrors:      Do not stop on download errors.
153     force_generic_extractor: Force downloader to use the generic extractor
154     nooverwrites:      Prevent overwriting files.
155     playliststart:     Playlist item to start at.
156     playlistend:       Playlist item to end at.
157     playlist_items:    Specific indices of playlist to download.
158     playlistreverse:   Download playlist items in reverse order.
159     matchtitle:        Download only matching titles.
160     rejecttitle:       Reject downloads for matching titles.
161     logger:            Log messages to a logging.Logger instance.
162     logtostderr:       Log messages to stderr instead of stdout.
163     writedescription:  Write the video description to a .description file
164     writeinfojson:     Write the video description to a .info.json file
165     writeannotations:  Write the video annotations to a .annotations.xml file
166     writethumbnail:    Write the thumbnail image to a file
167     write_all_thumbnails:  Write all thumbnail formats to files
168     writesubtitles:    Write the video subtitles to a file
169     writeautomaticsub: Write the automatically generated subtitles to a file
170     allsubtitles:      Downloads all the subtitles of the video
171                        (requires writesubtitles or writeautomaticsub)
172     listsubtitles:     Lists all available subtitles for the video
173     subtitlesformat:   The format code for subtitles
174     subtitleslangs:    List of languages of the subtitles to download
175     keepvideo:         Keep the video file after post-processing
176     daterange:         A DateRange object, download only if the upload_date is in the range.
177     skip_download:     Skip the actual download of the video file
178     cachedir:          Location of the cache files in the filesystem.
179                        False to disable filesystem cache.
180     noplaylist:        Download single video instead of a playlist if in doubt.
181     age_limit:         An integer representing the user's age in years.
182                        Unsuitable videos for the given age are skipped.
183     min_views:         An integer representing the minimum view count the video
184                        must have in order to not be skipped.
185                        Videos without view count information are always
186                        downloaded. None for no limit.
187     max_views:         An integer representing the maximum view count.
188                        Videos that are more popular than that are not
189                        downloaded.
190                        Videos without view count information are always
191                        downloaded. None for no limit.
192     download_archive:  File name of a file where all downloads are recorded.
193                        Videos already present in the file are not downloaded
194                        again.
195     cookiefile:        File name where cookies should be read from and dumped to.
196     nocheckcertificate:Do not verify SSL certificates
197     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
198                        At the moment, this is only supported by YouTube.
199     proxy:             URL of the proxy server to use
200     geo_verification_proxy:  URL of the proxy to use for IP address verification
201                        on geo-restricted sites. (Experimental)
202     socket_timeout:    Time to wait for unresponsive hosts, in seconds
203     bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
205     debug_printtraffic:Print out sent and received HTTP traffic
206     include_ads:       Download ads as well
207     default_search:    Prepend this string if an input url is not valid.
208                        'auto' for elaborate guessing
209     encoding:          Use this encoding instead of the system-specified.
210     extract_flat:      Do not resolve URLs, return the immediate result.
211                        Pass in 'in_playlist' to only show this behavior for
212                        playlist items.
213     postprocessors:    A list of dictionaries, each with an entry
214                        * key:  The name of the postprocessor. See
215                                youtube_dl/postprocessor/__init__.py for a list.
216                        as well as any further keyword arguments for the
217                        postprocessor.
218     progress_hooks:    A list of functions that get called on download
219                        progress, with a dictionary with the entries
220                        * status: One of "downloading", "error", or "finished".
221                                  Check this first and ignore unknown values.
222
223                        If status is one of "downloading", or "finished", the
224                        following properties may also be present:
225                        * filename: The final filename (always present)
226                        * tmpfilename: The filename we're currently writing to
227                        * downloaded_bytes: Bytes on disk
228                        * total_bytes: Size of the whole file, None if unknown
229                        * total_bytes_estimate: Guess of the eventual file size,
230                                                None if unavailable.
231                        * elapsed: The number of seconds since download started.
232                        * eta: The estimated time in seconds, None if unknown
233                        * speed: The download speed in bytes/second, None if
234                                 unknown
235                        * fragment_index: The counter of the currently
236                                          downloaded video fragment.
237                        * fragment_count: The number of fragments (= individual
238                                          files that will be merged)
239
240                        Progress hooks are guaranteed to be called at least once
241                        (with status "finished") if the download is successful.
242     merge_output_format: Extension to use when merging formats.
243     fixup:             Automatically correct known faults of the file.
244                        One of:
245                        - "never": do nothing
246                        - "warn": only emit a warning
247                        - "detect_or_warn": check whether we can do anything
248                                            about it, warn otherwise (default)
249     source_address:    (Experimental) Client-side IP address to bind to.
250     call_home:         Boolean, true iff we are allowed to contact the
251                        youtube-dl servers for debugging.
252     sleep_interval:    Number of seconds to sleep before each download.
253     listformats:       Print an overview of available video formats and exit.
254     list_thumbnails:   Print a table of all thumbnails and exit.
255     match_filter:      A function that gets called with the info_dict of
256                        every video.
257                        If it returns a message, the video is ignored.
258                        If it returns None, the video is downloaded.
259                        match_filter_func in utils.py is one example for this.
260     no_color:          Do not emit color codes in output.
261
262     The following options determine which downloader is picked:
263     external_downloader: Executable of the external downloader to call.
264                        None or unset for standard (built-in) downloader.
265     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
266                        if True, otherwise use ffmpeg/avconv if False, otherwise
267                        use downloader suggested by extractor if None.
268
269     The following parameters are not used by YoutubeDL itself, they are used by
270     the downloader (see youtube_dl/downloader/common.py):
271     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
272     noresizebuffer, retries, continuedl, noprogress, consoletitle,
273     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
274
275     The following options are used by the post processors:
276     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
277                        otherwise prefer avconv.
278     postprocessor_args: A list of additional command-line arguments for the
279                         postprocessor.
280     """
281
    # Class-level defaults; every one of these is replaced with a real,
    # per-instance value in __init__.
    params = None             # option dictionary driving all behavior
    _ies = []                 # registered InfoExtractors (classes or instances)
    _pps = []                 # registered post-processors, run in order
    _download_retcode = None  # process exit code accumulated across downloads
    _num_downloads = None     # download counter (feeds %(autonumber)s)
    _screen_file = None       # stream for screen output (stdout or stderr)
288
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params:    dict of options (see the class docstring); not copied, but
                   merged into self.params on top of the built-in defaults.
        auto_init: when True, print the debug header and register all default
                   InfoExtractors immediately.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # logtostderr redirects all screen output to stderr (bool indexes the pair)
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        # Backwards compatibility: map the deprecated option onto its successor
        # unless the user already set the new one explicitly.
        if self.params.get('cn_verification_proxy') is not None:
            self.report_warning('--cn-verification-proxy is deprecated. Use --geo-verification-proxy instead.')
            if self.params.get('geo_verification_proxy') is None:
                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']

        if params.get('bidi_workaround', False):
            # Pipe all screen output through an external bidi reordering tool
            # (bidiv, falling back to fribidi) via a pty, so RTL text renders
            # correctly in terminals that lack bidi support.
            try:
                import pty
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == errno.ENOENT:
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.version_info >= (3,) and sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # On Python 3, the Unicode filesystem API will throw errors (#1474)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        # Build the urllib opener (proxies, cookies, TLS settings, ...)
        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate configured post-processors; each dict's 'key' names the
        # PP class, the remaining entries are its keyword arguments.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)

        register_socks_protocols()
372
373     def warn_if_short_id(self, argv):
374         # short YouTube ID starting with dash?
375         idxs = [
376             i for i, a in enumerate(argv)
377             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
378         if idxs:
379             correct_argv = (
380                 ['youtube-dl'] +
381                 [a for i, a in enumerate(argv) if i not in idxs] +
382                 ['--'] + [argv[i] for i in idxs]
383             )
384             self.report_warning(
385                 'Long argument string detected. '
386                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
387                 args_to_str(correct_argv))
388
389     def add_info_extractor(self, ie):
390         """Add an InfoExtractor object to the end of the list."""
391         self._ies.append(ie)
392         if not isinstance(ie, type):
393             self._ies_instances[ie.ie_key()] = ie
394             ie.set_downloader(self)
395
396     def get_info_extractor(self, ie_key):
397         """
398         Get an instance of an IE with name ie_key, it will try to get one from
399         the _ies list, if there's no instance it will create a new one and add
400         it to the extractor list.
401         """
402         ie = self._ies_instances.get(ie_key)
403         if ie is None:
404             ie = get_info_extractor(ie_key)()
405             self.add_info_extractor(ie)
406         return ie
407
408     def add_default_info_extractors(self):
409         """
410         Add the InfoExtractors returned by gen_extractors to the end of the list
411         """
412         for ie in gen_extractor_classes():
413             self.add_info_extractor(ie)
414
415     def add_post_processor(self, pp):
416         """Add a PostProcessor object to the end of the chain."""
417         self._pps.append(pp)
418         pp.set_downloader(self)
419
420     def add_progress_hook(self, ph):
421         """Add the progress hook (currently only for the file downloader)"""
422         self._progress_hooks.append(ph)
423
424     def _bidi_workaround(self, message):
425         if not hasattr(self, '_output_channel'):
426             return message
427
428         assert hasattr(self, '_output_process')
429         assert isinstance(message, compat_str)
430         line_count = message.count('\n') + 1
431         self._output_process.stdin.write((message + '\n').encode('utf-8'))
432         self._output_process.stdin.flush()
433         res = ''.join(self._output_channel.readline().decode('utf-8')
434                       for _ in range(line_count))
435         return res[:-len('\n')]
436
437     def to_screen(self, message, skip_eol=False):
438         """Print message to stdout if not in quiet mode."""
439         return self.to_stdout(message, skip_eol, check_quiet=True)
440
441     def _write_string(self, s, out=None):
442         write_string(s, out=out, encoding=self.params.get('encoding'))
443
444     def to_stdout(self, message, skip_eol=False, check_quiet=False):
445         """Print message to stdout if not in quiet mode."""
446         if self.params.get('logger'):
447             self.params['logger'].debug(message)
448         elif not check_quiet or not self.params.get('quiet', False):
449             message = self._bidi_workaround(message)
450             terminator = ['\n', ''][skip_eol]
451             output = message + terminator
452
453             self._write_string(output, self._screen_file)
454
455     def to_stderr(self, message):
456         """Print message to stderr."""
457         assert isinstance(message, compat_str)
458         if self.params.get('logger'):
459             self.params['logger'].error(message)
460         else:
461             message = self._bidi_workaround(message)
462             output = message + '\n'
463             self._write_string(output, self._err_file)
464
465     def to_console_title(self, message):
466         if not self.params.get('consoletitle', False):
467             return
468         if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
469             # c_wchar_p() might not be necessary if `message` is
470             # already of type unicode()
471             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
472         elif 'TERM' in os.environ:
473             self._write_string('\033]0;%s\007' % message, self._screen_file)
474
475     def save_console_title(self):
476         if not self.params.get('consoletitle', False):
477             return
478         if 'TERM' in os.environ:
479             # Save the title on stack
480             self._write_string('\033[22;0t', self._screen_file)
481
482     def restore_console_title(self):
483         if not self.params.get('consoletitle', False):
484             return
485         if 'TERM' in os.environ:
486             # Restore the title from stack
487             self._write_string('\033[23;0t', self._screen_file)
488
489     def __enter__(self):
490         self.save_console_title()
491         return self
492
493     def __exit__(self, *args):
494         self.restore_console_title()
495
496         if self.params.get('cookiefile') is not None:
497             self.cookiejar.save()
498
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Prepend the traceback of a wrapped exception (e.g. an
                    # ExtractorError carrying the original cause in .exc_info)
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: report the current call stack
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Prefer the wrapped exception's exc_info so the raised
            # DownloadError points at the true cause, not the wrapper
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # ignoreerrors: record failure in the retcode instead of raising
        self._download_retcode = 1
528
529     def report_warning(self, message):
530         '''
531         Print the message to stderr, it will be prefixed with 'WARNING:'
532         If stderr is a tty file the 'WARNING:' will be colored
533         '''
534         if self.params.get('logger') is not None:
535             self.params['logger'].warning(message)
536         else:
537             if self.params.get('no_warnings'):
538                 return
539             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
540                 _msg_header = '\033[0;33mWARNING:\033[0m'
541             else:
542                 _msg_header = 'WARNING:'
543             warning_message = '%s %s' % (_msg_header, message)
544             self.to_stderr(warning_message)
545
546     def report_error(self, message, tb=None):
547         '''
548         Do the same as trouble, but prefixes the message with 'ERROR:', colored
549         in red if stderr is a tty file.
550         '''
551         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
552             _msg_header = '\033[0;31mERROR:\033[0m'
553         else:
554             _msg_header = 'ERROR:'
555         error_message = '%s %s' % (_msg_header, message)
556         self.trouble(error_message, tb)
557
558     def report_file_already_downloaded(self, file_name):
559         """Report file has already been fully downloaded."""
560         try:
561             self.to_screen('[download] %s has already been downloaded' % file_name)
562         except UnicodeEncodeError:
563             self.to_screen('[download] The file has already been downloaded')
564
    def prepare_filename(self, info_dict):
        """Generate the output filename.

        Expands the user's outtmpl against a sanitized copy of info_dict and
        returns the resulting path, or None if the template was invalid.
        """
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            autonumber_templ = '%0' + str(autonumber_size) + 'd'
            template_dict['autonumber'] = autonumber_templ % self._num_downloads
            # Zero-pad the playlist index to the width of the playlist length
            if template_dict.get('playlist_index') is not None:
                template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
            # Synthesize a resolution string from width/height when absent
            if template_dict.get('resolution') is None:
                if template_dict.get('width') and template_dict.get('height'):
                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
                elif template_dict.get('height'):
                    template_dict['resolution'] = '%sp' % template_dict['height']
                elif template_dict.get('width'):
                    template_dict['resolution'] = '%dx?' % template_dict['width']

            # Sanitize every scalar value for filesystem use; the id field is
            # treated specially so its characters survive sanitization
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id'))
            template_dict = dict((k, sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None and not isinstance(v, (list, tuple, dict)))
            # Any field the template references but the dict lacks becomes 'NA'
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)

            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
            tmpl = compat_expanduser(outtmpl)
            filename = tmpl % template_dict
            # Temporary fix for #4787
            # 'Treat' all problem characters by passing filename through preferredencoding
            # to workaround encoding issues with subprocess on python2 @ Windows
            if sys.version_info < (3, 0) and sys.platform == 'win32':
                filename = encodeFilename(filename, True).decode(preferredencoding())
            return sanitize_path(filename)
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
607
608     def _match_entry(self, info_dict, incomplete):
609         """ Returns None iff the file should be downloaded """
610
611         video_title = info_dict.get('title', info_dict.get('id', 'video'))
612         if 'title' in info_dict:
613             # This can happen when we're just evaluating the playlist
614             title = info_dict['title']
615             matchtitle = self.params.get('matchtitle', False)
616             if matchtitle:
617                 if not re.search(matchtitle, title, re.IGNORECASE):
618                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
619             rejecttitle = self.params.get('rejecttitle', False)
620             if rejecttitle:
621                 if re.search(rejecttitle, title, re.IGNORECASE):
622                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
623         date = info_dict.get('upload_date')
624         if date is not None:
625             dateRange = self.params.get('daterange', DateRange())
626             if date not in dateRange:
627                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
628         view_count = info_dict.get('view_count')
629         if view_count is not None:
630             min_views = self.params.get('min_views')
631             if min_views is not None and view_count < min_views:
632                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
633             max_views = self.params.get('max_views')
634             if max_views is not None and view_count > max_views:
635                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
636         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
637             return 'Skipping "%s" because it is age restricted' % video_title
638         if self.in_download_archive(info_dict):
639             return '%s has already been recorded in archive' % video_title
640
641         if not incomplete:
642             match_filter = self.params.get('match_filter')
643             if match_filter is not None:
644                 ret = match_filter(info_dict)
645                 if ret is not None:
646                     return ret
647
648         return None
649
650     @staticmethod
651     def add_extra_info(info_dict, extra_info):
652         '''Set the keys from extra_info in info dict if they are missing'''
653         for key, value in extra_info.items():
654             info_dict.setdefault(key, value)
655
656     def extract_info(self, url, download=True, ie_key=None, extra_info={},
657                      process=True, force_generic_extractor=False):
658         '''
659         Returns a list with a dictionary for each video we find.
660         If 'download', also downloads the videos.
661         extra_info is a dict containing the extra values to add to each result
662         '''
663
664         if not ie_key and force_generic_extractor:
665             ie_key = 'Generic'
666
667         if ie_key:
668             ies = [self.get_info_extractor(ie_key)]
669         else:
670             ies = self._ies
671
672         for ie in ies:
673             if not ie.suitable(url):
674                 continue
675
676             ie = self.get_info_extractor(ie.ie_key())
677             if not ie.working():
678                 self.report_warning('The program functionality for this site has been marked as broken, '
679                                     'and will probably not work.')
680
681             try:
682                 ie_result = ie.extract(url)
683                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
684                     break
685                 if isinstance(ie_result, list):
686                     # Backwards compatibility: old IE result format
687                     ie_result = {
688                         '_type': 'compat_list',
689                         'entries': ie_result,
690                     }
691                 self.add_default_extra_info(ie_result, ie, url)
692                 if process:
693                     return self.process_ie_result(ie_result, download, extra_info)
694                 else:
695                     return ie_result
696             except ExtractorError as e:  # An error we somewhat expected
697                 self.report_error(compat_str(e), e.format_traceback())
698                 break
699             except MaxDownloadsReached:
700                 raise
701             except Exception as e:
702                 if self.params.get('ignoreerrors', False):
703                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
704                     break
705                 else:
706                     raise
707         else:
708             self.report_error('no suitable InfoExtractor for URL %s' % url)
709
710     def add_default_extra_info(self, ie_result, ie, url):
711         self.add_extra_info(ie_result, {
712             'extractor': ie.IE_NAME,
713             'webpage_url': url,
714             'webpage_url_basename': url_basename(url),
715             'extractor_key': ie.ie_key(),
716         })
717
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.

        Dispatches on ie_result['_type'] (default 'video'):
          * 'video'           -- final result, handed to process_video_result()
          * 'url'             -- re-extracted via extract_info()
          * 'url_transparent' -- re-extracted, but fields from the embedding
                                 page override the target page's fields
          * 'playlist' / 'multi_video' -- every entry is processed recursively
          * 'compat_list'     -- legacy list-of-results format
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            extract_flat = self.params.get('extract_flat', False)
            # With --flat-playlist ('in_playlist') only flatten entries that
            # actually came from a playlist; True flattens unconditionally.
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # Non-None fields from the embedding page take precedence over
            # the target page, except the routing fields below which would
            # otherwise cause an infinite re-dispatch.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # The merged result must not be transparent again (see above)
            assert new_result.get('_type') != 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist' or result_type == 'multi_video':
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # --playlist-start is 1-based on the command line; 0-based here
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                # Expand a --playlist-items spec such as '1-3,7' into the
                # 1-based indices it denotes (lazily, as a generator).
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            # Entries may be a plain list, a lazily-fetched PagedList, or
            # any other iterable (e.g. a generator) — slice accordingly.
            ie_entries = ie_result['entries']
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    # Silently drop requested indices that are out of range
                    entries = [
                        ie_entries[i - 1] for i in playlistitems
                        if -n_all_entries <= i - 1 < n_all_entries]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        # getslice fetches only the requested page range
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                if playlistitems:
                    # Generic iterables are not indexable: materialize first
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # Per-entry extra_info: records the entry's playlist context
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: only the filters that make sense before
                # full extraction (e.g. title matching) are applied here
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            # Legacy entries lack the bookkeeping fields; backfill them from
            # the enclosing result before recursing.
            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
882
883     def _build_format_filter(self, filter_spec):
884         " Returns a function to filter the formats according to the filter_spec "
885
886         OPERATORS = {
887             '<': operator.lt,
888             '<=': operator.le,
889             '>': operator.gt,
890             '>=': operator.ge,
891             '=': operator.eq,
892             '!=': operator.ne,
893         }
894         operator_rex = re.compile(r'''(?x)\s*
895             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
896             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
897             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
898             $
899             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
900         m = operator_rex.search(filter_spec)
901         if m:
902             try:
903                 comparison_value = int(m.group('value'))
904             except ValueError:
905                 comparison_value = parse_filesize(m.group('value'))
906                 if comparison_value is None:
907                     comparison_value = parse_filesize(m.group('value') + 'B')
908                 if comparison_value is None:
909                     raise ValueError(
910                         'Invalid value %r in format specification %r' % (
911                             m.group('value'), filter_spec))
912             op = OPERATORS[m.group('op')]
913
914         if not m:
915             STR_OPERATORS = {
916                 '=': operator.eq,
917                 '!=': operator.ne,
918                 '^=': lambda attr, value: attr.startswith(value),
919                 '$=': lambda attr, value: attr.endswith(value),
920                 '*=': lambda attr, value: value in attr,
921             }
922             str_operator_rex = re.compile(r'''(?x)
923                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
924                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
925                 \s*(?P<value>[a-zA-Z0-9._-]+)
926                 \s*$
927                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
928             m = str_operator_rex.search(filter_spec)
929             if m:
930                 comparison_value = m.group('value')
931                 op = STR_OPERATORS[m.group('op')]
932
933         if not m:
934             raise ValueError('Invalid filter specification %r' % filter_spec)
935
936         def _filter(f):
937             actual_value = f.get(m.group('key'))
938             if actual_value is None:
939                 return m.group('none_inclusive')
940             return op(actual_value, comparison_value)
941         return _filter
942
    def build_format_selector(self, format_spec):
        """Compile a -f format specification (e.g. 'bestvideo+bestaudio/best'
        or 'best[height<=720]') into a selector function.

        The returned callable takes a ctx dict (with at least 'formats' and
        'incomplete_formats' keys) and yields the chosen format dicts.

        Pipeline: the spec is tokenized with the stdlib tokenizer, parsed
        into a tree of FormatSelector nodes, and each node is turned into a
        nested selector function.
        """
        def syntax_error(note, start):
            # start is a tokenizer (row, col) tuple; point a caret at col
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Selector node types: '/' fallback, '+' merge, bare name, '(...)'
        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        def _parse_filter(tokens):
            # Consume tokens up to the closing ']' and return the raw filter
            # string (compiled later by _build_format_filter)
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    # Flush any pending joined string before the bracket
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Accumulate adjacent NAME/NUMBER/other-OP tokens into a
                    # single joined NAME (handles ids like 'hls-1080p')
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parser; the inside_* flags mark which
            # construct we are nested in so we know when to stop
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        # '/' and ',' bind looser than '+': let the caller see it
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        # ',' concatenates independent selectors
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        # 'a/b': use b only if a yields nothing
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        # A bare filter like '[height<=720]' implies 'best'
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        # 'video+audio' merge of two selectors
                        video_selector = current_selector
                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
                        if not video_selector or not audio_selector:
                            raise syntax_error('"+" must be between two format selectors', start)
                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _build_selector_function(selector):
            # Turn a FormatSelector node (or a list of them, for ',') into a
            # function(ctx) yielding format dicts
            if isinstance(selector, list):
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(ctx):
                    # ',' — chain the output of every sub-selector
                    for f in fs:
                        for format in f(ctx):
                            yield format
                return selector_function
            elif selector.type == GROUP:
                selector_function = _build_selector_function(selector.selector)
            elif selector.type == PICKFIRST:
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(ctx):
                    # '/' — return the first alternative that yields anything
                    for f in fs:
                        picked_formats = list(f(ctx))
                        if picked_formats:
                            return picked_formats
                    return []
            elif selector.type == SINGLE:
                format_spec = selector.selector

                def selector_function(ctx):
                    # NOTE: formats are assumed sorted worst-first, so [-1]
                    # is 'best' and [0] is 'worst' throughout this function
                    formats = list(ctx['formats'])
                    if not formats:
                        return
                    if format_spec == 'all':
                        for f in formats:
                            yield f
                    elif format_spec in ['best', 'worst', None]:
                        format_idx = 0 if format_spec == 'worst' else -1
                        audiovideo_formats = [
                            f for f in formats
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            yield audiovideo_formats[format_idx]
                        # for extractors with incomplete formats (audio only (soundcloud)
                        # or video only (imgur)) we will fallback to best/worst
                        # {video,audio}-only format
                        elif ctx['incomplete_formats']:
                            yield formats[format_idx]
                    elif format_spec == 'bestaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[-1]
                    elif format_spec == 'worstaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[0]
                    elif format_spec == 'bestvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[-1]
                    elif format_spec == 'worstvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[0]
                    else:
                        # Anything else is an extension or an exact format_id
                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                        if format_spec in extensions:
                            filter_f = lambda f: f['ext'] == format_spec
                        else:
                            filter_f = lambda f: f['format_id'] == format_spec
                        matches = list(filter(filter_f, formats))
                        if matches:
                            yield matches[-1]
            elif selector.type == MERGE:
                def _merge(formats_info):
                    # Merge a (video, audio) pair into one synthetic format
                    # carrying 'requested_formats' for the downloader
                    format_1, format_2 = [f['format_id'] for f in formats_info]
                    # The first format must contain the video and the
                    # second the audio
                    if formats_info[0].get('vcodec') == 'none':
                        self.report_error('The first format must '
                                          'contain the video, try using '
                                          '"-f %s+%s"' % (format_2, format_1))
                        return
                    # Formats must be opposite (video+audio)
                    if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
                        self.report_error(
                            'Both formats %s and %s are video-only, you must specify "-f video+audio"'
                            % (format_1, format_2))
                        return
                    output_ext = (
                        formats_info[0]['ext']
                        if self.params.get('merge_output_format') is None
                        else self.params['merge_output_format'])
                    return {
                        'requested_formats': formats_info,
                        'format': '%s+%s' % (formats_info[0].get('format'),
                                             formats_info[1].get('format')),
                        'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                formats_info[1].get('format_id')),
                        'width': formats_info[0].get('width'),
                        'height': formats_info[0].get('height'),
                        'resolution': formats_info[0].get('resolution'),
                        'fps': formats_info[0].get('fps'),
                        'vcodec': formats_info[0].get('vcodec'),
                        'vbr': formats_info[0].get('vbr'),
                        'stretched_ratio': formats_info[0].get('stretched_ratio'),
                        'acodec': formats_info[1].get('acodec'),
                        'abr': formats_info[1].get('abr'),
                        'ext': output_ext,
                    }
                video_selector, audio_selector = map(_build_selector_function, selector.selector)

                def selector_function(ctx):
                    # Deep-copy ctx per branch so neither side's filters can
                    # affect the formats seen by the other
                    for pair in itertools.product(
                            video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
                        yield _merge(pair)

            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(ctx):
                # Filter a deep copy so that filters applied here do not leak
                # into sibling selectors sharing the same ctx (presumably the
                # fix for format selection with filters, #10083 — see history)
                ctx_copy = copy.deepcopy(ctx)
                for _filter in filters:
                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
                return selector_function(ctx_copy)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Iterator over the token list with one-token pushback
            # (restore_last_token), needed by the parser's lookahead
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            next = __next__  # Python 2 iterator protocol compatibility

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
1207
1208     def _calc_headers(self, info_dict):
1209         res = std_headers.copy()
1210
1211         add_headers = info_dict.get('http_headers')
1212         if add_headers:
1213             res.update(add_headers)
1214
1215         cookies = self._calc_cookies(info_dict)
1216         if cookies:
1217             res['Cookie'] = cookies
1218
1219         return res
1220
1221     def _calc_cookies(self, info_dict):
1222         pr = sanitized_Request(info_dict['url'])
1223         self.cookiejar.add_cookie_header(pr)
1224         return pr.get_header('Cookie')
1225
1226     def process_video_result(self, info_dict, download=True):
1227         assert info_dict.get('_type', 'video') == 'video'
1228
1229         if 'id' not in info_dict:
1230             raise ExtractorError('Missing "id" field in extractor result')
1231         if 'title' not in info_dict:
1232             raise ExtractorError('Missing "title" field in extractor result')
1233
1234         if not isinstance(info_dict['id'], compat_str):
1235             self.report_warning('"id" field is not a string - forcing string conversion')
1236             info_dict['id'] = compat_str(info_dict['id'])
1237
1238         if 'playlist' not in info_dict:
1239             # It isn't part of a playlist
1240             info_dict['playlist'] = None
1241             info_dict['playlist_index'] = None
1242
1243         thumbnails = info_dict.get('thumbnails')
1244         if thumbnails is None:
1245             thumbnail = info_dict.get('thumbnail')
1246             if thumbnail:
1247                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1248         if thumbnails:
1249             thumbnails.sort(key=lambda t: (
1250                 t.get('preference'), t.get('width'), t.get('height'),
1251                 t.get('id'), t.get('url')))
1252             for i, t in enumerate(thumbnails):
1253                 t['url'] = sanitize_url(t['url'])
1254                 if t.get('width') and t.get('height'):
1255                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
1256                 if t.get('id') is None:
1257                     t['id'] = '%d' % i
1258
1259         if self.params.get('list_thumbnails'):
1260             self.list_thumbnails(info_dict)
1261             return
1262
1263         thumbnail = info_dict.get('thumbnail')
1264         if thumbnail:
1265             info_dict['thumbnail'] = sanitize_url(thumbnail)
1266         elif thumbnails:
1267             info_dict['thumbnail'] = thumbnails[-1]['url']
1268
1269         if 'display_id' not in info_dict and 'id' in info_dict:
1270             info_dict['display_id'] = info_dict['id']
1271
1272         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1273             # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1274             # see http://bugs.python.org/issue1646728)
1275             try:
1276                 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1277                 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1278             except (ValueError, OverflowError, OSError):
1279                 pass
1280
1281         # Auto generate title fields corresponding to the *_number fields when missing
1282         # in order to always have clean titles. This is very common for TV series.
1283         for field in ('chapter', 'season', 'episode'):
1284             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1285                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1286
1287         subtitles = info_dict.get('subtitles')
1288         if subtitles:
1289             for _, subtitle in subtitles.items():
1290                 for subtitle_format in subtitle:
1291                     if subtitle_format.get('url'):
1292                         subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1293                     if 'ext' not in subtitle_format:
1294                         subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1295
1296         if self.params.get('listsubtitles', False):
1297             if 'automatic_captions' in info_dict:
1298                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1299             self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1300             return
1301         info_dict['requested_subtitles'] = self.process_subtitles(
1302             info_dict['id'], subtitles,
1303             info_dict.get('automatic_captions'))
1304
1305         # We now pick which formats have to be downloaded
1306         if info_dict.get('formats') is None:
1307             # There's only one format available
1308             formats = [info_dict]
1309         else:
1310             formats = info_dict['formats']
1311
1312         if not formats:
1313             raise ExtractorError('No video formats found!')
1314
1315         formats_dict = {}
1316
1317         # We check that all the formats have the format and format_id fields
1318         for i, format in enumerate(formats):
1319             if 'url' not in format:
1320                 raise ExtractorError('Missing "url" key in result (index %d)' % i)
1321
1322             format['url'] = sanitize_url(format['url'])
1323
1324             if format.get('format_id') is None:
1325                 format['format_id'] = compat_str(i)
1326             else:
1327                 # Sanitize format_id from characters used in format selector expression
1328                 format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id'])
1329             format_id = format['format_id']
1330             if format_id not in formats_dict:
1331                 formats_dict[format_id] = []
1332             formats_dict[format_id].append(format)
1333
1334         # Make sure all formats have unique format_id
1335         for format_id, ambiguous_formats in formats_dict.items():
1336             if len(ambiguous_formats) > 1:
1337                 for i, format in enumerate(ambiguous_formats):
1338                     format['format_id'] = '%s-%d' % (format_id, i)
1339
1340         for i, format in enumerate(formats):
1341             if format.get('format') is None:
1342                 format['format'] = '{id} - {res}{note}'.format(
1343                     id=format['format_id'],
1344                     res=self.format_resolution(format),
1345                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1346                 )
1347             # Automatically determine file extension if missing
1348             if 'ext' not in format:
1349                 format['ext'] = determine_ext(format['url']).lower()
1350             # Automatically determine protocol if missing (useful for format
1351             # selection purposes)
1352             if 'protocol' not in format:
1353                 format['protocol'] = determine_protocol(format)
1354             # Add HTTP headers, so that external programs can use them from the
1355             # json output
1356             full_format_info = info_dict.copy()
1357             full_format_info.update(format)
1358             format['http_headers'] = self._calc_headers(full_format_info)
1359
1360         # TODO Central sorting goes here
1361
1362         if formats[0] is not info_dict:
1363             # only set the 'formats' fields if the original info_dict list them
1364             # otherwise we end up with a circular reference, the first (and unique)
1365             # element in the 'formats' field in info_dict is info_dict itself,
1366             # which can't be exported to json
1367             info_dict['formats'] = formats
1368         if self.params.get('listformats'):
1369             self.list_formats(info_dict)
1370             return
1371
1372         req_format = self.params.get('format')
1373         if req_format is None:
1374             req_format_list = []
1375             if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
1376                     not info_dict.get('is_live')):
1377                 merger = FFmpegMergerPP(self)
1378                 if merger.available and merger.can_merge():
1379                     req_format_list.append('bestvideo+bestaudio')
1380             req_format_list.append('best')
1381             req_format = '/'.join(req_format_list)
1382         format_selector = self.build_format_selector(req_format)
1383
1384         # While in format selection we may need to have an access to the original
1385         # format set in order to calculate some metrics or do some processing.
1386         # For now we need to be able to guess whether original formats provided
1387         # by extractor are incomplete or not (i.e. whether extractor provides only
1388         # video-only or audio-only formats) for proper formats selection for
1389         # extractors with such incomplete formats (see
1390         # https://github.com/rg3/youtube-dl/pull/5556).
1391         # Since formats may be filtered during format selection and may not match
1392         # the original formats the results may be incorrect. Thus original formats
1393         # or pre-calculated metrics should be passed to format selection routines
1394         # as well.
1395         # We will pass a context object containing all necessary additional data
1396         # instead of just formats.
1397         # This fixes incorrect format selection issue (see
1398         # https://github.com/rg3/youtube-dl/issues/10083).
1399         incomplete_formats = all(
1400             # All formats are video-only or
1401             f.get('vcodec') != 'none' and f.get('acodec') == 'none' or
1402             # all formats are audio-only
1403             f.get('vcodec') == 'none' and f.get('acodec') != 'none'
1404             for f in formats)
1405
1406         ctx = {
1407             'formats': formats,
1408             'incomplete_formats': incomplete_formats,
1409         }
1410
1411         formats_to_download = list(format_selector(ctx))
1412         if not formats_to_download:
1413             raise ExtractorError('requested format not available',
1414                                  expected=True)
1415
1416         if download:
1417             if len(formats_to_download) > 1:
1418                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1419             for format in formats_to_download:
1420                 new_info = dict(info_dict)
1421                 new_info.update(format)
1422                 self.process_info(new_info)
1423         # We update the info dict with the best quality format (backwards compatibility)
1424         info_dict.update(formats_to_download[-1])
1425         return info_dict
1426
1427     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1428         """Select the requested subtitles and their format"""
1429         available_subs = {}
1430         if normal_subtitles and self.params.get('writesubtitles'):
1431             available_subs.update(normal_subtitles)
1432         if automatic_captions and self.params.get('writeautomaticsub'):
1433             for lang, cap_info in automatic_captions.items():
1434                 if lang not in available_subs:
1435                     available_subs[lang] = cap_info
1436
1437         if (not self.params.get('writesubtitles') and not
1438                 self.params.get('writeautomaticsub') or not
1439                 available_subs):
1440             return None
1441
1442         if self.params.get('allsubtitles', False):
1443             requested_langs = available_subs.keys()
1444         else:
1445             if self.params.get('subtitleslangs', False):
1446                 requested_langs = self.params.get('subtitleslangs')
1447             elif 'en' in available_subs:
1448                 requested_langs = ['en']
1449             else:
1450                 requested_langs = [list(available_subs.keys())[0]]
1451
1452         formats_query = self.params.get('subtitlesformat', 'best')
1453         formats_preference = formats_query.split('/') if formats_query else []
1454         subs = {}
1455         for lang in requested_langs:
1456             formats = available_subs.get(lang)
1457             if formats is None:
1458                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1459                 continue
1460             for ext in formats_preference:
1461                 if ext == 'best':
1462                     f = formats[-1]
1463                     break
1464                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1465                 if matches:
1466                     f = matches[-1]
1467                     break
1468             else:
1469                 f = formats[-1]
1470                 self.report_warning(
1471                     'No subtitle format found matching "%s" for language %s, '
1472                     'using %s' % (formats_query, lang, f['ext']))
1473             subs[lang] = f
1474         return subs
1475
1476     def process_info(self, info_dict):
1477         """Process a single resolved IE result."""
1478
1479         assert info_dict.get('_type', 'video') == 'video'
1480
1481         max_downloads = self.params.get('max_downloads')
1482         if max_downloads is not None:
1483             if self._num_downloads >= int(max_downloads):
1484                 raise MaxDownloadsReached()
1485
1486         info_dict['fulltitle'] = info_dict['title']
1487         if len(info_dict['title']) > 200:
1488             info_dict['title'] = info_dict['title'][:197] + '...'
1489
1490         if 'format' not in info_dict:
1491             info_dict['format'] = info_dict['ext']
1492
1493         reason = self._match_entry(info_dict, incomplete=False)
1494         if reason is not None:
1495             self.to_screen('[download] ' + reason)
1496             return
1497
1498         self._num_downloads += 1
1499
1500         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1501
1502         # Forced printings
1503         if self.params.get('forcetitle', False):
1504             self.to_stdout(info_dict['fulltitle'])
1505         if self.params.get('forceid', False):
1506             self.to_stdout(info_dict['id'])
1507         if self.params.get('forceurl', False):
1508             if info_dict.get('requested_formats') is not None:
1509                 for f in info_dict['requested_formats']:
1510                     self.to_stdout(f['url'] + f.get('play_path', ''))
1511             else:
1512                 # For RTMP URLs, also include the playpath
1513                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1514         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1515             self.to_stdout(info_dict['thumbnail'])
1516         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1517             self.to_stdout(info_dict['description'])
1518         if self.params.get('forcefilename', False) and filename is not None:
1519             self.to_stdout(filename)
1520         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1521             self.to_stdout(formatSeconds(info_dict['duration']))
1522         if self.params.get('forceformat', False):
1523             self.to_stdout(info_dict['format'])
1524         if self.params.get('forcejson', False):
1525             self.to_stdout(json.dumps(info_dict))
1526
1527         # Do nothing else if in simulate mode
1528         if self.params.get('simulate', False):
1529             return
1530
1531         if filename is None:
1532             return
1533
1534         try:
1535             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1536             if dn and not os.path.exists(dn):
1537                 os.makedirs(dn)
1538         except (OSError, IOError) as err:
1539             self.report_error('unable to create directory ' + error_to_compat_str(err))
1540             return
1541
1542         if self.params.get('writedescription', False):
1543             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1544             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1545                 self.to_screen('[info] Video description is already present')
1546             elif info_dict.get('description') is None:
1547                 self.report_warning('There\'s no description to write.')
1548             else:
1549                 try:
1550                     self.to_screen('[info] Writing video description to: ' + descfn)
1551                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1552                         descfile.write(info_dict['description'])
1553                 except (OSError, IOError):
1554                     self.report_error('Cannot write description file ' + descfn)
1555                     return
1556
1557         if self.params.get('writeannotations', False):
1558             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1559             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1560                 self.to_screen('[info] Video annotations are already present')
1561             else:
1562                 try:
1563                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1564                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1565                         annofile.write(info_dict['annotations'])
1566                 except (KeyError, TypeError):
1567                     self.report_warning('There are no annotations to write.')
1568                 except (OSError, IOError):
1569                     self.report_error('Cannot write annotations file: ' + annofn)
1570                     return
1571
1572         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1573                                        self.params.get('writeautomaticsub')])
1574
1575         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1576             # subtitles download errors are already managed as troubles in relevant IE
1577             # that way it will silently go on when used with unsupporting IE
1578             subtitles = info_dict['requested_subtitles']
1579             ie = self.get_info_extractor(info_dict['extractor_key'])
1580             for sub_lang, sub_info in subtitles.items():
1581                 sub_format = sub_info['ext']
1582                 if sub_info.get('data') is not None:
1583                     sub_data = sub_info['data']
1584                 else:
1585                     try:
1586                         sub_data = ie._download_webpage(
1587                             sub_info['url'], info_dict['id'], note=False)
1588                     except ExtractorError as err:
1589                         self.report_warning('Unable to download subtitle for "%s": %s' %
1590                                             (sub_lang, error_to_compat_str(err.cause)))
1591                         continue
1592                 try:
1593                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1594                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1595                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1596                     else:
1597                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1598                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
1599                             subfile.write(sub_data)
1600                 except (OSError, IOError):
1601                     self.report_error('Cannot write subtitles file ' + sub_filename)
1602                     return
1603
1604         if self.params.get('writeinfojson', False):
1605             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1606             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1607                 self.to_screen('[info] Video description metadata is already present')
1608             else:
1609                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1610                 try:
1611                     write_json_file(self.filter_requested_info(info_dict), infofn)
1612                 except (OSError, IOError):
1613                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1614                     return
1615
1616         self._write_thumbnails(info_dict, filename)
1617
1618         if not self.params.get('skip_download', False):
1619             try:
1620                 def dl(name, info):
1621                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1622                     for ph in self._progress_hooks:
1623                         fd.add_progress_hook(ph)
1624                     if self.params.get('verbose'):
1625                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1626                     return fd.download(name, info)
1627
1628                 if info_dict.get('requested_formats') is not None:
1629                     downloaded = []
1630                     success = True
1631                     merger = FFmpegMergerPP(self)
1632                     if not merger.available:
1633                         postprocessors = []
1634                         self.report_warning('You have requested multiple '
1635                                             'formats but ffmpeg or avconv are not installed.'
1636                                             ' The formats won\'t be merged.')
1637                     else:
1638                         postprocessors = [merger]
1639
1640                     def compatible_formats(formats):
1641                         video, audio = formats
1642                         # Check extension
1643                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1644                         if video_ext and audio_ext:
1645                             COMPATIBLE_EXTS = (
1646                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
1647                                 ('webm')
1648                             )
1649                             for exts in COMPATIBLE_EXTS:
1650                                 if video_ext in exts and audio_ext in exts:
1651                                     return True
1652                         # TODO: Check acodec/vcodec
1653                         return False
1654
1655                     filename_real_ext = os.path.splitext(filename)[1][1:]
1656                     filename_wo_ext = (
1657                         os.path.splitext(filename)[0]
1658                         if filename_real_ext == info_dict['ext']
1659                         else filename)
1660                     requested_formats = info_dict['requested_formats']
1661                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1662                         info_dict['ext'] = 'mkv'
1663                         self.report_warning(
1664                             'Requested formats are incompatible for merge and will be merged into mkv.')
1665                     # Ensure filename always has a correct extension for successful merge
1666                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1667                     if os.path.exists(encodeFilename(filename)):
1668                         self.to_screen(
1669                             '[download] %s has already been downloaded and '
1670                             'merged' % filename)
1671                     else:
1672                         for f in requested_formats:
1673                             new_info = dict(info_dict)
1674                             new_info.update(f)
1675                             fname = self.prepare_filename(new_info)
1676                             fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1677                             downloaded.append(fname)
1678                             partial_success = dl(fname, new_info)
1679                             success = success and partial_success
1680                         info_dict['__postprocessors'] = postprocessors
1681                         info_dict['__files_to_merge'] = downloaded
1682                 else:
1683                     # Just a single file
1684                     success = dl(filename, info_dict)
1685             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1686                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1687                 return
1688             except (OSError, IOError) as err:
1689                 raise UnavailableVideoError(err)
1690             except (ContentTooShortError, ) as err:
1691                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1692                 return
1693
1694             if success and filename != '-':
1695                 # Fixup content
1696                 fixup_policy = self.params.get('fixup')
1697                 if fixup_policy is None:
1698                     fixup_policy = 'detect_or_warn'
1699
1700                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1701
1702                 stretched_ratio = info_dict.get('stretched_ratio')
1703                 if stretched_ratio is not None and stretched_ratio != 1:
1704                     if fixup_policy == 'warn':
1705                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1706                             info_dict['id'], stretched_ratio))
1707                     elif fixup_policy == 'detect_or_warn':
1708                         stretched_pp = FFmpegFixupStretchedPP(self)
1709                         if stretched_pp.available:
1710                             info_dict.setdefault('__postprocessors', [])
1711                             info_dict['__postprocessors'].append(stretched_pp)
1712                         else:
1713                             self.report_warning(
1714                                 '%s: Non-uniform pixel ratio (%s). %s'
1715                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1716                     else:
1717                         assert fixup_policy in ('ignore', 'never')
1718
1719                 if (info_dict.get('requested_formats') is None and
1720                         info_dict.get('container') == 'm4a_dash'):
1721                     if fixup_policy == 'warn':
1722                         self.report_warning(
1723                             '%s: writing DASH m4a. '
1724                             'Only some players support this container.'
1725                             % info_dict['id'])
1726                     elif fixup_policy == 'detect_or_warn':
1727                         fixup_pp = FFmpegFixupM4aPP(self)
1728                         if fixup_pp.available:
1729                             info_dict.setdefault('__postprocessors', [])
1730                             info_dict['__postprocessors'].append(fixup_pp)
1731                         else:
1732                             self.report_warning(
1733                                 '%s: writing DASH m4a. '
1734                                 'Only some players support this container. %s'
1735                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1736                     else:
1737                         assert fixup_policy in ('ignore', 'never')
1738
1739                 if (info_dict.get('protocol') == 'm3u8_native' or
1740                         info_dict.get('protocol') == 'm3u8' and
1741                         self.params.get('hls_prefer_native')):
1742                     if fixup_policy == 'warn':
1743                         self.report_warning('%s: malformated aac bitstream.' % (
1744                             info_dict['id']))
1745                     elif fixup_policy == 'detect_or_warn':
1746                         fixup_pp = FFmpegFixupM3u8PP(self)
1747                         if fixup_pp.available:
1748                             info_dict.setdefault('__postprocessors', [])
1749                             info_dict['__postprocessors'].append(fixup_pp)
1750                         else:
1751                             self.report_warning(
1752                                 '%s: malformated aac bitstream. %s'
1753                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1754                     else:
1755                         assert fixup_policy in ('ignore', 'never')
1756
1757                 try:
1758                     self.post_process(filename, info_dict)
1759                 except (PostProcessingError) as err:
1760                     self.report_error('postprocessing: %s' % str(err))
1761                     return
1762                 self.record_download_archive(info_dict)
1763
1764     def download(self, url_list):
1765         """Download a given list of URLs."""
1766         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1767         if (len(url_list) > 1 and
1768                 '%' not in outtmpl and
1769                 self.params.get('max_downloads') != 1):
1770             raise SameFileError(outtmpl)
1771
1772         for url in url_list:
1773             try:
1774                 # It also downloads the videos
1775                 res = self.extract_info(
1776                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1777             except UnavailableVideoError:
1778                 self.report_error('unable to download video')
1779             except MaxDownloadsReached:
1780                 self.to_screen('[info] Maximum number of downloaded files reached.')
1781                 raise
1782             else:
1783                 if self.params.get('dump_single_json', False):
1784                     self.to_stdout(json.dumps(res))
1785
1786         return self._download_retcode
1787
1788     def download_with_info_file(self, info_filename):
1789         with contextlib.closing(fileinput.FileInput(
1790                 [info_filename], mode='r',
1791                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1792             # FileInput doesn't have a read method, we can't call json.load
1793             info = self.filter_requested_info(json.loads('\n'.join(f)))
1794         try:
1795             self.process_ie_result(info, download=True)
1796         except DownloadError:
1797             webpage_url = info.get('webpage_url')
1798             if webpage_url is not None:
1799                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1800                 return self.download([webpage_url])
1801             else:
1802                 raise
1803         return self._download_retcode
1804
1805     @staticmethod
1806     def filter_requested_info(info_dict):
1807         return dict(
1808             (k, v) for k, v in info_dict.items()
1809             if k not in ['requested_formats', 'requested_subtitles'])
1810
1811     def post_process(self, filename, ie_info):
1812         """Run all the postprocessors on the given file."""
1813         info = dict(ie_info)
1814         info['filepath'] = filename
1815         pps_chain = []
1816         if ie_info.get('__postprocessors') is not None:
1817             pps_chain.extend(ie_info['__postprocessors'])
1818         pps_chain.extend(self._pps)
1819         for pp in pps_chain:
1820             files_to_delete = []
1821             try:
1822                 files_to_delete, info = pp.run(info)
1823             except PostProcessingError as e:
1824                 self.report_error(e.msg)
1825             if files_to_delete and not self.params.get('keepvideo', False):
1826                 for old_filename in files_to_delete:
1827                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1828                     try:
1829                         os.remove(encodeFilename(old_filename))
1830                     except (IOError, OSError):
1831                         self.report_warning('Unable to remove downloaded original file')
1832
1833     def _make_archive_id(self, info_dict):
1834         # Future-proof against any change in case
1835         # and backwards compatibility with prior versions
1836         extractor = info_dict.get('extractor_key')
1837         if extractor is None:
1838             if 'id' in info_dict:
1839                 extractor = info_dict.get('ie_key')  # key in a playlist
1840         if extractor is None:
1841             return None  # Incomplete video information
1842         return extractor.lower() + ' ' + info_dict['id']
1843
1844     def in_download_archive(self, info_dict):
1845         fn = self.params.get('download_archive')
1846         if fn is None:
1847             return False
1848
1849         vid_id = self._make_archive_id(info_dict)
1850         if vid_id is None:
1851             return False  # Incomplete video information
1852
1853         try:
1854             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1855                 for line in archive_file:
1856                     if line.strip() == vid_id:
1857                         return True
1858         except IOError as ioe:
1859             if ioe.errno != errno.ENOENT:
1860                 raise
1861         return False
1862
1863     def record_download_archive(self, info_dict):
1864         fn = self.params.get('download_archive')
1865         if fn is None:
1866             return
1867         vid_id = self._make_archive_id(info_dict)
1868         assert vid_id
1869         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1870             archive_file.write(vid_id + '\n')
1871
1872     @staticmethod
1873     def format_resolution(format, default='unknown'):
1874         if format.get('vcodec') == 'none':
1875             return 'audio only'
1876         if format.get('resolution') is not None:
1877             return format['resolution']
1878         if format.get('height') is not None:
1879             if format.get('width') is not None:
1880                 res = '%sx%s' % (format['width'], format['height'])
1881             else:
1882                 res = '%sp' % format['height']
1883         elif format.get('width') is not None:
1884             res = '%dx?' % format['width']
1885         else:
1886             res = default
1887         return res
1888
    def _format_note(self, fdict):
        """Build the human-readable note column for one format dict.

        The note is accumulated piece by piece; the order of the checks
        below determines the order of the fields in the output, and the
        ``if res:`` guards only insert separators between non-empty parts.
        """
        res = ''
        # f4f/f4m (Adobe HDS) fragments are not directly playable
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            # total bitrate, right-aligned to 4 digits for table alignment
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None and
                fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            # '@' joins the codec name with the video bitrate appended below
            if fdict.get('vbr') is not None:
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # bitrates known but codec unknown: label the vbr explicitly
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                # left-pad codec names to 5 chars for column alignment
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            # audio sampling rate
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
1944
1945     def list_formats(self, info_dict):
1946         formats = info_dict.get('formats', [info_dict])
1947         table = [
1948             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
1949             for f in formats
1950             if f.get('preference') is None or f['preference'] >= -1000]
1951         if len(formats) > 1:
1952             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
1953
1954         header_line = ['format code', 'extension', 'resolution', 'note']
1955         self.to_screen(
1956             '[info] Available formats for %s:\n%s' %
1957             (info_dict['id'], render_table(header_line, table)))
1958
1959     def list_thumbnails(self, info_dict):
1960         thumbnails = info_dict.get('thumbnails')
1961         if not thumbnails:
1962             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
1963             return
1964
1965         self.to_screen(
1966             '[info] Thumbnails for %s:' % info_dict['id'])
1967         self.to_screen(render_table(
1968             ['ID', 'width', 'height', 'URL'],
1969             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1970
1971     def list_subtitles(self, video_id, subtitles, name='subtitles'):
1972         if not subtitles:
1973             self.to_screen('%s has no %s' % (video_id, name))
1974             return
1975         self.to_screen(
1976             'Available %s for %s:' % (name, video_id))
1977         self.to_screen(render_table(
1978             ['Language', 'formats'],
1979             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
1980                 for lang, formats in subtitles.items()]))
1981
1982     def urlopen(self, req):
1983         """ Start an HTTP download """
1984         if isinstance(req, compat_basestring):
1985             req = sanitized_Request(req)
1986         return self._opener.open(req, timeout=self._socket_timeout)
1987
    def print_debug_header(self):
        """Print version, encoding, environment and proxy diagnostics.

        No-op unless the 'verbose' option is set.  Performs network requests
        only when 'call_home' is enabled.
        """
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # sys.stdout may lack an 'encoding' attribute (e.g. when redirected
        # or replaced); report its type instead in that case.
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        # encoding=None so a misconfigured output encoding cannot make the
        # encoding report itself fail.
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        try:
            # Best effort: report the git commit when running from a checkout.
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            try:
                # sys.exc_clear() exists only on Python 2; ignore failures.
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        # Versions of the external programs youtube-dl may invoke.
        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Collect the effective proxy mapping from all installed handlers.
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # NOTE: these are real network requests, gated behind --call-home.
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2054
2055     def _setup_opener(self):
2056         timeout_val = self.params.get('socket_timeout')
2057         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2058
2059         opts_cookiefile = self.params.get('cookiefile')
2060         opts_proxy = self.params.get('proxy')
2061
2062         if opts_cookiefile is None:
2063             self.cookiejar = compat_cookiejar.CookieJar()
2064         else:
2065             opts_cookiefile = compat_expanduser(opts_cookiefile)
2066             self.cookiejar = compat_cookiejar.MozillaCookieJar(
2067                 opts_cookiefile)
2068             if os.access(opts_cookiefile, os.R_OK):
2069                 self.cookiejar.load()
2070
2071         cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2072         if opts_proxy is not None:
2073             if opts_proxy == '':
2074                 proxies = {}
2075             else:
2076                 proxies = {'http': opts_proxy, 'https': opts_proxy}
2077         else:
2078             proxies = compat_urllib_request.getproxies()
2079             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
2080             if 'http' in proxies and 'https' not in proxies:
2081                 proxies['https'] = proxies['http']
2082         proxy_handler = PerRequestProxyHandler(proxies)
2083
2084         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2085         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2086         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2087         data_handler = compat_urllib_request_DataHandler()
2088
2089         # When passing our own FileHandler instance, build_opener won't add the
2090         # default FileHandler and allows us to disable the file protocol, which
2091         # can be used for malicious purposes (see
2092         # https://github.com/rg3/youtube-dl/issues/8227)
2093         file_handler = compat_urllib_request.FileHandler()
2094
2095         def file_open(*args, **kwargs):
2096             raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
2097         file_handler.file_open = file_open
2098
2099         opener = compat_urllib_request.build_opener(
2100             proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
2101
2102         # Delete the default user-agent header, which would otherwise apply in
2103         # cases where our custom HTTP handler doesn't come into play
2104         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
2105         opener.addheaders = []
2106         self._opener = opener
2107
2108     def encode(self, s):
2109         if isinstance(s, bytes):
2110             return s  # Already encoded
2111
2112         try:
2113             return s.encode(self.get_encoding())
2114         except UnicodeEncodeError as err:
2115             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2116             raise
2117
2118     def get_encoding(self):
2119         encoding = self.params.get('encoding')
2120         if encoding is None:
2121             encoding = preferredencoding()
2122         return encoding
2123
2124     def _write_thumbnails(self, info_dict, filename):
2125         if self.params.get('writethumbnail', False):
2126             thumbnails = info_dict.get('thumbnails')
2127             if thumbnails:
2128                 thumbnails = [thumbnails[-1]]
2129         elif self.params.get('write_all_thumbnails', False):
2130             thumbnails = info_dict.get('thumbnails')
2131         else:
2132             return
2133
2134         if not thumbnails:
2135             # No thumbnails present, so return immediately
2136             return
2137
2138         for t in thumbnails:
2139             thumb_ext = determine_ext(t['url'], 'jpg')
2140             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2141             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2142             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2143
2144             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2145                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2146                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2147             else:
2148                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2149                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2150                 try:
2151                     uf = self.urlopen(t['url'])
2152                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2153                         shutil.copyfileobj(uf, thumbf)
2154                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2155                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2156                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2157                     self.report_warning('Unable to download thumbnail "%s": %s' %
2158                                         (t['url'], error_to_compat_str(err)))