[YoutubeDL] Sanitize final URLs (Closes #8991)
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import datetime
9 import errno
10 import fileinput
11 import io
12 import itertools
13 import json
14 import locale
15 import operator
16 import os
17 import platform
18 import re
19 import shutil
20 import subprocess
21 import socket
22 import sys
23 import time
24 import tokenize
25 import traceback
26
27 from .compat import (
28     compat_basestring,
29     compat_cookiejar,
30     compat_expanduser,
31     compat_get_terminal_size,
32     compat_http_client,
33     compat_kwargs,
34     compat_os_name,
35     compat_str,
36     compat_tokenize_tokenize,
37     compat_urllib_error,
38     compat_urllib_request,
39     compat_urllib_request_DataHandler,
40 )
41 from .utils import (
42     ContentTooShortError,
43     date_from_str,
44     DateRange,
45     DEFAULT_OUTTMPL,
46     determine_ext,
47     determine_protocol,
48     DownloadError,
49     encode_compat_str,
50     encodeFilename,
51     error_to_compat_str,
52     ExtractorError,
53     format_bytes,
54     formatSeconds,
55     locked_file,
56     make_HTTPS_handler,
57     MaxDownloadsReached,
58     PagedList,
59     parse_filesize,
60     PerRequestProxyHandler,
61     PostProcessingError,
62     platform_name,
63     preferredencoding,
64     render_table,
65     SameFileError,
66     sanitize_filename,
67     sanitize_path,
68     sanitize_url,
69     sanitized_Request,
70     std_headers,
71     subtitles_filename,
72     UnavailableVideoError,
73     url_basename,
74     version_tuple,
75     write_json_file,
76     write_string,
77     YoutubeDLCookieProcessor,
78     YoutubeDLHandler,
79     prepend_extension,
80     replace_extension,
81     args_to_str,
82     age_restricted,
83 )
84 from .cache import Cache
85 from .extractor import get_info_extractor, gen_extractors
86 from .downloader import get_suitable_downloader
87 from .downloader.rtmp import rtmpdump_version
88 from .postprocessor import (
89     FFmpegFixupM3u8PP,
90     FFmpegFixupM4aPP,
91     FFmpegFixupStretchedPP,
92     FFmpegMergerPP,
93     FFmpegPostProcessor,
94     get_postprocessor,
95 )
96 from .version import __version__
97
98 if compat_os_name == 'nt':
99     import ctypes
100
101
class YoutubeDL(object):
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, a task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forceid:           Force printing ID.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    forceduration:     Force printing duration.
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    simulate:          Do not download the video files.
    format:            Video format code. See options.py for more information.
    outtmpl:           Template for output names.
    restrictfilenames: Do not allow "&" and spaces in file names
    ignoreerrors:      Do not stop on download errors.
    force_generic_extractor: Force downloader to use the generic extractor
    nooverwrites:      Prevent overwriting files.
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    playlist_items:    Specific indices of playlist to download.
    playlistreverse:   Download playlist items in reverse order.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video description to a .info.json file
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    write_all_thumbnails:  Write all thumbnail formats to files
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatically generated subtitles to a file
    allsubtitles:      Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    cookiefile:        File name where cookies should be read from and dumped to.
    nocheckcertificate:Do not verify SSL certificates
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
    proxy:             URL of the proxy server to use
    cn_verification_proxy:  URL of the proxy to use for IP address verification
                       on Chinese sites. (Experimental)
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic:Print out sent and received HTTP traffic
    include_ads:       Download ads as well
    default_search:    Prepend this string if an input url is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.
    extract_flat:      Do not resolve URLs, return the immediate result.
                       Pass in 'in_playlist' to only show this behavior for
                       playlist items.
    postprocessors:    A list of dictionaries, each with an entry
                       * key:  The name of the postprocessor. See
                               youtube_dl/postprocessor/__init__.py for a list.
                       as well as any further keyword arguments for the
                       postprocessor.
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                                 Check this first and ignore unknown values.

                       If status is one of "downloading", or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                                               None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                                unknown
                       * fragment_index: The counter of the currently
                                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful.
    merge_output_format: Extension to use when merging formats.
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                                           about it, warn otherwise (default)
    source_address:    (Experimental) Client-side IP address to bind to.
    call_home:         Boolean, true iff we are allowed to contact the
                       youtube-dl servers for debugging.
    sleep_interval:    Number of seconds to sleep before each download.
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called with the info_dict of
                       every video.
                       If it returns a message, the video is ignored.
                       If it returns None, the video is downloaded.
                       match_filter_func in utils.py is one example for this.
    no_color:          Do not emit color codes in output.

    The following options determine which downloader is picked:
    external_downloader: Executable of the external downloader to call.
                       None or unset for standard (built-in) downloader.
    hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv.

    The following parameters are not used by YoutubeDL itself, they are used by
    the downloader (see youtube_dl/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
    noresizebuffer, retries, continuedl, noprogress, consoletitle,
    xattr_set_filesize, external_downloader_args, hls_use_mpegts.

    The following options are used by the post processors:
    prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
                       otherwise prefer avconv.
    postprocessor_args: A list of additional command-line arguments for the
                        postprocessor.
    """

    # Class-level placeholders; __init__ replaces all of these with
    # per-instance values, so they are never shared between instances.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None  # set to 1 by trouble() when a download problem occurs
    _num_downloads = None
    _screen_file = None  # stream for screen output: stdout, or stderr with logtostderr
284
285     def __init__(self, params=None, auto_init=True):
286         """Create a FileDownloader object with the given options."""
287         if params is None:
288             params = {}
289         self._ies = []
290         self._ies_instances = {}
291         self._pps = []
292         self._progress_hooks = []
293         self._download_retcode = 0
294         self._num_downloads = 0
295         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
296         self._err_file = sys.stderr
297         self.params = {
298             # Default parameters
299             'nocheckcertificate': False,
300         }
301         self.params.update(params)
302         self.cache = Cache(self)
303
304         if params.get('bidi_workaround', False):
305             try:
306                 import pty
307                 master, slave = pty.openpty()
308                 width = compat_get_terminal_size().columns
309                 if width is None:
310                     width_args = []
311                 else:
312                     width_args = ['-w', str(width)]
313                 sp_kwargs = dict(
314                     stdin=subprocess.PIPE,
315                     stdout=slave,
316                     stderr=self._err_file)
317                 try:
318                     self._output_process = subprocess.Popen(
319                         ['bidiv'] + width_args, **sp_kwargs
320                     )
321                 except OSError:
322                     self._output_process = subprocess.Popen(
323                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
324                 self._output_channel = os.fdopen(master, 'rb')
325             except OSError as ose:
326                 if ose.errno == 2:
327                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
328                 else:
329                     raise
330
331         if (sys.version_info >= (3,) and sys.platform != 'win32' and
332                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
333                 not params.get('restrictfilenames', False)):
334             # On Python 3, the Unicode filesystem API will throw errors (#1474)
335             self.report_warning(
336                 'Assuming --restrict-filenames since file system encoding '
337                 'cannot encode all characters. '
338                 'Set the LC_ALL environment variable to fix this.')
339             self.params['restrictfilenames'] = True
340
341         if isinstance(params.get('outtmpl'), bytes):
342             self.report_warning(
343                 'Parameter outtmpl is bytes, but should be a unicode string. '
344                 'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
345
346         self._setup_opener()
347
348         if auto_init:
349             self.print_debug_header()
350             self.add_default_info_extractors()
351
352         for pp_def_raw in self.params.get('postprocessors', []):
353             pp_class = get_postprocessor(pp_def_raw['key'])
354             pp_def = dict(pp_def_raw)
355             del pp_def['key']
356             pp = pp_class(self, **compat_kwargs(pp_def))
357             self.add_post_processor(pp)
358
359         for ph in self.params.get('progress_hooks', []):
360             self.add_progress_hook(ph)
361
362     def warn_if_short_id(self, argv):
363         # short YouTube ID starting with dash?
364         idxs = [
365             i for i, a in enumerate(argv)
366             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
367         if idxs:
368             correct_argv = (
369                 ['youtube-dl'] +
370                 [a for i, a in enumerate(argv) if i not in idxs] +
371                 ['--'] + [argv[i] for i in idxs]
372             )
373             self.report_warning(
374                 'Long argument string detected. '
375                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
376                 args_to_str(correct_argv))
377
378     def add_info_extractor(self, ie):
379         """Add an InfoExtractor object to the end of the list."""
380         self._ies.append(ie)
381         self._ies_instances[ie.ie_key()] = ie
382         ie.set_downloader(self)
383
384     def get_info_extractor(self, ie_key):
385         """
386         Get an instance of an IE with name ie_key, it will try to get one from
387         the _ies list, if there's no instance it will create a new one and add
388         it to the extractor list.
389         """
390         ie = self._ies_instances.get(ie_key)
391         if ie is None:
392             ie = get_info_extractor(ie_key)()
393             self.add_info_extractor(ie)
394         return ie
395
396     def add_default_info_extractors(self):
397         """
398         Add the InfoExtractors returned by gen_extractors to the end of the list
399         """
400         for ie in gen_extractors():
401             self.add_info_extractor(ie)
402
403     def add_post_processor(self, pp):
404         """Add a PostProcessor object to the end of the chain."""
405         self._pps.append(pp)
406         pp.set_downloader(self)
407
    def add_progress_hook(self, ph):
        """Add the progress hook (currently only for the file downloader)"""
        # ph is called with a status dictionary on every progress update;
        # see the 'progress_hooks' entry in the class docstring.
        self._progress_hooks.append(ph)
411
412     def _bidi_workaround(self, message):
413         if not hasattr(self, '_output_channel'):
414             return message
415
416         assert hasattr(self, '_output_process')
417         assert isinstance(message, compat_str)
418         line_count = message.count('\n') + 1
419         self._output_process.stdin.write((message + '\n').encode('utf-8'))
420         self._output_process.stdin.flush()
421         res = ''.join(self._output_channel.readline().decode('utf-8')
422                       for _ in range(line_count))
423         return res[:-len('\n')]
424
425     def to_screen(self, message, skip_eol=False):
426         """Print message to stdout if not in quiet mode."""
427         return self.to_stdout(message, skip_eol, check_quiet=True)
428
    def _write_string(self, s, out=None):
        # Thin wrapper around utils.write_string that applies the
        # user-selected output encoding (the 'encoding' option).
        write_string(s, out=out, encoding=self.params.get('encoding'))
431
432     def to_stdout(self, message, skip_eol=False, check_quiet=False):
433         """Print message to stdout if not in quiet mode."""
434         if self.params.get('logger'):
435             self.params['logger'].debug(message)
436         elif not check_quiet or not self.params.get('quiet', False):
437             message = self._bidi_workaround(message)
438             terminator = ['\n', ''][skip_eol]
439             output = message + terminator
440
441             self._write_string(output, self._screen_file)
442
443     def to_stderr(self, message):
444         """Print message to stderr."""
445         assert isinstance(message, compat_str)
446         if self.params.get('logger'):
447             self.params['logger'].error(message)
448         else:
449             message = self._bidi_workaround(message)
450             output = message + '\n'
451             self._write_string(output, self._err_file)
452
453     def to_console_title(self, message):
454         if not self.params.get('consoletitle', False):
455             return
456         if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
457             # c_wchar_p() might not be necessary if `message` is
458             # already of type unicode()
459             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
460         elif 'TERM' in os.environ:
461             self._write_string('\033]0;%s\007' % message, self._screen_file)
462
463     def save_console_title(self):
464         if not self.params.get('consoletitle', False):
465             return
466         if 'TERM' in os.environ:
467             # Save the title on stack
468             self._write_string('\033[22;0t', self._screen_file)
469
470     def restore_console_title(self):
471         if not self.params.get('consoletitle', False):
472             return
473         if 'TERM' in os.environ:
474             # Restore the title from stack
475             self._write_string('\033[23;0t', self._screen_file)
476
477     def __enter__(self):
478         self.save_console_title()
479         return self
480
481     def __exit__(self, *args):
482         self.restore_console_title()
483
484         if self.params.get('cookiefile') is not None:
485             self.cookiejar.save()
486
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Some wrapped exceptions (presumably ExtractorError) carry the
                    # original cause in an .exc_info attribute — prefer that inner
                    # traceback when present. TODO confirm against ExtractorError.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show the current call stack instead
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, preferring the wrapped cause's exc_info
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # With ignoreerrors set, record failure in the return code and continue
        self._download_retcode = 1
516
517     def report_warning(self, message):
518         '''
519         Print the message to stderr, it will be prefixed with 'WARNING:'
520         If stderr is a tty file the 'WARNING:' will be colored
521         '''
522         if self.params.get('logger') is not None:
523             self.params['logger'].warning(message)
524         else:
525             if self.params.get('no_warnings'):
526                 return
527             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
528                 _msg_header = '\033[0;33mWARNING:\033[0m'
529             else:
530                 _msg_header = 'WARNING:'
531             warning_message = '%s %s' % (_msg_header, message)
532             self.to_stderr(warning_message)
533
534     def report_error(self, message, tb=None):
535         '''
536         Do the same as trouble, but prefixes the message with 'ERROR:', colored
537         in red if stderr is a tty file.
538         '''
539         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
540             _msg_header = '\033[0;31mERROR:\033[0m'
541         else:
542             _msg_header = 'ERROR:'
543         error_message = '%s %s' % (_msg_header, message)
544         self.trouble(error_message, tb)
545
546     def report_file_already_downloaded(self, file_name):
547         """Report file has already been fully downloaded."""
548         try:
549             self.to_screen('[download] %s has already been downloaded' % file_name)
550         except UnicodeEncodeError:
551             self.to_screen('[download] The file has already been downloaded')
552
    def prepare_filename(self, info_dict):
        """Generate the output filename.

        Expands the 'outtmpl' option against a sanitized copy of
        info_dict; returns the sanitized path, or None when the template
        is malformed (a warning is reported in that case).
        """
        try:
            # Work on a copy so the caller's info_dict is untouched
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            autonumber_templ = '%0' + str(autonumber_size) + 'd'
            template_dict['autonumber'] = autonumber_templ % self._num_downloads
            if template_dict.get('playlist_index') is not None:
                # Zero-pad the index to the width of the playlist size
                template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
            if template_dict.get('resolution') is None:
                # Synthesize a resolution string from whatever dimensions are known
                if template_dict.get('width') and template_dict.get('height'):
                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
                elif template_dict.get('height'):
                    template_dict['resolution'] = '%sp' % template_dict['height']
                elif template_dict.get('width'):
                    template_dict['resolution'] = '%dx?' % template_dict['width']

            # Make every remaining value filename-safe; drop None values
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id'))
            template_dict = dict((k, sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None)
            # Any field the template references but we don't have becomes 'NA'
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)

            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
            tmpl = compat_expanduser(outtmpl)
            filename = tmpl % template_dict
            # Temporary fix for #4787
            # 'Treat' all problem characters by passing filename through preferredencoding
            # to workaround encoding issues with subprocess on python2 @ Windows
            if sys.version_info < (3, 0) and sys.platform == 'win32':
                filename = encodeFilename(filename, True).decode(preferredencoding())
            return sanitize_path(filename)
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
595
596     def _match_entry(self, info_dict, incomplete):
597         """ Returns None iff the file should be downloaded """
598
599         video_title = info_dict.get('title', info_dict.get('id', 'video'))
600         if 'title' in info_dict:
601             # This can happen when we're just evaluating the playlist
602             title = info_dict['title']
603             matchtitle = self.params.get('matchtitle', False)
604             if matchtitle:
605                 if not re.search(matchtitle, title, re.IGNORECASE):
606                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
607             rejecttitle = self.params.get('rejecttitle', False)
608             if rejecttitle:
609                 if re.search(rejecttitle, title, re.IGNORECASE):
610                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
611         date = info_dict.get('upload_date')
612         if date is not None:
613             dateRange = self.params.get('daterange', DateRange())
614             if date not in dateRange:
615                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
616         view_count = info_dict.get('view_count')
617         if view_count is not None:
618             min_views = self.params.get('min_views')
619             if min_views is not None and view_count < min_views:
620                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
621             max_views = self.params.get('max_views')
622             if max_views is not None and view_count > max_views:
623                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
624         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
625             return 'Skipping "%s" because it is age restricted' % video_title
626         if self.in_download_archive(info_dict):
627             return '%s has already been recorded in archive' % video_title
628
629         if not incomplete:
630             match_filter = self.params.get('match_filter')
631             if match_filter is not None:
632                 ret = match_filter(info_dict)
633                 if ret is not None:
634                     return ret
635
636         return None
637
638     @staticmethod
639     def add_extra_info(info_dict, extra_info):
640         '''Set the keys from extra_info in info dict if they are missing'''
641         for key, value in extra_info.items():
642             info_dict.setdefault(key, value)
643
644     def extract_info(self, url, download=True, ie_key=None, extra_info={},
645                      process=True, force_generic_extractor=False):
646         '''
647         Returns a list with a dictionary for each video we find.
648         If 'download', also downloads the videos.
649         extra_info is a dict containing the extra values to add to each result
650         '''
651
652         if not ie_key and force_generic_extractor:
653             ie_key = 'Generic'
654
655         if ie_key:
656             ies = [self.get_info_extractor(ie_key)]
657         else:
658             ies = self._ies
659
660         for ie in ies:
661             if not ie.suitable(url):
662                 continue
663
664             if not ie.working():
665                 self.report_warning('The program functionality for this site has been marked as broken, '
666                                     'and will probably not work.')
667
668             try:
669                 ie_result = ie.extract(url)
670                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
671                     break
672                 if isinstance(ie_result, list):
673                     # Backwards compatibility: old IE result format
674                     ie_result = {
675                         '_type': 'compat_list',
676                         'entries': ie_result,
677                     }
678                 self.add_default_extra_info(ie_result, ie, url)
679                 if process:
680                     return self.process_ie_result(ie_result, download, extra_info)
681                 else:
682                     return ie_result
683             except ExtractorError as e:  # An error we somewhat expected
684                 self.report_error(compat_str(e), e.format_traceback())
685                 break
686             except MaxDownloadsReached:
687                 raise
688             except Exception as e:
689                 if self.params.get('ignoreerrors', False):
690                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
691                     break
692                 else:
693                     raise
694         else:
695             self.report_error('no suitable InfoExtractor for URL %s' % url)
696
697     def add_default_extra_info(self, ie_result, ie, url):
698         self.add_extra_info(ie_result, {
699             'extractor': ie.IE_NAME,
700             'webpage_url': url,
701             'webpage_url_basename': url_basename(url),
702             'extractor_key': ie.ie_key(),
703         })
704
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        # '_type' defaults to a plain single-video result.
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            # With --flat-playlist, entries inside a playlist (or everything,
            # when extract_flat is True) are not resolved any further.
            extract_flat = self.params.get('extract_flat', False)
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # Non-None fields set by the embedding extractor win over those
            # extracted from the target URL, except the reference fields
            # themselves ('_type', 'url', 'ie_key').
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # A url_transparent result must not resolve to another
            # url_transparent result, or we could recurse forever.
            assert new_result.get('_type') != 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist' or result_type == 'multi_video':
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # --playlist-start is 1-based on the command line; convert to a
            # 0-based slice start here.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                def iter_playlistitems(format):
                    # Expand a spec like '1,3,5-7' into individual 1-based
                    # indices (a generator, consumed once below).
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            ie_entries = ie_result['entries']
            if isinstance(ie_entries, list):
                # Fully materialized playlist: select/slice directly.
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = [
                        ie_entries[i - 1] for i in playlistitems
                        if -n_all_entries <= i - 1 < n_all_entries]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                # Paged playlist: fetch only the requested slices.
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                if playlistitems:
                    # Arbitrary iterable: must materialize to index into it.
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # Per-entry extra info inherited from the playlist result.
                # NOTE(review): 'playlist_index' is i + playliststart, which
                # presumably assumes a contiguous slice; with --playlist-items
                # it may not match the original position — confirm.
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # Skip entries rejected by filters (date range, matchtitle,
                # archive, --match-filter, ...).
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                # Propagate provenance fields to each legacy entry.
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
868
869     def _build_format_filter(self, filter_spec):
870         " Returns a function to filter the formats according to the filter_spec "
871
872         OPERATORS = {
873             '<': operator.lt,
874             '<=': operator.le,
875             '>': operator.gt,
876             '>=': operator.ge,
877             '=': operator.eq,
878             '!=': operator.ne,
879         }
880         operator_rex = re.compile(r'''(?x)\s*
881             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
882             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
883             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
884             $
885             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
886         m = operator_rex.search(filter_spec)
887         if m:
888             try:
889                 comparison_value = int(m.group('value'))
890             except ValueError:
891                 comparison_value = parse_filesize(m.group('value'))
892                 if comparison_value is None:
893                     comparison_value = parse_filesize(m.group('value') + 'B')
894                 if comparison_value is None:
895                     raise ValueError(
896                         'Invalid value %r in format specification %r' % (
897                             m.group('value'), filter_spec))
898             op = OPERATORS[m.group('op')]
899
900         if not m:
901             STR_OPERATORS = {
902                 '=': operator.eq,
903                 '!=': operator.ne,
904                 '^=': lambda attr, value: attr.startswith(value),
905                 '$=': lambda attr, value: attr.endswith(value),
906                 '*=': lambda attr, value: value in attr,
907             }
908             str_operator_rex = re.compile(r'''(?x)
909                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
910                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
911                 \s*(?P<value>[a-zA-Z0-9._-]+)
912                 \s*$
913                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
914             m = str_operator_rex.search(filter_spec)
915             if m:
916                 comparison_value = m.group('value')
917                 op = STR_OPERATORS[m.group('op')]
918
919         if not m:
920             raise ValueError('Invalid filter specification %r' % filter_spec)
921
922         def _filter(f):
923             actual_value = f.get(m.group('key'))
924             if actual_value is None:
925                 return m.group('none_inclusive')
926             return op(actual_value, comparison_value)
927         return _filter
928
    def build_format_selector(self, format_spec):
        # Compile a format specification string (e.g.
        # 'bestvideo[height<=720]+bestaudio/best') into a selector function
        # mapping a list of format dicts to an iterator of chosen formats.
        # The spec is tokenized with the stdlib tokenizer, parsed into a
        # FormatSelector tree, then compiled into nested closures.
        def syntax_error(note, start):
            # Build (not raise) a SyntaxError with a caret pointing at the
            # offending column of the spec.
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Node kinds of the parsed selector tree.
        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        def _parse_filter(tokens):
            # Consume tokens up to the closing ']' and return the raw filter
            # string; it is compiled later by _build_format_filter.
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    # Flush any pending joined name before the bracket.
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Accumulate adjacent names/numbers/other ops into one
                    # NAME token (e.g. 'mp4-baseline-16x9').
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parser over the token stream; the inside_*
            # flags tell it which delimiters end the current sub-expression.
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        # 'a/b': try a first, fall back to b.
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        # '[filter]' attaches to the preceding selector
                        # (defaulting to 'best' when there is none).
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        # 'video+audio' merge.
                        video_selector = current_selector
                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
                        if not video_selector or not audio_selector:
                            raise syntax_error('"+" must be between two format selectors', start)
                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _build_selector_function(selector):
            # Compile a FormatSelector tree node into a generator function
            # formats -> selected formats.
            if isinstance(selector, list):
                # Comma-separated list: concatenate results of each selector.
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(formats):
                    for f in fs:
                        for format in f(formats):
                            yield format
                return selector_function
            elif selector.type == GROUP:
                selector_function = _build_selector_function(selector.selector)
            elif selector.type == PICKFIRST:
                # 'a/b': first sub-selector that yields anything wins.
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(formats):
                    for f in fs:
                        picked_formats = list(f(formats))
                        if picked_formats:
                            return picked_formats
                    return []
            elif selector.type == SINGLE:
                format_spec = selector.selector

                def selector_function(formats):
                    formats = list(formats)
                    if not formats:
                        return
                    if format_spec == 'all':
                        for f in formats:
                            yield f
                    elif format_spec in ['best', 'worst', None]:
                        # Index -1 for 'best', 0 for 'worst' — relies on the
                        # formats list being ordered worst-to-best
                        # (established by the extractors, not visible here).
                        format_idx = 0 if format_spec == 'worst' else -1
                        audiovideo_formats = [
                            f for f in formats
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            yield audiovideo_formats[format_idx]
                        # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
                        elif (all(f.get('acodec') != 'none' for f in formats) or
                              all(f.get('vcodec') != 'none' for f in formats)):
                            yield formats[format_idx]
                    elif format_spec == 'bestaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[-1]
                    elif format_spec == 'worstaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[0]
                    elif format_spec == 'bestvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[-1]
                    elif format_spec == 'worstvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[0]
                    else:
                        # Otherwise interpret the spec as an extension or as
                        # an exact format_id; the last match wins.
                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                        if format_spec in extensions:
                            filter_f = lambda f: f['ext'] == format_spec
                        else:
                            filter_f = lambda f: f['format_id'] == format_spec
                        matches = list(filter(filter_f, formats))
                        if matches:
                            yield matches[-1]
            elif selector.type == MERGE:
                def _merge(formats_info):
                    # Combine a (video, audio) pair into a synthetic format
                    # dict carrying both under 'requested_formats'. Returns
                    # None (after report_error) on an invalid combination.
                    format_1, format_2 = [f['format_id'] for f in formats_info]
                    # The first format must contain the video and the
                    # second the audio
                    if formats_info[0].get('vcodec') == 'none':
                        self.report_error('The first format must '
                                          'contain the video, try using '
                                          '"-f %s+%s"' % (format_2, format_1))
                        return
                    # Formats must be opposite (video+audio)
                    if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
                        self.report_error(
                            'Both formats %s and %s are video-only, you must specify "-f video+audio"'
                            % (format_1, format_2))
                        return
                    output_ext = (
                        formats_info[0]['ext']
                        if self.params.get('merge_output_format') is None
                        else self.params['merge_output_format'])
                    return {
                        'requested_formats': formats_info,
                        'format': '%s+%s' % (formats_info[0].get('format'),
                                             formats_info[1].get('format')),
                        'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                formats_info[1].get('format_id')),
                        'width': formats_info[0].get('width'),
                        'height': formats_info[0].get('height'),
                        'resolution': formats_info[0].get('resolution'),
                        'fps': formats_info[0].get('fps'),
                        'vcodec': formats_info[0].get('vcodec'),
                        'vbr': formats_info[0].get('vbr'),
                        'stretched_ratio': formats_info[0].get('stretched_ratio'),
                        'acodec': formats_info[1].get('acodec'),
                        'abr': formats_info[1].get('abr'),
                        'ext': output_ext,
                    }
                video_selector, audio_selector = map(_build_selector_function, selector.selector)

                def selector_function(formats):
                    # Cartesian product: every video pick merged with every
                    # audio pick.
                    formats = list(formats)
                    for pair in itertools.product(video_selector(formats), audio_selector(formats)):
                        yield _merge(pair)

            # Apply the node's '[...]' filters before the selector itself.
            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(formats):
                for _filter in filters:
                    formats = list(filter(_filter, formats))
                return selector_function(formats)
            return final_selector

        # Tokenize the spec with the stdlib tokenizer (via the py2/py3
        # compat wrapper), pre-joining tokens we treat as plain names.
        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Token stream with one-token pushback (restore_last_token),
            # which the parser needs at sub-expression boundaries.
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            # Python 2 iterator protocol alias.
            next = __next__

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
1191
1192     def _calc_headers(self, info_dict):
1193         res = std_headers.copy()
1194
1195         add_headers = info_dict.get('http_headers')
1196         if add_headers:
1197             res.update(add_headers)
1198
1199         cookies = self._calc_cookies(info_dict)
1200         if cookies:
1201             res['Cookie'] = cookies
1202
1203         return res
1204
1205     def _calc_cookies(self, info_dict):
1206         pr = sanitized_Request(info_dict['url'])
1207         self.cookiejar.add_cookie_header(pr)
1208         return pr.get_header('Cookie')
1209
1210     def process_video_result(self, info_dict, download=True):
1211         assert info_dict.get('_type', 'video') == 'video'
1212
1213         if 'id' not in info_dict:
1214             raise ExtractorError('Missing "id" field in extractor result')
1215         if 'title' not in info_dict:
1216             raise ExtractorError('Missing "title" field in extractor result')
1217
1218         if 'playlist' not in info_dict:
1219             # It isn't part of a playlist
1220             info_dict['playlist'] = None
1221             info_dict['playlist_index'] = None
1222
1223         thumbnails = info_dict.get('thumbnails')
1224         if thumbnails is None:
1225             thumbnail = info_dict.get('thumbnail')
1226             if thumbnail:
1227                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1228         if thumbnails:
1229             thumbnails.sort(key=lambda t: (
1230                 t.get('preference'), t.get('width'), t.get('height'),
1231                 t.get('id'), t.get('url')))
1232             for i, t in enumerate(thumbnails):
1233                 t['url'] = sanitize_url(t['url'])
1234                 if t.get('width') and t.get('height'):
1235                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
1236                 if t.get('id') is None:
1237                     t['id'] = '%d' % i
1238
1239         if self.params.get('list_thumbnails'):
1240             self.list_thumbnails(info_dict)
1241             return
1242
1243         if thumbnails and 'thumbnail' not in info_dict:
1244             info_dict['thumbnail'] = thumbnails[-1]['url']
1245
1246         if 'display_id' not in info_dict and 'id' in info_dict:
1247             info_dict['display_id'] = info_dict['id']
1248
1249         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1250             # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1251             # see http://bugs.python.org/issue1646728)
1252             try:
1253                 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1254                 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1255             except (ValueError, OverflowError, OSError):
1256                 pass
1257
1258         # Auto generate title fields corresponding to the *_number fields when missing
1259         # in order to always have clean titles. This is very common for TV series.
1260         for field in ('chapter', 'season', 'episode'):
1261             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1262                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1263
1264         subtitles = info_dict.get('subtitles')
1265         if subtitles:
1266             for _, subtitle in subtitles.items():
1267                 for subtitle_format in subtitle:
1268                     subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1269                     if 'ext' not in subtitle_format:
1270                         subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1271
1272         if self.params.get('listsubtitles', False):
1273             if 'automatic_captions' in info_dict:
1274                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1275             self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1276             return
1277         info_dict['requested_subtitles'] = self.process_subtitles(
1278             info_dict['id'], subtitles,
1279             info_dict.get('automatic_captions'))
1280
1281         # We now pick which formats have to be downloaded
1282         if info_dict.get('formats') is None:
1283             # There's only one format available
1284             formats = [info_dict]
1285         else:
1286             formats = info_dict['formats']
1287
1288         if not formats:
1289             raise ExtractorError('No video formats found!')
1290
1291         formats_dict = {}
1292
1293         # We check that all the formats have the format and format_id fields
1294         for i, format in enumerate(formats):
1295             if 'url' not in format:
1296                 raise ExtractorError('Missing "url" key in result (index %d)' % i)
1297
1298             format['url'] = sanitize_url(format['url'])
1299
1300             if format.get('format_id') is None:
1301                 format['format_id'] = compat_str(i)
1302             else:
1303                 # Sanitize format_id from characters used in format selector expression
1304                 format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id'])
1305             format_id = format['format_id']
1306             if format_id not in formats_dict:
1307                 formats_dict[format_id] = []
1308             formats_dict[format_id].append(format)
1309
1310         # Make sure all formats have unique format_id
1311         for format_id, ambiguous_formats in formats_dict.items():
1312             if len(ambiguous_formats) > 1:
1313                 for i, format in enumerate(ambiguous_formats):
1314                     format['format_id'] = '%s-%d' % (format_id, i)
1315
1316         for i, format in enumerate(formats):
1317             if format.get('format') is None:
1318                 format['format'] = '{id} - {res}{note}'.format(
1319                     id=format['format_id'],
1320                     res=self.format_resolution(format),
1321                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1322                 )
1323             # Automatically determine file extension if missing
1324             if 'ext' not in format:
1325                 format['ext'] = determine_ext(format['url']).lower()
1326             # Automatically determine protocol if missing (useful for format
1327             # selection purposes)
1328             if 'protocol' not in format:
1329                 format['protocol'] = determine_protocol(format)
1330             # Add HTTP headers, so that external programs can use them from the
1331             # json output
1332             full_format_info = info_dict.copy()
1333             full_format_info.update(format)
1334             format['http_headers'] = self._calc_headers(full_format_info)
1335
1336         # TODO Central sorting goes here
1337
1338         if formats[0] is not info_dict:
1339             # only set the 'formats' fields if the original info_dict list them
1340             # otherwise we end up with a circular reference, the first (and unique)
1341             # element in the 'formats' field in info_dict is info_dict itself,
1342             # which can't be exported to json
1343             info_dict['formats'] = formats
1344         if self.params.get('listformats'):
1345             self.list_formats(info_dict)
1346             return
1347
1348         req_format = self.params.get('format')
1349         if req_format is None:
1350             req_format_list = []
1351             if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
1352                     not info_dict.get('is_live')):
1353                 merger = FFmpegMergerPP(self)
1354                 if merger.available and merger.can_merge():
1355                     req_format_list.append('bestvideo+bestaudio')
1356             req_format_list.append('best')
1357             req_format = '/'.join(req_format_list)
1358         format_selector = self.build_format_selector(req_format)
1359         formats_to_download = list(format_selector(formats))
1360         if not formats_to_download:
1361             raise ExtractorError('requested format not available',
1362                                  expected=True)
1363
1364         if download:
1365             if len(formats_to_download) > 1:
1366                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1367             for format in formats_to_download:
1368                 new_info = dict(info_dict)
1369                 new_info.update(format)
1370                 self.process_info(new_info)
1371         # We update the info dict with the best quality format (backwards compatibility)
1372         info_dict.update(formats_to_download[-1])
1373         return info_dict
1374
1375     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1376         """Select the requested subtitles and their format"""
1377         available_subs = {}
1378         if normal_subtitles and self.params.get('writesubtitles'):
1379             available_subs.update(normal_subtitles)
1380         if automatic_captions and self.params.get('writeautomaticsub'):
1381             for lang, cap_info in automatic_captions.items():
1382                 if lang not in available_subs:
1383                     available_subs[lang] = cap_info
1384
1385         if (not self.params.get('writesubtitles') and not
1386                 self.params.get('writeautomaticsub') or not
1387                 available_subs):
1388             return None
1389
1390         if self.params.get('allsubtitles', False):
1391             requested_langs = available_subs.keys()
1392         else:
1393             if self.params.get('subtitleslangs', False):
1394                 requested_langs = self.params.get('subtitleslangs')
1395             elif 'en' in available_subs:
1396                 requested_langs = ['en']
1397             else:
1398                 requested_langs = [list(available_subs.keys())[0]]
1399
1400         formats_query = self.params.get('subtitlesformat', 'best')
1401         formats_preference = formats_query.split('/') if formats_query else []
1402         subs = {}
1403         for lang in requested_langs:
1404             formats = available_subs.get(lang)
1405             if formats is None:
1406                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1407                 continue
1408             for ext in formats_preference:
1409                 if ext == 'best':
1410                     f = formats[-1]
1411                     break
1412                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1413                 if matches:
1414                     f = matches[-1]
1415                     break
1416             else:
1417                 f = formats[-1]
1418                 self.report_warning(
1419                     'No subtitle format found matching "%s" for language %s, '
1420                     'using %s' % (formats_query, lang, f['ext']))
1421             subs[lang] = f
1422         return subs
1423
1424     def process_info(self, info_dict):
1425         """Process a single resolved IE result."""
1426
1427         assert info_dict.get('_type', 'video') == 'video'
1428
1429         max_downloads = self.params.get('max_downloads')
1430         if max_downloads is not None:
1431             if self._num_downloads >= int(max_downloads):
1432                 raise MaxDownloadsReached()
1433
1434         info_dict['fulltitle'] = info_dict['title']
1435         if len(info_dict['title']) > 200:
1436             info_dict['title'] = info_dict['title'][:197] + '...'
1437
1438         if 'format' not in info_dict:
1439             info_dict['format'] = info_dict['ext']
1440
1441         reason = self._match_entry(info_dict, incomplete=False)
1442         if reason is not None:
1443             self.to_screen('[download] ' + reason)
1444             return
1445
1446         self._num_downloads += 1
1447
1448         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1449
1450         # Forced printings
1451         if self.params.get('forcetitle', False):
1452             self.to_stdout(info_dict['fulltitle'])
1453         if self.params.get('forceid', False):
1454             self.to_stdout(info_dict['id'])
1455         if self.params.get('forceurl', False):
1456             if info_dict.get('requested_formats') is not None:
1457                 for f in info_dict['requested_formats']:
1458                     self.to_stdout(f['url'] + f.get('play_path', ''))
1459             else:
1460                 # For RTMP URLs, also include the playpath
1461                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1462         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1463             self.to_stdout(info_dict['thumbnail'])
1464         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1465             self.to_stdout(info_dict['description'])
1466         if self.params.get('forcefilename', False) and filename is not None:
1467             self.to_stdout(filename)
1468         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1469             self.to_stdout(formatSeconds(info_dict['duration']))
1470         if self.params.get('forceformat', False):
1471             self.to_stdout(info_dict['format'])
1472         if self.params.get('forcejson', False):
1473             self.to_stdout(json.dumps(info_dict))
1474
1475         # Do nothing else if in simulate mode
1476         if self.params.get('simulate', False):
1477             return
1478
1479         if filename is None:
1480             return
1481
1482         try:
1483             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1484             if dn and not os.path.exists(dn):
1485                 os.makedirs(dn)
1486         except (OSError, IOError) as err:
1487             self.report_error('unable to create directory ' + error_to_compat_str(err))
1488             return
1489
1490         if self.params.get('writedescription', False):
1491             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1492             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1493                 self.to_screen('[info] Video description is already present')
1494             elif info_dict.get('description') is None:
1495                 self.report_warning('There\'s no description to write.')
1496             else:
1497                 try:
1498                     self.to_screen('[info] Writing video description to: ' + descfn)
1499                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1500                         descfile.write(info_dict['description'])
1501                 except (OSError, IOError):
1502                     self.report_error('Cannot write description file ' + descfn)
1503                     return
1504
1505         if self.params.get('writeannotations', False):
1506             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1507             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1508                 self.to_screen('[info] Video annotations are already present')
1509             else:
1510                 try:
1511                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1512                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1513                         annofile.write(info_dict['annotations'])
1514                 except (KeyError, TypeError):
1515                     self.report_warning('There are no annotations to write.')
1516                 except (OSError, IOError):
1517                     self.report_error('Cannot write annotations file: ' + annofn)
1518                     return
1519
1520         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1521                                        self.params.get('writeautomaticsub')])
1522
1523         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1524             # subtitles download errors are already managed as troubles in relevant IE
1525             # that way it will silently go on when used with unsupporting IE
1526             subtitles = info_dict['requested_subtitles']
1527             ie = self.get_info_extractor(info_dict['extractor_key'])
1528             for sub_lang, sub_info in subtitles.items():
1529                 sub_format = sub_info['ext']
1530                 if sub_info.get('data') is not None:
1531                     sub_data = sub_info['data']
1532                 else:
1533                     try:
1534                         sub_data = ie._download_webpage(
1535                             sub_info['url'], info_dict['id'], note=False)
1536                     except ExtractorError as err:
1537                         self.report_warning('Unable to download subtitle for "%s": %s' %
1538                                             (sub_lang, error_to_compat_str(err.cause)))
1539                         continue
1540                 try:
1541                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1542                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1543                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1544                     else:
1545                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1546                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
1547                             subfile.write(sub_data)
1548                 except (OSError, IOError):
1549                     self.report_error('Cannot write subtitles file ' + sub_filename)
1550                     return
1551
1552         if self.params.get('writeinfojson', False):
1553             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1554             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1555                 self.to_screen('[info] Video description metadata is already present')
1556             else:
1557                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1558                 try:
1559                     write_json_file(self.filter_requested_info(info_dict), infofn)
1560                 except (OSError, IOError):
1561                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1562                     return
1563
1564         self._write_thumbnails(info_dict, filename)
1565
1566         if not self.params.get('skip_download', False):
1567             try:
1568                 def dl(name, info):
1569                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1570                     for ph in self._progress_hooks:
1571                         fd.add_progress_hook(ph)
1572                     if self.params.get('verbose'):
1573                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1574                     return fd.download(name, info)
1575
1576                 if info_dict.get('requested_formats') is not None:
1577                     downloaded = []
1578                     success = True
1579                     merger = FFmpegMergerPP(self)
1580                     if not merger.available:
1581                         postprocessors = []
1582                         self.report_warning('You have requested multiple '
1583                                             'formats but ffmpeg or avconv are not installed.'
1584                                             ' The formats won\'t be merged.')
1585                     else:
1586                         postprocessors = [merger]
1587
1588                     def compatible_formats(formats):
1589                         video, audio = formats
1590                         # Check extension
1591                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1592                         if video_ext and audio_ext:
1593                             COMPATIBLE_EXTS = (
1594                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
1595                                 ('webm')
1596                             )
1597                             for exts in COMPATIBLE_EXTS:
1598                                 if video_ext in exts and audio_ext in exts:
1599                                     return True
1600                         # TODO: Check acodec/vcodec
1601                         return False
1602
1603                     filename_real_ext = os.path.splitext(filename)[1][1:]
1604                     filename_wo_ext = (
1605                         os.path.splitext(filename)[0]
1606                         if filename_real_ext == info_dict['ext']
1607                         else filename)
1608                     requested_formats = info_dict['requested_formats']
1609                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1610                         info_dict['ext'] = 'mkv'
1611                         self.report_warning(
1612                             'Requested formats are incompatible for merge and will be merged into mkv.')
1613                     # Ensure filename always has a correct extension for successful merge
1614                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1615                     if os.path.exists(encodeFilename(filename)):
1616                         self.to_screen(
1617                             '[download] %s has already been downloaded and '
1618                             'merged' % filename)
1619                     else:
1620                         for f in requested_formats:
1621                             new_info = dict(info_dict)
1622                             new_info.update(f)
1623                             fname = self.prepare_filename(new_info)
1624                             fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1625                             downloaded.append(fname)
1626                             partial_success = dl(fname, new_info)
1627                             success = success and partial_success
1628                         info_dict['__postprocessors'] = postprocessors
1629                         info_dict['__files_to_merge'] = downloaded
1630                 else:
1631                     # Just a single file
1632                     success = dl(filename, info_dict)
1633             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1634                 self.report_error('unable to download video data: %s' % str(err))
1635                 return
1636             except (OSError, IOError) as err:
1637                 raise UnavailableVideoError(err)
1638             except (ContentTooShortError, ) as err:
1639                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1640                 return
1641
1642             if success and filename != '-':
1643                 # Fixup content
1644                 fixup_policy = self.params.get('fixup')
1645                 if fixup_policy is None:
1646                     fixup_policy = 'detect_or_warn'
1647
1648                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1649
1650                 stretched_ratio = info_dict.get('stretched_ratio')
1651                 if stretched_ratio is not None and stretched_ratio != 1:
1652                     if fixup_policy == 'warn':
1653                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1654                             info_dict['id'], stretched_ratio))
1655                     elif fixup_policy == 'detect_or_warn':
1656                         stretched_pp = FFmpegFixupStretchedPP(self)
1657                         if stretched_pp.available:
1658                             info_dict.setdefault('__postprocessors', [])
1659                             info_dict['__postprocessors'].append(stretched_pp)
1660                         else:
1661                             self.report_warning(
1662                                 '%s: Non-uniform pixel ratio (%s). %s'
1663                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1664                     else:
1665                         assert fixup_policy in ('ignore', 'never')
1666
1667                 if (info_dict.get('requested_formats') is None and
1668                         info_dict.get('container') == 'm4a_dash'):
1669                     if fixup_policy == 'warn':
1670                         self.report_warning(
1671                             '%s: writing DASH m4a. '
1672                             'Only some players support this container.'
1673                             % info_dict['id'])
1674                     elif fixup_policy == 'detect_or_warn':
1675                         fixup_pp = FFmpegFixupM4aPP(self)
1676                         if fixup_pp.available:
1677                             info_dict.setdefault('__postprocessors', [])
1678                             info_dict['__postprocessors'].append(fixup_pp)
1679                         else:
1680                             self.report_warning(
1681                                 '%s: writing DASH m4a. '
1682                                 'Only some players support this container. %s'
1683                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1684                     else:
1685                         assert fixup_policy in ('ignore', 'never')
1686
1687                 if (info_dict.get('protocol') == 'm3u8_native' or
1688                         info_dict.get('protocol') == 'm3u8' and
1689                         self.params.get('hls_prefer_native')):
1690                     if fixup_policy == 'warn':
1691                         self.report_warning('%s: malformated aac bitstream.' % (
1692                             info_dict['id']))
1693                     elif fixup_policy == 'detect_or_warn':
1694                         fixup_pp = FFmpegFixupM3u8PP(self)
1695                         if fixup_pp.available:
1696                             info_dict.setdefault('__postprocessors', [])
1697                             info_dict['__postprocessors'].append(fixup_pp)
1698                         else:
1699                             self.report_warning(
1700                                 '%s: malformated aac bitstream. %s'
1701                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1702                     else:
1703                         assert fixup_policy in ('ignore', 'never')
1704
1705                 try:
1706                     self.post_process(filename, info_dict)
1707                 except (PostProcessingError) as err:
1708                     self.report_error('postprocessing: %s' % str(err))
1709                     return
1710                 self.record_download_archive(info_dict)
1711
1712     def download(self, url_list):
1713         """Download a given list of URLs."""
1714         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1715         if (len(url_list) > 1 and
1716                 '%' not in outtmpl and
1717                 self.params.get('max_downloads') != 1):
1718             raise SameFileError(outtmpl)
1719
1720         for url in url_list:
1721             try:
1722                 # It also downloads the videos
1723                 res = self.extract_info(
1724                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1725             except UnavailableVideoError:
1726                 self.report_error('unable to download video')
1727             except MaxDownloadsReached:
1728                 self.to_screen('[info] Maximum number of downloaded files reached.')
1729                 raise
1730             else:
1731                 if self.params.get('dump_single_json', False):
1732                     self.to_stdout(json.dumps(res))
1733
1734         return self._download_retcode
1735
1736     def download_with_info_file(self, info_filename):
1737         with contextlib.closing(fileinput.FileInput(
1738                 [info_filename], mode='r',
1739                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1740             # FileInput doesn't have a read method, we can't call json.load
1741             info = self.filter_requested_info(json.loads('\n'.join(f)))
1742         try:
1743             self.process_ie_result(info, download=True)
1744         except DownloadError:
1745             webpage_url = info.get('webpage_url')
1746             if webpage_url is not None:
1747                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1748                 return self.download([webpage_url])
1749             else:
1750                 raise
1751         return self._download_retcode
1752
1753     @staticmethod
1754     def filter_requested_info(info_dict):
1755         return dict(
1756             (k, v) for k, v in info_dict.items()
1757             if k not in ['requested_formats', 'requested_subtitles'])
1758
1759     def post_process(self, filename, ie_info):
1760         """Run all the postprocessors on the given file."""
1761         info = dict(ie_info)
1762         info['filepath'] = filename
1763         pps_chain = []
1764         if ie_info.get('__postprocessors') is not None:
1765             pps_chain.extend(ie_info['__postprocessors'])
1766         pps_chain.extend(self._pps)
1767         for pp in pps_chain:
1768             files_to_delete = []
1769             try:
1770                 files_to_delete, info = pp.run(info)
1771             except PostProcessingError as e:
1772                 self.report_error(e.msg)
1773             if files_to_delete and not self.params.get('keepvideo', False):
1774                 for old_filename in files_to_delete:
1775                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1776                     try:
1777                         os.remove(encodeFilename(old_filename))
1778                     except (IOError, OSError):
1779                         self.report_warning('Unable to remove downloaded original file')
1780
1781     def _make_archive_id(self, info_dict):
1782         # Future-proof against any change in case
1783         # and backwards compatibility with prior versions
1784         extractor = info_dict.get('extractor_key')
1785         if extractor is None:
1786             if 'id' in info_dict:
1787                 extractor = info_dict.get('ie_key')  # key in a playlist
1788         if extractor is None:
1789             return None  # Incomplete video information
1790         return extractor.lower() + ' ' + info_dict['id']
1791
1792     def in_download_archive(self, info_dict):
1793         fn = self.params.get('download_archive')
1794         if fn is None:
1795             return False
1796
1797         vid_id = self._make_archive_id(info_dict)
1798         if vid_id is None:
1799             return False  # Incomplete video information
1800
1801         try:
1802             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1803                 for line in archive_file:
1804                     if line.strip() == vid_id:
1805                         return True
1806         except IOError as ioe:
1807             if ioe.errno != errno.ENOENT:
1808                 raise
1809         return False
1810
1811     def record_download_archive(self, info_dict):
1812         fn = self.params.get('download_archive')
1813         if fn is None:
1814             return
1815         vid_id = self._make_archive_id(info_dict)
1816         assert vid_id
1817         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1818             archive_file.write(vid_id + '\n')
1819
1820     @staticmethod
1821     def format_resolution(format, default='unknown'):
1822         if format.get('vcodec') == 'none':
1823             return 'audio only'
1824         if format.get('resolution') is not None:
1825             return format['resolution']
1826         if format.get('height') is not None:
1827             if format.get('width') is not None:
1828                 res = '%sx%s' % (format['width'], format['height'])
1829             else:
1830                 res = '%sp' % format['height']
1831         elif format.get('width') is not None:
1832             res = '%dx?' % format['width']
1833         else:
1834             res = default
1835         return res
1836
    def _format_note(self, fdict):
        """Build the human-readable note column shown by --list-formats.

        Pieces are appended in a fixed order: (unsupported) marker,
        language, format_note, total bitrate, container, video codec and
        bitrate, fps, audio codec and bitrate, sample rate, and file size;
        most pieces are prefixed with ', ' once the string is non-empty.
        """
        res = ''
        # f4f/f4m (Adobe HDS) downloads are marked as unsupported
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None and
                fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            # '@' glues the codec name to the video bitrate appended below
            if fdict.get('vbr') is not None:
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # Bitrate known but codec unknown: label the number explicitly
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        # Exact size wins over the approximate ('~'-prefixed) one
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
1892
1893     def list_formats(self, info_dict):
1894         formats = info_dict.get('formats', [info_dict])
1895         table = [
1896             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
1897             for f in formats
1898             if f.get('preference') is None or f['preference'] >= -1000]
1899         if len(formats) > 1:
1900             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
1901
1902         header_line = ['format code', 'extension', 'resolution', 'note']
1903         self.to_screen(
1904             '[info] Available formats for %s:\n%s' %
1905             (info_dict['id'], render_table(header_line, table)))
1906
1907     def list_thumbnails(self, info_dict):
1908         thumbnails = info_dict.get('thumbnails')
1909         if not thumbnails:
1910             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
1911             return
1912
1913         self.to_screen(
1914             '[info] Thumbnails for %s:' % info_dict['id'])
1915         self.to_screen(render_table(
1916             ['ID', 'width', 'height', 'URL'],
1917             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1918
1919     def list_subtitles(self, video_id, subtitles, name='subtitles'):
1920         if not subtitles:
1921             self.to_screen('%s has no %s' % (video_id, name))
1922             return
1923         self.to_screen(
1924             'Available %s for %s:' % (name, video_id))
1925         self.to_screen(render_table(
1926             ['Language', 'formats'],
1927             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
1928                 for lang, formats in subtitles.items()]))
1929
1930     def urlopen(self, req):
1931         """ Start an HTTP download """
1932         if isinstance(req, compat_basestring):
1933             req = sanitized_Request(req)
1934         return self._opener.open(req, timeout=self._socket_timeout)
1935
    def print_debug_header(self):
        """Print verbose diagnostic information to the debug output.

        Emits encodings, youtube-dl/git/Python versions, external program
        versions and the proxy map; optionally (with 'call_home') fetches
        the public IP address and checks for a newer release. No-op unless
        the 'verbose' option is set.
        """
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # sys.stdout may lack an encoding attribute (e.g. when replaced);
        # fall back to reporting its type instead.
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        # encoding=None: write the string as-is, without re-encoding.
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        # Best effort: report the git commit when running from a checkout.
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            try:
                # sys.exc_clear exists on Python 2 only; on Python 3 the
                # AttributeError is swallowed by the inner except.
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        # Versions of external helper programs (ffmpeg/avconv family plus
        # rtmpdump); only helpers that report a version are listed.
        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Collect the effective proxy settings from every opener handler
        # that carries any.
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        # Opt-in network checks: public IP and latest released version.
        if self.params.get('call_home', False):
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2000
    def _setup_opener(self):
        """Build the urllib opener (self._opener) used for all HTTP requests.

        Wires up, in this order: per-request proxy handling, HTTPS, cookie
        processing, youtube-dl's own HTTP handler, data: URL support and a
        deliberately disabled file: handler, honoring the 'socket_timeout',
        'cookiefile', 'proxy' and 'debug_printtraffic' options.
        """
        timeout_val = self.params.get('socket_timeout')
        # Default to a generous 600 s timeout when none was given.
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # No cookie file configured: keep cookies in memory only.
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Load existing cookies only if the file is readable; a missing
            # file is fine (it can be created later on save).
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An explicit empty string disables proxies entirely, including
            # any configured via the environment.
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        # debuglevel=1 makes urllib print the raw HTTP traffic.
        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2052
2053     def encode(self, s):
2054         if isinstance(s, bytes):
2055             return s  # Already encoded
2056
2057         try:
2058             return s.encode(self.get_encoding())
2059         except UnicodeEncodeError as err:
2060             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2061             raise
2062
2063     def get_encoding(self):
2064         encoding = self.params.get('encoding')
2065         if encoding is None:
2066             encoding = preferredencoding()
2067         return encoding
2068
2069     def _write_thumbnails(self, info_dict, filename):
2070         if self.params.get('writethumbnail', False):
2071             thumbnails = info_dict.get('thumbnails')
2072             if thumbnails:
2073                 thumbnails = [thumbnails[-1]]
2074         elif self.params.get('write_all_thumbnails', False):
2075             thumbnails = info_dict.get('thumbnails')
2076         else:
2077             return
2078
2079         if not thumbnails:
2080             # No thumbnails present, so return immediately
2081             return
2082
2083         for t in thumbnails:
2084             thumb_ext = determine_ext(t['url'], 'jpg')
2085             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2086             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2087             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2088
2089             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2090                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2091                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2092             else:
2093                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2094                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2095                 try:
2096                     uf = self.urlopen(t['url'])
2097                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2098                         shutil.copyfileobj(uf, thumbf)
2099                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2100                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2101                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2102                     self.report_warning('Unable to download thumbnail "%s": %s' %
2103                                         (t['url'], error_to_compat_str(err)))