[YoutubeDL] Fix sanitizing subtitles' url
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import datetime
9 import errno
10 import fileinput
11 import io
12 import itertools
13 import json
14 import locale
15 import operator
16 import os
17 import platform
18 import re
19 import shutil
20 import subprocess
21 import socket
22 import sys
23 import time
24 import tokenize
25 import traceback
26
27 from .compat import (
28     compat_basestring,
29     compat_cookiejar,
30     compat_expanduser,
31     compat_get_terminal_size,
32     compat_http_client,
33     compat_kwargs,
34     compat_os_name,
35     compat_str,
36     compat_tokenize_tokenize,
37     compat_urllib_error,
38     compat_urllib_request,
39     compat_urllib_request_DataHandler,
40 )
41 from .utils import (
42     age_restricted,
43     args_to_str,
44     ContentTooShortError,
45     date_from_str,
46     DateRange,
47     DEFAULT_OUTTMPL,
48     determine_ext,
49     determine_protocol,
50     DownloadError,
51     encode_compat_str,
52     encodeFilename,
53     error_to_compat_str,
54     ExtractorError,
55     format_bytes,
56     formatSeconds,
57     locked_file,
58     make_HTTPS_handler,
59     MaxDownloadsReached,
60     PagedList,
61     parse_filesize,
62     PerRequestProxyHandler,
63     platform_name,
64     PostProcessingError,
65     preferredencoding,
66     prepend_extension,
67     render_table,
68     replace_extension,
69     SameFileError,
70     sanitize_filename,
71     sanitize_path,
72     sanitize_url,
73     sanitized_Request,
74     std_headers,
75     subtitles_filename,
76     UnavailableVideoError,
77     url_basename,
78     version_tuple,
79     write_json_file,
80     write_string,
81     YoutubeDLCookieProcessor,
82     YoutubeDLHandler,
83 )
84 from .cache import Cache
85 from .extractor import get_info_extractor, gen_extractors
86 from .downloader import get_suitable_downloader
87 from .downloader.rtmp import rtmpdump_version
88 from .postprocessor import (
89     FFmpegFixupM3u8PP,
90     FFmpegFixupM4aPP,
91     FFmpegFixupStretchedPP,
92     FFmpegMergerPP,
93     FFmpegPostProcessor,
94     get_postprocessor,
95 )
96 from .version import __version__
97
98 if compat_os_name == 'nt':
99     import ctypes
100
101
102 class YoutubeDL(object):
103     """YoutubeDL class.
104
    YoutubeDL objects are the ones responsible for downloading the
106     actual video file and writing it to disk if the user has requested
107     it, among some other tasks. In most cases there should be one per
108     program. As, given a video URL, the downloader doesn't know how to
109     extract all the needed information, task that InfoExtractors do, it
110     has to pass the URL to one of them.
111
112     For this, YoutubeDL objects have a method that allows
113     InfoExtractors to be registered in a given order. When it is passed
114     a URL, the YoutubeDL object handles it to the first InfoExtractor it
115     finds that reports being able to handle it. The InfoExtractor extracts
116     all the information about the video or videos the URL refers to, and
117     YoutubeDL process the extracted information, possibly using a File
118     Downloader to download the video.
119
120     YoutubeDL objects accept a lot of parameters. In order not to saturate
121     the object constructor with arguments, it receives a dictionary of
122     options instead. These options are available through the params
123     attribute for the InfoExtractors to use. The YoutubeDL also
124     registers itself as the downloader in charge for the InfoExtractors
125     that are added to it, so this is a "mutual registration".
126
127     Available options:
128
129     username:          Username for authentication purposes.
130     password:          Password for authentication purposes.
131     videopassword:     Password for accessing a video.
132     usenetrc:          Use netrc for authentication instead.
133     verbose:           Print additional info to stdout.
134     quiet:             Do not print messages to stdout.
135     no_warnings:       Do not print out anything for warnings.
136     forceurl:          Force printing final URL.
137     forcetitle:        Force printing title.
138     forceid:           Force printing ID.
139     forcethumbnail:    Force printing thumbnail URL.
140     forcedescription:  Force printing description.
141     forcefilename:     Force printing final filename.
142     forceduration:     Force printing duration.
143     forcejson:         Force printing info_dict as JSON.
144     dump_single_json:  Force printing the info_dict of the whole playlist
145                        (or video) as a single JSON line.
146     simulate:          Do not download the video files.
147     format:            Video format code. See options.py for more information.
148     outtmpl:           Template for output names.
149     restrictfilenames: Do not allow "&" and spaces in file names
150     ignoreerrors:      Do not stop on download errors.
151     force_generic_extractor: Force downloader to use the generic extractor
152     nooverwrites:      Prevent overwriting files.
153     playliststart:     Playlist item to start at.
154     playlistend:       Playlist item to end at.
155     playlist_items:    Specific indices of playlist to download.
156     playlistreverse:   Download playlist items in reverse order.
157     matchtitle:        Download only matching titles.
158     rejecttitle:       Reject downloads for matching titles.
159     logger:            Log messages to a logging.Logger instance.
160     logtostderr:       Log messages to stderr instead of stdout.
161     writedescription:  Write the video description to a .description file
162     writeinfojson:     Write the video description to a .info.json file
163     writeannotations:  Write the video annotations to a .annotations.xml file
164     writethumbnail:    Write the thumbnail image to a file
165     write_all_thumbnails:  Write all thumbnail formats to files
166     writesubtitles:    Write the video subtitles to a file
167     writeautomaticsub: Write the automatically generated subtitles to a file
168     allsubtitles:      Downloads all the subtitles of the video
169                        (requires writesubtitles or writeautomaticsub)
170     listsubtitles:     Lists all available subtitles for the video
171     subtitlesformat:   The format code for subtitles
172     subtitleslangs:    List of languages of the subtitles to download
173     keepvideo:         Keep the video file after post-processing
174     daterange:         A DateRange object, download only if the upload_date is in the range.
175     skip_download:     Skip the actual download of the video file
176     cachedir:          Location of the cache files in the filesystem.
177                        False to disable filesystem cache.
178     noplaylist:        Download single video instead of a playlist if in doubt.
179     age_limit:         An integer representing the user's age in years.
180                        Unsuitable videos for the given age are skipped.
181     min_views:         An integer representing the minimum view count the video
182                        must have in order to not be skipped.
183                        Videos without view count information are always
184                        downloaded. None for no limit.
185     max_views:         An integer representing the maximum view count.
186                        Videos that are more popular than that are not
187                        downloaded.
188                        Videos without view count information are always
189                        downloaded. None for no limit.
190     download_archive:  File name of a file where all downloads are recorded.
191                        Videos already present in the file are not downloaded
192                        again.
193     cookiefile:        File name where cookies should be read from and dumped to.
194     nocheckcertificate:Do not verify SSL certificates
195     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
196                        At the moment, this is only supported by YouTube.
197     proxy:             URL of the proxy server to use
198     cn_verification_proxy:  URL of the proxy to use for IP address verification
199                        on Chinese sites. (Experimental)
200     socket_timeout:    Time to wait for unresponsive hosts, in seconds
201     bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
203     debug_printtraffic:Print out sent and received HTTP traffic
204     include_ads:       Download ads as well
205     default_search:    Prepend this string if an input url is not valid.
206                        'auto' for elaborate guessing
207     encoding:          Use this encoding instead of the system-specified.
208     extract_flat:      Do not resolve URLs, return the immediate result.
209                        Pass in 'in_playlist' to only show this behavior for
210                        playlist items.
211     postprocessors:    A list of dictionaries, each with an entry
212                        * key:  The name of the postprocessor. See
213                                youtube_dl/postprocessor/__init__.py for a list.
214                        as well as any further keyword arguments for the
215                        postprocessor.
216     progress_hooks:    A list of functions that get called on download
217                        progress, with a dictionary with the entries
218                        * status: One of "downloading", "error", or "finished".
219                                  Check this first and ignore unknown values.
220
221                        If status is one of "downloading", or "finished", the
222                        following properties may also be present:
223                        * filename: The final filename (always present)
224                        * tmpfilename: The filename we're currently writing to
225                        * downloaded_bytes: Bytes on disk
226                        * total_bytes: Size of the whole file, None if unknown
227                        * total_bytes_estimate: Guess of the eventual file size,
228                                                None if unavailable.
229                        * elapsed: The number of seconds since download started.
230                        * eta: The estimated time in seconds, None if unknown
231                        * speed: The download speed in bytes/second, None if
232                                 unknown
233                        * fragment_index: The counter of the currently
234                                          downloaded video fragment.
235                        * fragment_count: The number of fragments (= individual
236                                          files that will be merged)
237
238                        Progress hooks are guaranteed to be called at least once
239                        (with status "finished") if the download is successful.
240     merge_output_format: Extension to use when merging formats.
241     fixup:             Automatically correct known faults of the file.
242                        One of:
243                        - "never": do nothing
244                        - "warn": only emit a warning
245                        - "detect_or_warn": check whether we can do anything
246                                            about it, warn otherwise (default)
247     source_address:    (Experimental) Client-side IP address to bind to.
248     call_home:         Boolean, true iff we are allowed to contact the
249                        youtube-dl servers for debugging.
250     sleep_interval:    Number of seconds to sleep before each download.
251     listformats:       Print an overview of available video formats and exit.
252     list_thumbnails:   Print a table of all thumbnails and exit.
253     match_filter:      A function that gets called with the info_dict of
254                        every video.
255                        If it returns a message, the video is ignored.
256                        If it returns None, the video is downloaded.
257                        match_filter_func in utils.py is one example for this.
258     no_color:          Do not emit color codes in output.
259
260     The following options determine which downloader is picked:
261     external_downloader: Executable of the external downloader to call.
262                        None or unset for standard (built-in) downloader.
263     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv.
264
265     The following parameters are not used by YoutubeDL itself, they are used by
266     the downloader (see youtube_dl/downloader/common.py):
267     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
268     noresizebuffer, retries, continuedl, noprogress, consoletitle,
269     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
270
271     The following options are used by the post processors:
272     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
273                        otherwise prefer avconv.
274     postprocessor_args: A list of additional command-line arguments for the
275                         postprocessor.
276     """
277
278     params = None
279     _ies = []
280     _pps = []
281     _download_retcode = None
282     _num_downloads = None
283     _screen_file = None
284
285     def __init__(self, params=None, auto_init=True):
286         """Create a FileDownloader object with the given options."""
287         if params is None:
288             params = {}
289         self._ies = []
290         self._ies_instances = {}
291         self._pps = []
292         self._progress_hooks = []
293         self._download_retcode = 0
294         self._num_downloads = 0
295         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
296         self._err_file = sys.stderr
297         self.params = {
298             # Default parameters
299             'nocheckcertificate': False,
300         }
301         self.params.update(params)
302         self.cache = Cache(self)
303
304         if params.get('bidi_workaround', False):
305             try:
306                 import pty
307                 master, slave = pty.openpty()
308                 width = compat_get_terminal_size().columns
309                 if width is None:
310                     width_args = []
311                 else:
312                     width_args = ['-w', str(width)]
313                 sp_kwargs = dict(
314                     stdin=subprocess.PIPE,
315                     stdout=slave,
316                     stderr=self._err_file)
317                 try:
318                     self._output_process = subprocess.Popen(
319                         ['bidiv'] + width_args, **sp_kwargs
320                     )
321                 except OSError:
322                     self._output_process = subprocess.Popen(
323                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
324                 self._output_channel = os.fdopen(master, 'rb')
325             except OSError as ose:
326                 if ose.errno == 2:
327                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
328                 else:
329                     raise
330
331         if (sys.version_info >= (3,) and sys.platform != 'win32' and
332                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
333                 not params.get('restrictfilenames', False)):
334             # On Python 3, the Unicode filesystem API will throw errors (#1474)
335             self.report_warning(
336                 'Assuming --restrict-filenames since file system encoding '
337                 'cannot encode all characters. '
338                 'Set the LC_ALL environment variable to fix this.')
339             self.params['restrictfilenames'] = True
340
341         if isinstance(params.get('outtmpl'), bytes):
342             self.report_warning(
343                 'Parameter outtmpl is bytes, but should be a unicode string. '
344                 'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
345
346         self._setup_opener()
347
348         if auto_init:
349             self.print_debug_header()
350             self.add_default_info_extractors()
351
352         for pp_def_raw in self.params.get('postprocessors', []):
353             pp_class = get_postprocessor(pp_def_raw['key'])
354             pp_def = dict(pp_def_raw)
355             del pp_def['key']
356             pp = pp_class(self, **compat_kwargs(pp_def))
357             self.add_post_processor(pp)
358
359         for ph in self.params.get('progress_hooks', []):
360             self.add_progress_hook(ph)
361
362     def warn_if_short_id(self, argv):
363         # short YouTube ID starting with dash?
364         idxs = [
365             i for i, a in enumerate(argv)
366             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
367         if idxs:
368             correct_argv = (
369                 ['youtube-dl'] +
370                 [a for i, a in enumerate(argv) if i not in idxs] +
371                 ['--'] + [argv[i] for i in idxs]
372             )
373             self.report_warning(
374                 'Long argument string detected. '
375                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
376                 args_to_str(correct_argv))
377
378     def add_info_extractor(self, ie):
379         """Add an InfoExtractor object to the end of the list."""
380         self._ies.append(ie)
381         self._ies_instances[ie.ie_key()] = ie
382         ie.set_downloader(self)
383
384     def get_info_extractor(self, ie_key):
385         """
386         Get an instance of an IE with name ie_key, it will try to get one from
387         the _ies list, if there's no instance it will create a new one and add
388         it to the extractor list.
389         """
390         ie = self._ies_instances.get(ie_key)
391         if ie is None:
392             ie = get_info_extractor(ie_key)()
393             self.add_info_extractor(ie)
394         return ie
395
396     def add_default_info_extractors(self):
397         """
398         Add the InfoExtractors returned by gen_extractors to the end of the list
399         """
400         for ie in gen_extractors():
401             self.add_info_extractor(ie)
402
403     def add_post_processor(self, pp):
404         """Add a PostProcessor object to the end of the chain."""
405         self._pps.append(pp)
406         pp.set_downloader(self)
407
408     def add_progress_hook(self, ph):
409         """Add the progress hook (currently only for the file downloader)"""
410         self._progress_hooks.append(ph)
411
412     def _bidi_workaround(self, message):
413         if not hasattr(self, '_output_channel'):
414             return message
415
416         assert hasattr(self, '_output_process')
417         assert isinstance(message, compat_str)
418         line_count = message.count('\n') + 1
419         self._output_process.stdin.write((message + '\n').encode('utf-8'))
420         self._output_process.stdin.flush()
421         res = ''.join(self._output_channel.readline().decode('utf-8')
422                       for _ in range(line_count))
423         return res[:-len('\n')]
424
425     def to_screen(self, message, skip_eol=False):
426         """Print message to stdout if not in quiet mode."""
427         return self.to_stdout(message, skip_eol, check_quiet=True)
428
429     def _write_string(self, s, out=None):
430         write_string(s, out=out, encoding=self.params.get('encoding'))
431
432     def to_stdout(self, message, skip_eol=False, check_quiet=False):
433         """Print message to stdout if not in quiet mode."""
434         if self.params.get('logger'):
435             self.params['logger'].debug(message)
436         elif not check_quiet or not self.params.get('quiet', False):
437             message = self._bidi_workaround(message)
438             terminator = ['\n', ''][skip_eol]
439             output = message + terminator
440
441             self._write_string(output, self._screen_file)
442
443     def to_stderr(self, message):
444         """Print message to stderr."""
445         assert isinstance(message, compat_str)
446         if self.params.get('logger'):
447             self.params['logger'].error(message)
448         else:
449             message = self._bidi_workaround(message)
450             output = message + '\n'
451             self._write_string(output, self._err_file)
452
453     def to_console_title(self, message):
454         if not self.params.get('consoletitle', False):
455             return
456         if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
457             # c_wchar_p() might not be necessary if `message` is
458             # already of type unicode()
459             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
460         elif 'TERM' in os.environ:
461             self._write_string('\033]0;%s\007' % message, self._screen_file)
462
463     def save_console_title(self):
464         if not self.params.get('consoletitle', False):
465             return
466         if 'TERM' in os.environ:
467             # Save the title on stack
468             self._write_string('\033[22;0t', self._screen_file)
469
470     def restore_console_title(self):
471         if not self.params.get('consoletitle', False):
472             return
473         if 'TERM' in os.environ:
474             # Restore the title from stack
475             self._write_string('\033[23;0t', self._screen_file)
476
477     def __enter__(self):
478         self.save_console_title()
479         return self
480
481     def __exit__(self, *args):
482         self.restore_console_title()
483
484         if self.params.get('cookiefile') is not None:
485             self.cookiejar.save()
486
487     def trouble(self, message=None, tb=None):
488         """Determine action to take when a download problem appears.
489
490         Depending on if the downloader has been configured to ignore
491         download errors or not, this method may throw an exception or
492         not when errors are found, after printing the message.
493
494         tb, if given, is additional traceback information.
495         """
496         if message is not None:
497             self.to_stderr(message)
498         if self.params.get('verbose'):
499             if tb is None:
500                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
501                     tb = ''
502                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
503                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
504                     tb += encode_compat_str(traceback.format_exc())
505                 else:
506                     tb_data = traceback.format_list(traceback.extract_stack())
507                     tb = ''.join(tb_data)
508             self.to_stderr(tb)
509         if not self.params.get('ignoreerrors', False):
510             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
511                 exc_info = sys.exc_info()[1].exc_info
512             else:
513                 exc_info = sys.exc_info()
514             raise DownloadError(message, exc_info)
515         self._download_retcode = 1
516
517     def report_warning(self, message):
518         '''
519         Print the message to stderr, it will be prefixed with 'WARNING:'
520         If stderr is a tty file the 'WARNING:' will be colored
521         '''
522         if self.params.get('logger') is not None:
523             self.params['logger'].warning(message)
524         else:
525             if self.params.get('no_warnings'):
526                 return
527             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
528                 _msg_header = '\033[0;33mWARNING:\033[0m'
529             else:
530                 _msg_header = 'WARNING:'
531             warning_message = '%s %s' % (_msg_header, message)
532             self.to_stderr(warning_message)
533
534     def report_error(self, message, tb=None):
535         '''
536         Do the same as trouble, but prefixes the message with 'ERROR:', colored
537         in red if stderr is a tty file.
538         '''
539         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
540             _msg_header = '\033[0;31mERROR:\033[0m'
541         else:
542             _msg_header = 'ERROR:'
543         error_message = '%s %s' % (_msg_header, message)
544         self.trouble(error_message, tb)
545
546     def report_file_already_downloaded(self, file_name):
547         """Report file has already been fully downloaded."""
548         try:
549             self.to_screen('[download] %s has already been downloaded' % file_name)
550         except UnicodeEncodeError:
551             self.to_screen('[download] The file has already been downloaded')
552
553     def prepare_filename(self, info_dict):
554         """Generate the output filename."""
555         try:
556             template_dict = dict(info_dict)
557
558             template_dict['epoch'] = int(time.time())
559             autonumber_size = self.params.get('autonumber_size')
560             if autonumber_size is None:
561                 autonumber_size = 5
562             autonumber_templ = '%0' + str(autonumber_size) + 'd'
563             template_dict['autonumber'] = autonumber_templ % self._num_downloads
564             if template_dict.get('playlist_index') is not None:
565                 template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
566             if template_dict.get('resolution') is None:
567                 if template_dict.get('width') and template_dict.get('height'):
568                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
569                 elif template_dict.get('height'):
570                     template_dict['resolution'] = '%sp' % template_dict['height']
571                 elif template_dict.get('width'):
572                     template_dict['resolution'] = '%dx?' % template_dict['width']
573
574             sanitize = lambda k, v: sanitize_filename(
575                 compat_str(v),
576                 restricted=self.params.get('restrictfilenames'),
577                 is_id=(k == 'id'))
578             template_dict = dict((k, sanitize(k, v))
579                                  for k, v in template_dict.items()
580                                  if v is not None)
581             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
582
583             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
584             tmpl = compat_expanduser(outtmpl)
585             filename = tmpl % template_dict
586             # Temporary fix for #4787
587             # 'Treat' all problem characters by passing filename through preferredencoding
588             # to workaround encoding issues with subprocess on python2 @ Windows
589             if sys.version_info < (3, 0) and sys.platform == 'win32':
590                 filename = encodeFilename(filename, True).decode(preferredencoding())
591             return sanitize_path(filename)
592         except ValueError as err:
593             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
594             return None
595
596     def _match_entry(self, info_dict, incomplete):
597         """ Returns None iff the file should be downloaded """
598
599         video_title = info_dict.get('title', info_dict.get('id', 'video'))
600         if 'title' in info_dict:
601             # This can happen when we're just evaluating the playlist
602             title = info_dict['title']
603             matchtitle = self.params.get('matchtitle', False)
604             if matchtitle:
605                 if not re.search(matchtitle, title, re.IGNORECASE):
606                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
607             rejecttitle = self.params.get('rejecttitle', False)
608             if rejecttitle:
609                 if re.search(rejecttitle, title, re.IGNORECASE):
610                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
611         date = info_dict.get('upload_date')
612         if date is not None:
613             dateRange = self.params.get('daterange', DateRange())
614             if date not in dateRange:
615                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
616         view_count = info_dict.get('view_count')
617         if view_count is not None:
618             min_views = self.params.get('min_views')
619             if min_views is not None and view_count < min_views:
620                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
621             max_views = self.params.get('max_views')
622             if max_views is not None and view_count > max_views:
623                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
624         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
625             return 'Skipping "%s" because it is age restricted' % video_title
626         if self.in_download_archive(info_dict):
627             return '%s has already been recorded in archive' % video_title
628
629         if not incomplete:
630             match_filter = self.params.get('match_filter')
631             if match_filter is not None:
632                 ret = match_filter(info_dict)
633                 if ret is not None:
634                     return ret
635
636         return None
637
638     @staticmethod
639     def add_extra_info(info_dict, extra_info):
640         '''Set the keys from extra_info in info dict if they are missing'''
641         for key, value in extra_info.items():
642             info_dict.setdefault(key, value)
643
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True, force_generic_extractor=False):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result

        url -- the URL to run the extractors against
        download -- whether to actually download after extraction
        ie_key -- restrict extraction to the extractor with this ie_key()
        extra_info -- extra values merged into each result (missing keys only)
        process -- whether to resolve the result via process_ie_result()
        force_generic_extractor -- force the 'Generic' extractor
        '''

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        # Either a single explicitly requested extractor, or all of them
        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        # Try extractors in order; the first one whose suitable() accepts
        # the URL handles it (the rest are never consulted)
        for ie in ies:
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                self.add_default_extra_info(ie_result, ie, url)
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
                break
            except MaxDownloadsReached:
                # Propagate so the whole run stops, not just this URL
                raise
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # for/else: no extractor accepted the URL at all
            self.report_error('no suitable InfoExtractor for URL %s' % url)
696
697     def add_default_extra_info(self, ie_result, ie, url):
698         self.add_extra_info(ie_result, {
699             'extractor': ie.IE_NAME,
700             'webpage_url': url,
701             'webpage_url_basename': url_basename(url),
702             'extractor_key': ie.ie_key(),
703         })
704
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        # '_type' selects the handling strategy; plain videos omit it
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            extract_flat = self.params.get('extract_flat', False)
            # With --flat-playlist, nested URLs are returned unresolved
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # Non-None values from the embedding page override the target's
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            # ... except these bookkeeping keys, which must come from the target
            for f in ('_type', 'url', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Guard against infinite recursion through url_transparent chains
            assert new_result.get('_type') != 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist' or result_type == 'multi_video':
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # --playlist-start is 1-based on the command line
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                def iter_playlistitems(format):
                    # Expand a spec like '1,3,5-7' into individual 1-based indices
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            # Entries may be a list, a PagedList, or any other iterable;
            # each case slices differently
            ie_entries = ie_result['entries']
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    # Silently drop out-of-range indices
                    entries = [
                        ie_entries[i - 1] for i in playlistitems
                        if -n_all_entries <= i - 1 < n_all_entries]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                if playlistitems:
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # Playlist context propagated into each entry's result
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: entry metadata may be partial at this point
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                # Back-fill the standard bookkeeping fields on each entry
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
868
869     def _build_format_filter(self, filter_spec):
870         " Returns a function to filter the formats according to the filter_spec "
871
872         OPERATORS = {
873             '<': operator.lt,
874             '<=': operator.le,
875             '>': operator.gt,
876             '>=': operator.ge,
877             '=': operator.eq,
878             '!=': operator.ne,
879         }
880         operator_rex = re.compile(r'''(?x)\s*
881             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
882             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
883             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
884             $
885             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
886         m = operator_rex.search(filter_spec)
887         if m:
888             try:
889                 comparison_value = int(m.group('value'))
890             except ValueError:
891                 comparison_value = parse_filesize(m.group('value'))
892                 if comparison_value is None:
893                     comparison_value = parse_filesize(m.group('value') + 'B')
894                 if comparison_value is None:
895                     raise ValueError(
896                         'Invalid value %r in format specification %r' % (
897                             m.group('value'), filter_spec))
898             op = OPERATORS[m.group('op')]
899
900         if not m:
901             STR_OPERATORS = {
902                 '=': operator.eq,
903                 '!=': operator.ne,
904                 '^=': lambda attr, value: attr.startswith(value),
905                 '$=': lambda attr, value: attr.endswith(value),
906                 '*=': lambda attr, value: value in attr,
907             }
908             str_operator_rex = re.compile(r'''(?x)
909                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
910                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
911                 \s*(?P<value>[a-zA-Z0-9._-]+)
912                 \s*$
913                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
914             m = str_operator_rex.search(filter_spec)
915             if m:
916                 comparison_value = m.group('value')
917                 op = STR_OPERATORS[m.group('op')]
918
919         if not m:
920             raise ValueError('Invalid filter specification %r' % filter_spec)
921
922         def _filter(f):
923             actual_value = f.get(m.group('key'))
924             if actual_value is None:
925                 return m.group('none_inclusive')
926             return op(actual_value, comparison_value)
927         return _filter
928
    def build_format_selector(self, format_spec):
        """Compile a format specification (e.g. 'bestvideo+bestaudio/best',
        '22/best[height<=480]') into a selector function that, given an
        iterable of format dicts, yields the chosen format dict(s).

        Raises SyntaxError for malformed specifications.
        """
        def syntax_error(note, start):
            # start is a tokenizer (row, col) pair; the caret in the message
            # points at the offending column
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Selector node types
        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        def _parse_filter(tokens):
            # Consume tokens up to the closing ']' and return the raw
            # filter expression as a single string
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Fold consecutive names/numbers/unknown ops into one name
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parse of a comma-separated selector list;
            # the inside_* flags determine which operators end this level
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        # Fallback: try the left side first, then the right
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        # A bare filter like '[height<=480]' implies 'best'
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        # Merge: video selector on the left, audio on the right
                        video_selector = current_selector
                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
                        if not video_selector or not audio_selector:
                            raise syntax_error('"+" must be between two format selectors', start)
                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _build_selector_function(selector):
            # Turn a parsed selector tree into a function
            # formats -> generator/list of chosen format dicts
            if isinstance(selector, list):
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(formats):
                    for f in fs:
                        for format in f(formats):
                            yield format
                return selector_function
            elif selector.type == GROUP:
                selector_function = _build_selector_function(selector.selector)
            elif selector.type == PICKFIRST:
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(formats):
                    # Return the first sub-selector's result that is non-empty
                    for f in fs:
                        picked_formats = list(f(formats))
                        if picked_formats:
                            return picked_formats
                    return []
            elif selector.type == SINGLE:
                format_spec = selector.selector

                def selector_function(formats):
                    formats = list(formats)
                    if not formats:
                        return
                    # NOTE: formats are assumed sorted worst-to-best, hence
                    # index -1 for 'best*' and 0 for 'worst*'
                    if format_spec == 'all':
                        for f in formats:
                            yield f
                    elif format_spec in ['best', 'worst', None]:
                        format_idx = 0 if format_spec == 'worst' else -1
                        audiovideo_formats = [
                            f for f in formats
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            yield audiovideo_formats[format_idx]
                        # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
                        elif (all(f.get('acodec') != 'none' for f in formats) or
                              all(f.get('vcodec') != 'none' for f in formats)):
                            yield formats[format_idx]
                    elif format_spec == 'bestaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[-1]
                    elif format_spec == 'worstaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[0]
                    elif format_spec == 'bestvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[-1]
                    elif format_spec == 'worstvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[0]
                    else:
                        # Anything else is either an extension or a format_id
                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                        if format_spec in extensions:
                            filter_f = lambda f: f['ext'] == format_spec
                        else:
                            filter_f = lambda f: f['format_id'] == format_spec
                        matches = list(filter(filter_f, formats))
                        if matches:
                            yield matches[-1]
            elif selector.type == MERGE:
                def _merge(formats_info):
                    format_1, format_2 = [f['format_id'] for f in formats_info]
                    # The first format must contain the video and the
                    # second the audio
                    if formats_info[0].get('vcodec') == 'none':
                        self.report_error('The first format must '
                                          'contain the video, try using '
                                          '"-f %s+%s"' % (format_2, format_1))
                        return
                    # Formats must be opposite (video+audio)
                    if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
                        self.report_error(
                            'Both formats %s and %s are video-only, you must specify "-f video+audio"'
                            % (format_1, format_2))
                        return
                    output_ext = (
                        formats_info[0]['ext']
                        if self.params.get('merge_output_format') is None
                        else self.params['merge_output_format'])
                    # Synthesize a combined format dict: video-side metadata
                    # from formats_info[0], audio-side from formats_info[1]
                    return {
                        'requested_formats': formats_info,
                        'format': '%s+%s' % (formats_info[0].get('format'),
                                             formats_info[1].get('format')),
                        'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                formats_info[1].get('format_id')),
                        'width': formats_info[0].get('width'),
                        'height': formats_info[0].get('height'),
                        'resolution': formats_info[0].get('resolution'),
                        'fps': formats_info[0].get('fps'),
                        'vcodec': formats_info[0].get('vcodec'),
                        'vbr': formats_info[0].get('vbr'),
                        'stretched_ratio': formats_info[0].get('stretched_ratio'),
                        'acodec': formats_info[1].get('acodec'),
                        'abr': formats_info[1].get('abr'),
                        'ext': output_ext,
                    }
                video_selector, audio_selector = map(_build_selector_function, selector.selector)

                def selector_function(formats):
                    formats = list(formats)
                    for pair in itertools.product(video_selector(formats), audio_selector(formats)):
                        yield _merge(pair)

            # Apply this node's '[...]' filters before running the selector
            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(formats):
                for _filter in filters:
                    formats = list(filter(_filter, formats))
                return selector_function(formats)
            return final_selector

        # Tokenize the spec with Python's own tokenizer
        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Iterator over the token list with one-token push-back support
            # (restore_last_token), needed by the recursive parser
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            next = __next__  # Python 2 iterator protocol

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
1191
1192     def _calc_headers(self, info_dict):
1193         res = std_headers.copy()
1194
1195         add_headers = info_dict.get('http_headers')
1196         if add_headers:
1197             res.update(add_headers)
1198
1199         cookies = self._calc_cookies(info_dict)
1200         if cookies:
1201             res['Cookie'] = cookies
1202
1203         return res
1204
1205     def _calc_cookies(self, info_dict):
1206         pr = sanitized_Request(info_dict['url'])
1207         self.cookiejar.add_cookie_header(pr)
1208         return pr.get_header('Cookie')
1209
1210     def process_video_result(self, info_dict, download=True):
1211         assert info_dict.get('_type', 'video') == 'video'
1212
1213         if 'id' not in info_dict:
1214             raise ExtractorError('Missing "id" field in extractor result')
1215         if 'title' not in info_dict:
1216             raise ExtractorError('Missing "title" field in extractor result')
1217
1218         if 'playlist' not in info_dict:
1219             # It isn't part of a playlist
1220             info_dict['playlist'] = None
1221             info_dict['playlist_index'] = None
1222
1223         thumbnails = info_dict.get('thumbnails')
1224         if thumbnails is None:
1225             thumbnail = info_dict.get('thumbnail')
1226             if thumbnail:
1227                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1228         if thumbnails:
1229             thumbnails.sort(key=lambda t: (
1230                 t.get('preference'), t.get('width'), t.get('height'),
1231                 t.get('id'), t.get('url')))
1232             for i, t in enumerate(thumbnails):
1233                 t['url'] = sanitize_url(t['url'])
1234                 if t.get('width') and t.get('height'):
1235                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
1236                 if t.get('id') is None:
1237                     t['id'] = '%d' % i
1238
1239         if self.params.get('list_thumbnails'):
1240             self.list_thumbnails(info_dict)
1241             return
1242
1243         if thumbnails and 'thumbnail' not in info_dict:
1244             info_dict['thumbnail'] = thumbnails[-1]['url']
1245
1246         if 'display_id' not in info_dict and 'id' in info_dict:
1247             info_dict['display_id'] = info_dict['id']
1248
1249         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1250             # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1251             # see http://bugs.python.org/issue1646728)
1252             try:
1253                 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1254                 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1255             except (ValueError, OverflowError, OSError):
1256                 pass
1257
1258         # Auto generate title fields corresponding to the *_number fields when missing
1259         # in order to always have clean titles. This is very common for TV series.
1260         for field in ('chapter', 'season', 'episode'):
1261             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1262                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1263
1264         subtitles = info_dict.get('subtitles')
1265         if subtitles:
1266             for _, subtitle in subtitles.items():
1267                 for subtitle_format in subtitle:
1268                     if subtitle_format.get('url'):
1269                         subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1270                     if 'ext' not in subtitle_format:
1271                         subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1272
1273         if self.params.get('listsubtitles', False):
1274             if 'automatic_captions' in info_dict:
1275                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1276             self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1277             return
1278         info_dict['requested_subtitles'] = self.process_subtitles(
1279             info_dict['id'], subtitles,
1280             info_dict.get('automatic_captions'))
1281
1282         # We now pick which formats have to be downloaded
1283         if info_dict.get('formats') is None:
1284             # There's only one format available
1285             formats = [info_dict]
1286         else:
1287             formats = info_dict['formats']
1288
1289         if not formats:
1290             raise ExtractorError('No video formats found!')
1291
1292         formats_dict = {}
1293
1294         # We check that all the formats have the format and format_id fields
1295         for i, format in enumerate(formats):
1296             if 'url' not in format:
1297                 raise ExtractorError('Missing "url" key in result (index %d)' % i)
1298
1299             format['url'] = sanitize_url(format['url'])
1300
1301             if format.get('format_id') is None:
1302                 format['format_id'] = compat_str(i)
1303             else:
1304                 # Sanitize format_id from characters used in format selector expression
1305                 format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id'])
1306             format_id = format['format_id']
1307             if format_id not in formats_dict:
1308                 formats_dict[format_id] = []
1309             formats_dict[format_id].append(format)
1310
1311         # Make sure all formats have unique format_id
1312         for format_id, ambiguous_formats in formats_dict.items():
1313             if len(ambiguous_formats) > 1:
1314                 for i, format in enumerate(ambiguous_formats):
1315                     format['format_id'] = '%s-%d' % (format_id, i)
1316
1317         for i, format in enumerate(formats):
1318             if format.get('format') is None:
1319                 format['format'] = '{id} - {res}{note}'.format(
1320                     id=format['format_id'],
1321                     res=self.format_resolution(format),
1322                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1323                 )
1324             # Automatically determine file extension if missing
1325             if 'ext' not in format:
1326                 format['ext'] = determine_ext(format['url']).lower()
1327             # Automatically determine protocol if missing (useful for format
1328             # selection purposes)
1329             if 'protocol' not in format:
1330                 format['protocol'] = determine_protocol(format)
1331             # Add HTTP headers, so that external programs can use them from the
1332             # json output
1333             full_format_info = info_dict.copy()
1334             full_format_info.update(format)
1335             format['http_headers'] = self._calc_headers(full_format_info)
1336
1337         # TODO Central sorting goes here
1338
1339         if formats[0] is not info_dict:
1340             # only set the 'formats' fields if the original info_dict list them
1341             # otherwise we end up with a circular reference, the first (and unique)
1342             # element in the 'formats' field in info_dict is info_dict itself,
1343             # which can't be exported to json
1344             info_dict['formats'] = formats
1345         if self.params.get('listformats'):
1346             self.list_formats(info_dict)
1347             return
1348
1349         req_format = self.params.get('format')
1350         if req_format is None:
1351             req_format_list = []
1352             if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
1353                     not info_dict.get('is_live')):
1354                 merger = FFmpegMergerPP(self)
1355                 if merger.available and merger.can_merge():
1356                     req_format_list.append('bestvideo+bestaudio')
1357             req_format_list.append('best')
1358             req_format = '/'.join(req_format_list)
1359         format_selector = self.build_format_selector(req_format)
1360         formats_to_download = list(format_selector(formats))
1361         if not formats_to_download:
1362             raise ExtractorError('requested format not available',
1363                                  expected=True)
1364
1365         if download:
1366             if len(formats_to_download) > 1:
1367                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1368             for format in formats_to_download:
1369                 new_info = dict(info_dict)
1370                 new_info.update(format)
1371                 self.process_info(new_info)
1372         # We update the info dict with the best quality format (backwards compatibility)
1373         info_dict.update(formats_to_download[-1])
1374         return info_dict
1375
1376     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1377         """Select the requested subtitles and their format"""
1378         available_subs = {}
1379         if normal_subtitles and self.params.get('writesubtitles'):
1380             available_subs.update(normal_subtitles)
1381         if automatic_captions and self.params.get('writeautomaticsub'):
1382             for lang, cap_info in automatic_captions.items():
1383                 if lang not in available_subs:
1384                     available_subs[lang] = cap_info
1385
1386         if (not self.params.get('writesubtitles') and not
1387                 self.params.get('writeautomaticsub') or not
1388                 available_subs):
1389             return None
1390
1391         if self.params.get('allsubtitles', False):
1392             requested_langs = available_subs.keys()
1393         else:
1394             if self.params.get('subtitleslangs', False):
1395                 requested_langs = self.params.get('subtitleslangs')
1396             elif 'en' in available_subs:
1397                 requested_langs = ['en']
1398             else:
1399                 requested_langs = [list(available_subs.keys())[0]]
1400
1401         formats_query = self.params.get('subtitlesformat', 'best')
1402         formats_preference = formats_query.split('/') if formats_query else []
1403         subs = {}
1404         for lang in requested_langs:
1405             formats = available_subs.get(lang)
1406             if formats is None:
1407                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1408                 continue
1409             for ext in formats_preference:
1410                 if ext == 'best':
1411                     f = formats[-1]
1412                     break
1413                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1414                 if matches:
1415                     f = matches[-1]
1416                     break
1417             else:
1418                 f = formats[-1]
1419                 self.report_warning(
1420                     'No subtitle format found matching "%s" for language %s, '
1421                     'using %s' % (formats_query, lang, f['ext']))
1422             subs[lang] = f
1423         return subs
1424
1425     def process_info(self, info_dict):
1426         """Process a single resolved IE result."""
1427
1428         assert info_dict.get('_type', 'video') == 'video'
1429
1430         max_downloads = self.params.get('max_downloads')
1431         if max_downloads is not None:
1432             if self._num_downloads >= int(max_downloads):
1433                 raise MaxDownloadsReached()
1434
1435         info_dict['fulltitle'] = info_dict['title']
1436         if len(info_dict['title']) > 200:
1437             info_dict['title'] = info_dict['title'][:197] + '...'
1438
1439         if 'format' not in info_dict:
1440             info_dict['format'] = info_dict['ext']
1441
1442         reason = self._match_entry(info_dict, incomplete=False)
1443         if reason is not None:
1444             self.to_screen('[download] ' + reason)
1445             return
1446
1447         self._num_downloads += 1
1448
1449         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1450
1451         # Forced printings
1452         if self.params.get('forcetitle', False):
1453             self.to_stdout(info_dict['fulltitle'])
1454         if self.params.get('forceid', False):
1455             self.to_stdout(info_dict['id'])
1456         if self.params.get('forceurl', False):
1457             if info_dict.get('requested_formats') is not None:
1458                 for f in info_dict['requested_formats']:
1459                     self.to_stdout(f['url'] + f.get('play_path', ''))
1460             else:
1461                 # For RTMP URLs, also include the playpath
1462                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1463         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1464             self.to_stdout(info_dict['thumbnail'])
1465         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1466             self.to_stdout(info_dict['description'])
1467         if self.params.get('forcefilename', False) and filename is not None:
1468             self.to_stdout(filename)
1469         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1470             self.to_stdout(formatSeconds(info_dict['duration']))
1471         if self.params.get('forceformat', False):
1472             self.to_stdout(info_dict['format'])
1473         if self.params.get('forcejson', False):
1474             self.to_stdout(json.dumps(info_dict))
1475
1476         # Do nothing else if in simulate mode
1477         if self.params.get('simulate', False):
1478             return
1479
1480         if filename is None:
1481             return
1482
1483         try:
1484             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1485             if dn and not os.path.exists(dn):
1486                 os.makedirs(dn)
1487         except (OSError, IOError) as err:
1488             self.report_error('unable to create directory ' + error_to_compat_str(err))
1489             return
1490
1491         if self.params.get('writedescription', False):
1492             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1493             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1494                 self.to_screen('[info] Video description is already present')
1495             elif info_dict.get('description') is None:
1496                 self.report_warning('There\'s no description to write.')
1497             else:
1498                 try:
1499                     self.to_screen('[info] Writing video description to: ' + descfn)
1500                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1501                         descfile.write(info_dict['description'])
1502                 except (OSError, IOError):
1503                     self.report_error('Cannot write description file ' + descfn)
1504                     return
1505
1506         if self.params.get('writeannotations', False):
1507             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1508             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1509                 self.to_screen('[info] Video annotations are already present')
1510             else:
1511                 try:
1512                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1513                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1514                         annofile.write(info_dict['annotations'])
1515                 except (KeyError, TypeError):
1516                     self.report_warning('There are no annotations to write.')
1517                 except (OSError, IOError):
1518                     self.report_error('Cannot write annotations file: ' + annofn)
1519                     return
1520
1521         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1522                                        self.params.get('writeautomaticsub')])
1523
1524         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1525             # subtitles download errors are already managed as troubles in relevant IE
1526             # that way it will silently go on when used with unsupporting IE
1527             subtitles = info_dict['requested_subtitles']
1528             ie = self.get_info_extractor(info_dict['extractor_key'])
1529             for sub_lang, sub_info in subtitles.items():
1530                 sub_format = sub_info['ext']
1531                 if sub_info.get('data') is not None:
1532                     sub_data = sub_info['data']
1533                 else:
1534                     try:
1535                         sub_data = ie._download_webpage(
1536                             sub_info['url'], info_dict['id'], note=False)
1537                     except ExtractorError as err:
1538                         self.report_warning('Unable to download subtitle for "%s": %s' %
1539                                             (sub_lang, error_to_compat_str(err.cause)))
1540                         continue
1541                 try:
1542                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1543                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1544                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1545                     else:
1546                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1547                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
1548                             subfile.write(sub_data)
1549                 except (OSError, IOError):
1550                     self.report_error('Cannot write subtitles file ' + sub_filename)
1551                     return
1552
1553         if self.params.get('writeinfojson', False):
1554             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1555             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1556                 self.to_screen('[info] Video description metadata is already present')
1557             else:
1558                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1559                 try:
1560                     write_json_file(self.filter_requested_info(info_dict), infofn)
1561                 except (OSError, IOError):
1562                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1563                     return
1564
1565         self._write_thumbnails(info_dict, filename)
1566
1567         if not self.params.get('skip_download', False):
1568             try:
1569                 def dl(name, info):
1570                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1571                     for ph in self._progress_hooks:
1572                         fd.add_progress_hook(ph)
1573                     if self.params.get('verbose'):
1574                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1575                     return fd.download(name, info)
1576
1577                 if info_dict.get('requested_formats') is not None:
1578                     downloaded = []
1579                     success = True
1580                     merger = FFmpegMergerPP(self)
1581                     if not merger.available:
1582                         postprocessors = []
1583                         self.report_warning('You have requested multiple '
1584                                             'formats but ffmpeg or avconv are not installed.'
1585                                             ' The formats won\'t be merged.')
1586                     else:
1587                         postprocessors = [merger]
1588
1589                     def compatible_formats(formats):
1590                         video, audio = formats
1591                         # Check extension
1592                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1593                         if video_ext and audio_ext:
1594                             COMPATIBLE_EXTS = (
1595                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
1596                                 ('webm')
1597                             )
1598                             for exts in COMPATIBLE_EXTS:
1599                                 if video_ext in exts and audio_ext in exts:
1600                                     return True
1601                         # TODO: Check acodec/vcodec
1602                         return False
1603
1604                     filename_real_ext = os.path.splitext(filename)[1][1:]
1605                     filename_wo_ext = (
1606                         os.path.splitext(filename)[0]
1607                         if filename_real_ext == info_dict['ext']
1608                         else filename)
1609                     requested_formats = info_dict['requested_formats']
1610                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1611                         info_dict['ext'] = 'mkv'
1612                         self.report_warning(
1613                             'Requested formats are incompatible for merge and will be merged into mkv.')
1614                     # Ensure filename always has a correct extension for successful merge
1615                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1616                     if os.path.exists(encodeFilename(filename)):
1617                         self.to_screen(
1618                             '[download] %s has already been downloaded and '
1619                             'merged' % filename)
1620                     else:
1621                         for f in requested_formats:
1622                             new_info = dict(info_dict)
1623                             new_info.update(f)
1624                             fname = self.prepare_filename(new_info)
1625                             fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1626                             downloaded.append(fname)
1627                             partial_success = dl(fname, new_info)
1628                             success = success and partial_success
1629                         info_dict['__postprocessors'] = postprocessors
1630                         info_dict['__files_to_merge'] = downloaded
1631                 else:
1632                     # Just a single file
1633                     success = dl(filename, info_dict)
1634             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1635                 self.report_error('unable to download video data: %s' % str(err))
1636                 return
1637             except (OSError, IOError) as err:
1638                 raise UnavailableVideoError(err)
1639             except (ContentTooShortError, ) as err:
1640                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1641                 return
1642
1643             if success and filename != '-':
1644                 # Fixup content
1645                 fixup_policy = self.params.get('fixup')
1646                 if fixup_policy is None:
1647                     fixup_policy = 'detect_or_warn'
1648
1649                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1650
1651                 stretched_ratio = info_dict.get('stretched_ratio')
1652                 if stretched_ratio is not None and stretched_ratio != 1:
1653                     if fixup_policy == 'warn':
1654                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1655                             info_dict['id'], stretched_ratio))
1656                     elif fixup_policy == 'detect_or_warn':
1657                         stretched_pp = FFmpegFixupStretchedPP(self)
1658                         if stretched_pp.available:
1659                             info_dict.setdefault('__postprocessors', [])
1660                             info_dict['__postprocessors'].append(stretched_pp)
1661                         else:
1662                             self.report_warning(
1663                                 '%s: Non-uniform pixel ratio (%s). %s'
1664                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1665                     else:
1666                         assert fixup_policy in ('ignore', 'never')
1667
1668                 if (info_dict.get('requested_formats') is None and
1669                         info_dict.get('container') == 'm4a_dash'):
1670                     if fixup_policy == 'warn':
1671                         self.report_warning(
1672                             '%s: writing DASH m4a. '
1673                             'Only some players support this container.'
1674                             % info_dict['id'])
1675                     elif fixup_policy == 'detect_or_warn':
1676                         fixup_pp = FFmpegFixupM4aPP(self)
1677                         if fixup_pp.available:
1678                             info_dict.setdefault('__postprocessors', [])
1679                             info_dict['__postprocessors'].append(fixup_pp)
1680                         else:
1681                             self.report_warning(
1682                                 '%s: writing DASH m4a. '
1683                                 'Only some players support this container. %s'
1684                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1685                     else:
1686                         assert fixup_policy in ('ignore', 'never')
1687
1688                 if (info_dict.get('protocol') == 'm3u8_native' or
1689                         info_dict.get('protocol') == 'm3u8' and
1690                         self.params.get('hls_prefer_native')):
1691                     if fixup_policy == 'warn':
1692                         self.report_warning('%s: malformated aac bitstream.' % (
1693                             info_dict['id']))
1694                     elif fixup_policy == 'detect_or_warn':
1695                         fixup_pp = FFmpegFixupM3u8PP(self)
1696                         if fixup_pp.available:
1697                             info_dict.setdefault('__postprocessors', [])
1698                             info_dict['__postprocessors'].append(fixup_pp)
1699                         else:
1700                             self.report_warning(
1701                                 '%s: malformated aac bitstream. %s'
1702                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1703                     else:
1704                         assert fixup_policy in ('ignore', 'never')
1705
1706                 try:
1707                     self.post_process(filename, info_dict)
1708                 except (PostProcessingError) as err:
1709                     self.report_error('postprocessing: %s' % str(err))
1710                     return
1711                 self.record_download_archive(info_dict)
1712
1713     def download(self, url_list):
1714         """Download a given list of URLs."""
1715         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1716         if (len(url_list) > 1 and
1717                 '%' not in outtmpl and
1718                 self.params.get('max_downloads') != 1):
1719             raise SameFileError(outtmpl)
1720
1721         for url in url_list:
1722             try:
1723                 # It also downloads the videos
1724                 res = self.extract_info(
1725                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1726             except UnavailableVideoError:
1727                 self.report_error('unable to download video')
1728             except MaxDownloadsReached:
1729                 self.to_screen('[info] Maximum number of downloaded files reached.')
1730                 raise
1731             else:
1732                 if self.params.get('dump_single_json', False):
1733                     self.to_stdout(json.dumps(res))
1734
1735         return self._download_retcode
1736
1737     def download_with_info_file(self, info_filename):
1738         with contextlib.closing(fileinput.FileInput(
1739                 [info_filename], mode='r',
1740                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1741             # FileInput doesn't have a read method, we can't call json.load
1742             info = self.filter_requested_info(json.loads('\n'.join(f)))
1743         try:
1744             self.process_ie_result(info, download=True)
1745         except DownloadError:
1746             webpage_url = info.get('webpage_url')
1747             if webpage_url is not None:
1748                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1749                 return self.download([webpage_url])
1750             else:
1751                 raise
1752         return self._download_retcode
1753
1754     @staticmethod
1755     def filter_requested_info(info_dict):
1756         return dict(
1757             (k, v) for k, v in info_dict.items()
1758             if k not in ['requested_formats', 'requested_subtitles'])
1759
1760     def post_process(self, filename, ie_info):
1761         """Run all the postprocessors on the given file."""
1762         info = dict(ie_info)
1763         info['filepath'] = filename
1764         pps_chain = []
1765         if ie_info.get('__postprocessors') is not None:
1766             pps_chain.extend(ie_info['__postprocessors'])
1767         pps_chain.extend(self._pps)
1768         for pp in pps_chain:
1769             files_to_delete = []
1770             try:
1771                 files_to_delete, info = pp.run(info)
1772             except PostProcessingError as e:
1773                 self.report_error(e.msg)
1774             if files_to_delete and not self.params.get('keepvideo', False):
1775                 for old_filename in files_to_delete:
1776                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1777                     try:
1778                         os.remove(encodeFilename(old_filename))
1779                     except (IOError, OSError):
1780                         self.report_warning('Unable to remove downloaded original file')
1781
1782     def _make_archive_id(self, info_dict):
1783         # Future-proof against any change in case
1784         # and backwards compatibility with prior versions
1785         extractor = info_dict.get('extractor_key')
1786         if extractor is None:
1787             if 'id' in info_dict:
1788                 extractor = info_dict.get('ie_key')  # key in a playlist
1789         if extractor is None:
1790             return None  # Incomplete video information
1791         return extractor.lower() + ' ' + info_dict['id']
1792
1793     def in_download_archive(self, info_dict):
1794         fn = self.params.get('download_archive')
1795         if fn is None:
1796             return False
1797
1798         vid_id = self._make_archive_id(info_dict)
1799         if vid_id is None:
1800             return False  # Incomplete video information
1801
1802         try:
1803             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1804                 for line in archive_file:
1805                     if line.strip() == vid_id:
1806                         return True
1807         except IOError as ioe:
1808             if ioe.errno != errno.ENOENT:
1809                 raise
1810         return False
1811
1812     def record_download_archive(self, info_dict):
1813         fn = self.params.get('download_archive')
1814         if fn is None:
1815             return
1816         vid_id = self._make_archive_id(info_dict)
1817         assert vid_id
1818         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1819             archive_file.write(vid_id + '\n')
1820
1821     @staticmethod
1822     def format_resolution(format, default='unknown'):
1823         if format.get('vcodec') == 'none':
1824             return 'audio only'
1825         if format.get('resolution') is not None:
1826             return format['resolution']
1827         if format.get('height') is not None:
1828             if format.get('width') is not None:
1829                 res = '%sx%s' % (format['width'], format['height'])
1830             else:
1831                 res = '%sp' % format['height']
1832         elif format.get('width') is not None:
1833             res = '%dx?' % format['width']
1834         else:
1835             res = default
1836         return res
1837
1838     def _format_note(self, fdict):
1839         res = ''
1840         if fdict.get('ext') in ['f4f', 'f4m']:
1841             res += '(unsupported) '
1842         if fdict.get('language'):
1843             if res:
1844                 res += ' '
1845             res += '[%s] ' % fdict['language']
1846         if fdict.get('format_note') is not None:
1847             res += fdict['format_note'] + ' '
1848         if fdict.get('tbr') is not None:
1849             res += '%4dk ' % fdict['tbr']
1850         if fdict.get('container') is not None:
1851             if res:
1852                 res += ', '
1853             res += '%s container' % fdict['container']
1854         if (fdict.get('vcodec') is not None and
1855                 fdict.get('vcodec') != 'none'):
1856             if res:
1857                 res += ', '
1858             res += fdict['vcodec']
1859             if fdict.get('vbr') is not None:
1860                 res += '@'
1861         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1862             res += 'video@'
1863         if fdict.get('vbr') is not None:
1864             res += '%4dk' % fdict['vbr']
1865         if fdict.get('fps') is not None:
1866             if res:
1867                 res += ', '
1868             res += '%sfps' % fdict['fps']
1869         if fdict.get('acodec') is not None:
1870             if res:
1871                 res += ', '
1872             if fdict['acodec'] == 'none':
1873                 res += 'video only'
1874             else:
1875                 res += '%-5s' % fdict['acodec']
1876         elif fdict.get('abr') is not None:
1877             if res:
1878                 res += ', '
1879             res += 'audio'
1880         if fdict.get('abr') is not None:
1881             res += '@%3dk' % fdict['abr']
1882         if fdict.get('asr') is not None:
1883             res += ' (%5dHz)' % fdict['asr']
1884         if fdict.get('filesize') is not None:
1885             if res:
1886                 res += ', '
1887             res += format_bytes(fdict['filesize'])
1888         elif fdict.get('filesize_approx') is not None:
1889             if res:
1890                 res += ', '
1891             res += '~' + format_bytes(fdict['filesize_approx'])
1892         return res
1893
1894     def list_formats(self, info_dict):
1895         formats = info_dict.get('formats', [info_dict])
1896         table = [
1897             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
1898             for f in formats
1899             if f.get('preference') is None or f['preference'] >= -1000]
1900         if len(formats) > 1:
1901             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
1902
1903         header_line = ['format code', 'extension', 'resolution', 'note']
1904         self.to_screen(
1905             '[info] Available formats for %s:\n%s' %
1906             (info_dict['id'], render_table(header_line, table)))
1907
1908     def list_thumbnails(self, info_dict):
1909         thumbnails = info_dict.get('thumbnails')
1910         if not thumbnails:
1911             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
1912             return
1913
1914         self.to_screen(
1915             '[info] Thumbnails for %s:' % info_dict['id'])
1916         self.to_screen(render_table(
1917             ['ID', 'width', 'height', 'URL'],
1918             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1919
1920     def list_subtitles(self, video_id, subtitles, name='subtitles'):
1921         if not subtitles:
1922             self.to_screen('%s has no %s' % (video_id, name))
1923             return
1924         self.to_screen(
1925             'Available %s for %s:' % (name, video_id))
1926         self.to_screen(render_table(
1927             ['Language', 'formats'],
1928             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
1929                 for lang, formats in subtitles.items()]))
1930
1931     def urlopen(self, req):
1932         """ Start an HTTP download """
1933         if isinstance(req, compat_basestring):
1934             req = sanitized_Request(req)
1935         return self._opener.open(req, timeout=self._socket_timeout)
1936
    def print_debug_header(self):
        """Write debugging information (encodings, youtube-dl/Python/helper
        versions, proxy map, optionally the public IP address) to the debug
        output. Does nothing unless the 'verbose' param is set."""
        if not self.params.get('verbose'):
            return

        # On broken builds the native str type differs from compat_str.
        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # sys.stdout may lack an 'encoding' attribute (e.g. when replaced or
        # redirected); report its type name instead in that case.
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        # Best-effort: report the git commit when running from a checkout.
        # Any failure (no git, not a repo, ...) is deliberately swallowed.
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            # sys.exc_clear exists only on Python 2; ignore its absence.
            try:
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        # Versions of external helper executables (ffmpeg/avconv via the
        # postprocessor, plus rtmpdump); entries with a falsy version are
        # filtered out below.
        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Merge the proxy settings of every opener handler that exposes any.
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        # Opt-in phone-home: show the public IP and warn if a newer release
        # is available. Network errors here propagate to the caller.
        if self.params.get('call_home', False):
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2001
    def _setup_opener(self):
        """Build the urllib opener (stored in self._opener) used for all
        HTTP(S) requests: wires up the cookie jar, per-request proxies, the
        TLS handler, the data: scheme handler, and explicitly disables the
        file:// scheme for security."""
        timeout_val = self.params.get('socket_timeout')
        # Generous 600s default when the user did not configure a timeout.
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # In-memory jar: cookies live only for this session.
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Only load when readable; a not-yet-existing cookie file is fine.
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An explicitly empty proxy option means "use no proxy at all",
            # overriding any environment settings.
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        # debuglevel=1 makes the HTTP handlers dump raw traffic.
        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2053
2054     def encode(self, s):
2055         if isinstance(s, bytes):
2056             return s  # Already encoded
2057
2058         try:
2059             return s.encode(self.get_encoding())
2060         except UnicodeEncodeError as err:
2061             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2062             raise
2063
2064     def get_encoding(self):
2065         encoding = self.params.get('encoding')
2066         if encoding is None:
2067             encoding = preferredencoding()
2068         return encoding
2069
    def _write_thumbnails(self, info_dict, filename):
        """Download thumbnail image(s) for a video next to *filename*.

        With the 'writethumbnail' param only the last thumbnail in the list
        is fetched; with 'write_all_thumbnails' every one is. Each written
        path is recorded in the thumbnail dict under 'filename'. Download
        failures are reported as warnings, never raised.
        """
        if self.params.get('writethumbnail', False):
            thumbnails = info_dict.get('thumbnails')
            if thumbnails:
                # Takes the last entry; presumably the list is ordered
                # worst-to-best by the extractor — TODO confirm ordering.
                thumbnails = [thumbnails[-1]]
        elif self.params.get('write_all_thumbnails', False):
            thumbnails = info_dict.get('thumbnails')
        else:
            return

        if not thumbnails:
            # No thumbnails present, so return immediately
            return

        for t in thumbnails:
            # Image extension guessed from the URL; 'jpg' as the fallback.
            thumb_ext = determine_ext(t['url'], 'jpg')
            # Only disambiguate names/messages with the thumbnail id when
            # more than one file is being written.
            suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
            thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
            t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext

            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                self.to_screen('[%s] %s: Thumbnail %sis already present' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
            else:
                self.to_screen('[%s] %s: Downloading thumbnail %s...' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
                try:
                    uf = self.urlopen(t['url'])
                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                    self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
                                   (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    # Best-effort: a failed thumbnail must not abort the
                    # main download.
                    self.report_warning('Unable to download thumbnail "%s": %s' %
                                        (t['url'], error_to_compat_str(err)))