[YoutubeDL] format spec: correctly handle dashes and other unused operators
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import datetime
9 import errno
10 import fileinput
11 import io
12 import itertools
13 import json
14 import locale
15 import operator
16 import os
17 import platform
18 import re
19 import shutil
20 import subprocess
21 import socket
22 import sys
23 import time
24 import tokenize
25 import traceback
26
27 if os.name == 'nt':
28     import ctypes
29
30 from .compat import (
31     compat_basestring,
32     compat_cookiejar,
33     compat_expanduser,
34     compat_get_terminal_size,
35     compat_http_client,
36     compat_kwargs,
37     compat_str,
38     compat_tokenize_tokenize,
39     compat_urllib_error,
40     compat_urllib_request,
41 )
42 from .utils import (
43     escape_url,
44     ContentTooShortError,
45     date_from_str,
46     DateRange,
47     DEFAULT_OUTTMPL,
48     determine_ext,
49     DownloadError,
50     encodeFilename,
51     ExtractorError,
52     format_bytes,
53     formatSeconds,
54     HEADRequest,
55     locked_file,
56     make_HTTPS_handler,
57     MaxDownloadsReached,
58     PagedList,
59     parse_filesize,
60     PerRequestProxyHandler,
61     PostProcessingError,
62     platform_name,
63     preferredencoding,
64     render_table,
65     SameFileError,
66     sanitize_filename,
67     sanitize_path,
68     std_headers,
69     subtitles_filename,
70     UnavailableVideoError,
71     url_basename,
72     version_tuple,
73     write_json_file,
74     write_string,
75     YoutubeDLHandler,
76     prepend_extension,
77     replace_extension,
78     args_to_str,
79     age_restricted,
80 )
81 from .cache import Cache
82 from .extractor import get_info_extractor, gen_extractors
83 from .downloader import get_suitable_downloader
84 from .downloader.rtmp import rtmpdump_version
85 from .postprocessor import (
86     FFmpegFixupM4aPP,
87     FFmpegFixupStretchedPP,
88     FFmpegMergerPP,
89     FFmpegPostProcessor,
90     get_postprocessor,
91 )
92 from .version import __version__
93
94
95 class YoutubeDL(object):
96     """YoutubeDL class.
97
98     YoutubeDL objects are the ones responsible of downloading the
99     actual video file and writing it to disk if the user has requested
100     it, among some other tasks. In most cases there should be one per
101     program. As, given a video URL, the downloader doesn't know how to
102     extract all the needed information, task that InfoExtractors do, it
103     has to pass the URL to one of them.
104
105     For this, YoutubeDL objects have a method that allows
106     InfoExtractors to be registered in a given order. When it is passed
107     a URL, the YoutubeDL object handles it to the first InfoExtractor it
108     finds that reports being able to handle it. The InfoExtractor extracts
109     all the information about the video or videos the URL refers to, and
110     YoutubeDL process the extracted information, possibly using a File
111     Downloader to download the video.
112
113     YoutubeDL objects accept a lot of parameters. In order not to saturate
114     the object constructor with arguments, it receives a dictionary of
115     options instead. These options are available through the params
116     attribute for the InfoExtractors to use. The YoutubeDL also
117     registers itself as the downloader in charge for the InfoExtractors
118     that are added to it, so this is a "mutual registration".
119
120     Available options:
121
122     username:          Username for authentication purposes.
123     password:          Password for authentication purposes.
124     videopassword:     Password for accessing a video.
125     usenetrc:          Use netrc for authentication instead.
126     verbose:           Print additional info to stdout.
127     quiet:             Do not print messages to stdout.
128     no_warnings:       Do not print out anything for warnings.
129     forceurl:          Force printing final URL.
130     forcetitle:        Force printing title.
131     forceid:           Force printing ID.
132     forcethumbnail:    Force printing thumbnail URL.
133     forcedescription:  Force printing description.
134     forcefilename:     Force printing final filename.
135     forceduration:     Force printing duration.
136     forcejson:         Force printing info_dict as JSON.
137     dump_single_json:  Force printing the info_dict of the whole playlist
138                        (or video) as a single JSON line.
139     simulate:          Do not download the video files.
140     format:            Video format code. See options.py for more information.
141     outtmpl:           Template for output names.
142     restrictfilenames: Do not allow "&" and spaces in file names
143     ignoreerrors:      Do not stop on download errors.
144     force_generic_extractor: Force downloader to use the generic extractor
145     nooverwrites:      Prevent overwriting files.
146     playliststart:     Playlist item to start at.
147     playlistend:       Playlist item to end at.
148     playlist_items:    Specific indices of playlist to download.
149     playlistreverse:   Download playlist items in reverse order.
150     matchtitle:        Download only matching titles.
151     rejecttitle:       Reject downloads for matching titles.
152     logger:            Log messages to a logging.Logger instance.
153     logtostderr:       Log messages to stderr instead of stdout.
154     writedescription:  Write the video description to a .description file
155     writeinfojson:     Write the video description to a .info.json file
156     writeannotations:  Write the video annotations to a .annotations.xml file
157     writethumbnail:    Write the thumbnail image to a file
158     write_all_thumbnails:  Write all thumbnail formats to files
159     writesubtitles:    Write the video subtitles to a file
160     writeautomaticsub: Write the automatic subtitles to a file
161     allsubtitles:      Downloads all the subtitles of the video
162                        (requires writesubtitles or writeautomaticsub)
163     listsubtitles:     Lists all available subtitles for the video
164     subtitlesformat:   The format code for subtitles
165     subtitleslangs:    List of languages of the subtitles to download
166     keepvideo:         Keep the video file after post-processing
167     daterange:         A DateRange object, download only if the upload_date is in the range.
168     skip_download:     Skip the actual download of the video file
169     cachedir:          Location of the cache files in the filesystem.
170                        False to disable filesystem cache.
171     noplaylist:        Download single video instead of a playlist if in doubt.
172     age_limit:         An integer representing the user's age in years.
173                        Unsuitable videos for the given age are skipped.
174     min_views:         An integer representing the minimum view count the video
175                        must have in order to not be skipped.
176                        Videos without view count information are always
177                        downloaded. None for no limit.
178     max_views:         An integer representing the maximum view count.
179                        Videos that are more popular than that are not
180                        downloaded.
181                        Videos without view count information are always
182                        downloaded. None for no limit.
183     download_archive:  File name of a file where all downloads are recorded.
184                        Videos already present in the file are not downloaded
185                        again.
186     cookiefile:        File name where cookies should be read from and dumped to.
187     nocheckcertificate:Do not verify SSL certificates
188     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
189                        At the moment, this is only supported by YouTube.
190     proxy:             URL of the proxy server to use
191     cn_verification_proxy:  URL of the proxy to use for IP address verification
192                        on Chinese sites. (Experimental)
193     socket_timeout:    Time to wait for unresponsive hosts, in seconds
194     bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
196     debug_printtraffic:Print out sent and received HTTP traffic
197     include_ads:       Download ads as well
198     default_search:    Prepend this string if an input url is not valid.
199                        'auto' for elaborate guessing
200     encoding:          Use this encoding instead of the system-specified.
201     extract_flat:      Do not resolve URLs, return the immediate result.
202                        Pass in 'in_playlist' to only show this behavior for
203                        playlist items.
204     postprocessors:    A list of dictionaries, each with an entry
205                        * key:  The name of the postprocessor. See
206                                youtube_dl/postprocessor/__init__.py for a list.
207                        as well as any further keyword arguments for the
208                        postprocessor.
209     progress_hooks:    A list of functions that get called on download
210                        progress, with a dictionary with the entries
211                        * status: One of "downloading", "error", or "finished".
212                                  Check this first and ignore unknown values.
213
214                        If status is one of "downloading", or "finished", the
215                        following properties may also be present:
216                        * filename: The final filename (always present)
217                        * tmpfilename: The filename we're currently writing to
218                        * downloaded_bytes: Bytes on disk
219                        * total_bytes: Size of the whole file, None if unknown
220                        * total_bytes_estimate: Guess of the eventual file size,
221                                                None if unavailable.
222                        * elapsed: The number of seconds since download started.
223                        * eta: The estimated time in seconds, None if unknown
224                        * speed: The download speed in bytes/second, None if
225                                 unknown
226                        * fragment_index: The counter of the currently
227                                          downloaded video fragment.
228                        * fragment_count: The number of fragments (= individual
229                                          files that will be merged)
230
231                        Progress hooks are guaranteed to be called at least once
232                        (with status "finished") if the download is successful.
233     merge_output_format: Extension to use when merging formats.
234     fixup:             Automatically correct known faults of the file.
235                        One of:
236                        - "never": do nothing
237                        - "warn": only emit a warning
238                        - "detect_or_warn": check whether we can do anything
239                                            about it, warn otherwise (default)
240     source_address:    (Experimental) Client-side IP address to bind to.
241     call_home:         Boolean, true iff we are allowed to contact the
242                        youtube-dl servers for debugging.
243     sleep_interval:    Number of seconds to sleep before each download.
244     listformats:       Print an overview of available video formats and exit.
245     list_thumbnails:   Print a table of all thumbnails and exit.
246     match_filter:      A function that gets called with the info_dict of
247                        every video.
248                        If it returns a message, the video is ignored.
249                        If it returns None, the video is downloaded.
250                        match_filter_func in utils.py is one example for this.
251     no_color:          Do not emit color codes in output.
252
253     The following options determine which downloader is picked:
254     external_downloader: Executable of the external downloader to call.
255                        None or unset for standard (built-in) downloader.
256     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv.
257
258     The following parameters are not used by YoutubeDL itself, they are used by
259     the downloader (see youtube_dl/downloader/common.py):
260     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
261     noresizebuffer, retries, continuedl, noprogress, consoletitle,
262     xattr_set_filesize, external_downloader_args.
263
264     The following options are used by the post processors:
265     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
266                        otherwise prefer avconv.
267     postprocessor_args: A list of additional command-line arguments for the
268                         postprocessor.
269     """
270
271     params = None
272     _ies = []
273     _pps = []
274     _download_retcode = None
275     _num_downloads = None
276     _screen_file = None
277
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options."""
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # The boolean 'logtostderr' indexes the pair: False -> stdout, True -> stderr.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = params
        self.cache = Cache(self)

        if params.get('bidi_workaround', False):
            try:
                import pty
                # Spawn an external bidi filter ('bidiv', falling back to
                # 'fribidi') writing to a pty, so terminals without
                # bidirectional text support still render RTL text.
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    # 'bidiv' is not installed; try fribidi instead.
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == 2:
                    # errno 2 (ENOENT): neither filter executable was found.
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.version_info >= (3,) and sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # On Python 3, the Unicode filesystem API will throw errors (#1474)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate configured postprocessors: each dict's 'key' names the
        # class; the remaining entries become constructor keyword arguments.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)
350
351     def warn_if_short_id(self, argv):
352         # short YouTube ID starting with dash?
353         idxs = [
354             i for i, a in enumerate(argv)
355             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
356         if idxs:
357             correct_argv = (
358                 ['youtube-dl'] +
359                 [a for i, a in enumerate(argv) if i not in idxs] +
360                 ['--'] + [argv[i] for i in idxs]
361             )
362             self.report_warning(
363                 'Long argument string detected. '
364                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
365                 args_to_str(correct_argv))
366
367     def add_info_extractor(self, ie):
368         """Add an InfoExtractor object to the end of the list."""
369         self._ies.append(ie)
370         self._ies_instances[ie.ie_key()] = ie
371         ie.set_downloader(self)
372
373     def get_info_extractor(self, ie_key):
374         """
375         Get an instance of an IE with name ie_key, it will try to get one from
376         the _ies list, if there's no instance it will create a new one and add
377         it to the extractor list.
378         """
379         ie = self._ies_instances.get(ie_key)
380         if ie is None:
381             ie = get_info_extractor(ie_key)()
382             self.add_info_extractor(ie)
383         return ie
384
385     def add_default_info_extractors(self):
386         """
387         Add the InfoExtractors returned by gen_extractors to the end of the list
388         """
389         for ie in gen_extractors():
390             self.add_info_extractor(ie)
391
392     def add_post_processor(self, pp):
393         """Add a PostProcessor object to the end of the chain."""
394         self._pps.append(pp)
395         pp.set_downloader(self)
396
397     def add_progress_hook(self, ph):
398         """Add the progress hook (currently only for the file downloader)"""
399         self._progress_hooks.append(ph)
400
401     def _bidi_workaround(self, message):
402         if not hasattr(self, '_output_channel'):
403             return message
404
405         assert hasattr(self, '_output_process')
406         assert isinstance(message, compat_str)
407         line_count = message.count('\n') + 1
408         self._output_process.stdin.write((message + '\n').encode('utf-8'))
409         self._output_process.stdin.flush()
410         res = ''.join(self._output_channel.readline().decode('utf-8')
411                       for _ in range(line_count))
412         return res[:-len('\n')]
413
414     def to_screen(self, message, skip_eol=False):
415         """Print message to stdout if not in quiet mode."""
416         return self.to_stdout(message, skip_eol, check_quiet=True)
417
418     def _write_string(self, s, out=None):
419         write_string(s, out=out, encoding=self.params.get('encoding'))
420
421     def to_stdout(self, message, skip_eol=False, check_quiet=False):
422         """Print message to stdout if not in quiet mode."""
423         if self.params.get('logger'):
424             self.params['logger'].debug(message)
425         elif not check_quiet or not self.params.get('quiet', False):
426             message = self._bidi_workaround(message)
427             terminator = ['\n', ''][skip_eol]
428             output = message + terminator
429
430             self._write_string(output, self._screen_file)
431
432     def to_stderr(self, message):
433         """Print message to stderr."""
434         assert isinstance(message, compat_str)
435         if self.params.get('logger'):
436             self.params['logger'].error(message)
437         else:
438             message = self._bidi_workaround(message)
439             output = message + '\n'
440             self._write_string(output, self._err_file)
441
    def to_console_title(self, message):
        """Set the console/terminal window title to *message*.

        No-op unless the 'consoletitle' option is enabled.
        """
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm escape sequence: OSC 0 sets icon name and window title.
            self._write_string('\033]0;%s\007' % message, self._screen_file)
451
452     def save_console_title(self):
453         if not self.params.get('consoletitle', False):
454             return
455         if 'TERM' in os.environ:
456             # Save the title on stack
457             self._write_string('\033[22;0t', self._screen_file)
458
459     def restore_console_title(self):
460         if not self.params.get('consoletitle', False):
461             return
462         if 'TERM' in os.environ:
463             # Restore the title from stack
464             self._write_string('\033[23;0t', self._screen_file)
465
466     def __enter__(self):
467         self.save_console_title()
468         return self
469
470     def __exit__(self, *args):
471         self.restore_console_title()
472
473         if self.params.get('cookiefile') is not None:
474             self.cookiejar.save()
475
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Exceptions that wrap another error expose the original
                    # exc_info; include that traceback first when present.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show the current call stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Prefer the wrapped exception's exc_info when the current
            # exception carries one, so the most useful traceback propagates
            # with the DownloadError.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # Errors are being ignored: record the failure in the return code.
        self._download_retcode = 1
505
506     def report_warning(self, message):
507         '''
508         Print the message to stderr, it will be prefixed with 'WARNING:'
509         If stderr is a tty file the 'WARNING:' will be colored
510         '''
511         if self.params.get('logger') is not None:
512             self.params['logger'].warning(message)
513         else:
514             if self.params.get('no_warnings'):
515                 return
516             if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
517                 _msg_header = '\033[0;33mWARNING:\033[0m'
518             else:
519                 _msg_header = 'WARNING:'
520             warning_message = '%s %s' % (_msg_header, message)
521             self.to_stderr(warning_message)
522
523     def report_error(self, message, tb=None):
524         '''
525         Do the same as trouble, but prefixes the message with 'ERROR:', colored
526         in red if stderr is a tty file.
527         '''
528         if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
529             _msg_header = '\033[0;31mERROR:\033[0m'
530         else:
531             _msg_header = 'ERROR:'
532         error_message = '%s %s' % (_msg_header, message)
533         self.trouble(error_message, tb)
534
535     def report_file_already_downloaded(self, file_name):
536         """Report file has already been fully downloaded."""
537         try:
538             self.to_screen('[download] %s has already been downloaded' % file_name)
539         except UnicodeEncodeError:
540             self.to_screen('[download] The file has already been downloaded')
541
    def prepare_filename(self, info_dict):
        """Generate the output filename by expanding the 'outtmpl' template.

        Returns the filename, or None when the template is invalid.
        """
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            # Zero-padded counter of downloads in this session, e.g. '%05d'.
            autonumber_templ = '%0' + str(autonumber_size) + 'd'
            template_dict['autonumber'] = autonumber_templ % self._num_downloads
            if template_dict.get('playlist_index') is not None:
                # Pad the playlist index to the width of the playlist length.
                template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
            if template_dict.get('resolution') is None:
                # Derive a human-readable resolution from width/height.
                if template_dict.get('width') and template_dict.get('height'):
                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
                elif template_dict.get('height'):
                    template_dict['resolution'] = '%sp' % template_dict['height']
                elif template_dict.get('width'):
                    # NOTE(review): '?x%d' renders the width in the height
                    # slot (e.g. '?x1280'); possibly should be '%dx?' —
                    # confirm against upstream before changing.
                    template_dict['resolution'] = '?x%d' % template_dict['width']

            # Sanitize every value for filesystem use; missing fields later
            # render as 'NA' via the defaultdict below.
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id'))
            template_dict = dict((k, sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None)
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)

            outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL))
            tmpl = compat_expanduser(outtmpl)
            filename = tmpl % template_dict
            # Temporary fix for #4787
            # 'Treat' all problem characters by passing filename through preferredencoding
            # to workaround encoding issues with subprocess on python2 @ Windows
            if sys.version_info < (3, 0) and sys.platform == 'win32':
                filename = encodeFilename(filename, True).decode(preferredencoding())
            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
584
585     def _match_entry(self, info_dict, incomplete):
586         """ Returns None iff the file should be downloaded """
587
588         video_title = info_dict.get('title', info_dict.get('id', 'video'))
589         if 'title' in info_dict:
590             # This can happen when we're just evaluating the playlist
591             title = info_dict['title']
592             matchtitle = self.params.get('matchtitle', False)
593             if matchtitle:
594                 if not re.search(matchtitle, title, re.IGNORECASE):
595                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
596             rejecttitle = self.params.get('rejecttitle', False)
597             if rejecttitle:
598                 if re.search(rejecttitle, title, re.IGNORECASE):
599                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
600         date = info_dict.get('upload_date', None)
601         if date is not None:
602             dateRange = self.params.get('daterange', DateRange())
603             if date not in dateRange:
604                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
605         view_count = info_dict.get('view_count', None)
606         if view_count is not None:
607             min_views = self.params.get('min_views')
608             if min_views is not None and view_count < min_views:
609                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
610             max_views = self.params.get('max_views')
611             if max_views is not None and view_count > max_views:
612                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
613         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
614             return 'Skipping "%s" because it is age restricted' % video_title
615         if self.in_download_archive(info_dict):
616             return '%s has already been recorded in archive' % video_title
617
618         if not incomplete:
619             match_filter = self.params.get('match_filter')
620             if match_filter is not None:
621                 ret = match_filter(info_dict)
622                 if ret is not None:
623                     return ret
624
625         return None
626
627     @staticmethod
628     def add_extra_info(info_dict, extra_info):
629         '''Set the keys from extra_info in info dict if they are missing'''
630         for key, value in extra_info.items():
631             info_dict.setdefault(key, value)
632
633     def extract_info(self, url, download=True, ie_key=None, extra_info={},
634                      process=True, force_generic_extractor=False):
635         '''
636         Returns a list with a dictionary for each video we find.
637         If 'download', also downloads the videos.
638         extra_info is a dict containing the extra values to add to each result
639         '''
640
641         if not ie_key and force_generic_extractor:
642             ie_key = 'Generic'
643
644         if ie_key:
645             ies = [self.get_info_extractor(ie_key)]
646         else:
647             ies = self._ies
648
649         for ie in ies:
650             if not ie.suitable(url):
651                 continue
652
653             if not ie.working():
654                 self.report_warning('The program functionality for this site has been marked as broken, '
655                                     'and will probably not work.')
656
657             try:
658                 ie_result = ie.extract(url)
659                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
660                     break
661                 if isinstance(ie_result, list):
662                     # Backwards compatibility: old IE result format
663                     ie_result = {
664                         '_type': 'compat_list',
665                         'entries': ie_result,
666                     }
667                 self.add_default_extra_info(ie_result, ie, url)
668                 if process:
669                     return self.process_ie_result(ie_result, download, extra_info)
670                 else:
671                     return ie_result
672             except ExtractorError as de:  # An error we somewhat expected
673                 self.report_error(compat_str(de), de.format_traceback())
674                 break
675             except MaxDownloadsReached:
676                 raise
677             except Exception as e:
678                 if self.params.get('ignoreerrors', False):
679                     self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
680                     break
681                 else:
682                     raise
683         else:
684             self.report_error('no suitable InfoExtractor for URL %s' % url)
685
686     def add_default_extra_info(self, ie_result, ie, url):
687         self.add_extra_info(ie_result, {
688             'extractor': ie.IE_NAME,
689             'webpage_url': url,
690             'webpage_url_basename': url_basename(url),
691             'extractor_key': ie.ie_key(),
692         })
693
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """

        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            extract_flat = self.params.get('extract_flat', False)
            # With extract_flat ('in_playlist' inside a playlist context, or
            # True always) the URL result is returned unresolved.
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # Non-None fields from the embedding page override the target's,
            # except '_type' and 'url' which must come from the target itself.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            assert new_result.get('_type') != 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist' or result_type == 'multi_video':
            # We process each entry in the playlist
            playlist = ie_result.get('title', None) or ie_result.get('id', None)
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # --playlist-start is 1-based; convert to a 0-based slice index.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend', None)
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items', None)
            playlistitems = None
            if playlistitems_str is not None:
                def iter_playlistitems(format):
                    # Expand e.g. '1,3,5-7' into the 1-based indices 1, 3, 5, 6, 7.
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            ie_entries = ie_result['entries']
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    # Requested indices outside the list are silently dropped.
                    entries = [
                        ie_entries[i - 1] for i in playlistitems
                        if -n_all_entries <= i - 1 < n_all_entries]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                # Paged lists are fetched lazily, one requested slice at a time.
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Downloading %d videos" %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                if playlistitems:
                    # NOTE(review): materializes the whole iterable and does no
                    # bounds check, so out-of-range items raise IndexError.
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Downloading %d videos" %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # Playlist-level metadata propagated into every entry.
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: entry metadata may lack fields some filters need.
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                # Attach the playlist-level metadata to each legacy entry.
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
857
858     def _build_format_filter(self, filter_spec):
859         " Returns a function to filter the formats according to the filter_spec "
860
861         OPERATORS = {
862             '<': operator.lt,
863             '<=': operator.le,
864             '>': operator.gt,
865             '>=': operator.ge,
866             '=': operator.eq,
867             '!=': operator.ne,
868         }
869         operator_rex = re.compile(r'''(?x)\s*
870             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
871             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
872             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
873             $
874             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
875         m = operator_rex.search(filter_spec)
876         if m:
877             try:
878                 comparison_value = int(m.group('value'))
879             except ValueError:
880                 comparison_value = parse_filesize(m.group('value'))
881                 if comparison_value is None:
882                     comparison_value = parse_filesize(m.group('value') + 'B')
883                 if comparison_value is None:
884                     raise ValueError(
885                         'Invalid value %r in format specification %r' % (
886                             m.group('value'), filter_spec))
887             op = OPERATORS[m.group('op')]
888
889         if not m:
890             STR_OPERATORS = {
891                 '=': operator.eq,
892                 '!=': operator.ne,
893             }
894             str_operator_rex = re.compile(r'''(?x)
895                 \s*(?P<key>ext|acodec|vcodec|container|protocol)
896                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
897                 \s*(?P<value>[a-zA-Z0-9_-]+)
898                 \s*$
899                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
900             m = str_operator_rex.search(filter_spec)
901             if m:
902                 comparison_value = m.group('value')
903                 op = STR_OPERATORS[m.group('op')]
904
905         if not m:
906             raise ValueError('Invalid filter specification %r' % filter_spec)
907
908         def _filter(f):
909             actual_value = f.get(m.group('key'))
910             if actual_value is None:
911                 return m.group('none_inclusive')
912             return op(actual_value, comparison_value)
913         return _filter
914
    def build_format_selector(self, format_spec):
        # Compile a --format specification (e.g. 'bestvideo[height<=720]+bestaudio/best')
        # into a function mapping a list of format dicts to the chosen format(s).
        def syntax_error(note, start):
            # Build (not raise) a SyntaxError pointing a caret at column start[1].
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Selector-tree node types:
        PICKFIRST = 'PICKFIRST'  # 'a/b': first alternative that yields formats
        MERGE = 'MERGE'          # 'a+b': merge a video and an audio format
        SINGLE = 'SINGLE'        # plain name: 'best', an extension, a format_id
        GROUP = 'GROUP'          # parenthesized sub-expression
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        def _parse_filter(tokens):
            # Consume tokens up to the matching ']' and return the raw filter string.
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    # Flush any accumulated name before the filter starts.
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Any other NAME/NUMBER/OP token (e.g. '-') is glued onto the
                    # accumulated string so dashed format ids survive tokenization.
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parser: returns a list of FormatSelector nodes.
            # The inside_* flags tell the recursion which operators end it.
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        # A bare filter like '[height<=480]' implicitly means 'best'.
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        video_selector = current_selector
                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
                        if not video_selector or not audio_selector:
                            raise syntax_error('"+" must be between two format selectors', start)
                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _build_selector_function(selector):
            # Turn a selector tree (or list of trees) into a function
            # formats -> iterable of selected format dicts.
            if isinstance(selector, list):
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(formats):
                    for f in fs:
                        for format in f(formats):
                            yield format
                return selector_function
            elif selector.type == GROUP:
                selector_function = _build_selector_function(selector.selector)
            elif selector.type == PICKFIRST:
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(formats):
                    # Return the first alternative that yields any formats.
                    for f in fs:
                        picked_formats = list(f(formats))
                        if picked_formats:
                            return picked_formats
                    return []
            elif selector.type == SINGLE:
                format_spec = selector.selector

                def selector_function(formats):
                    formats = list(formats)
                    if not formats:
                        return
                    if format_spec == 'all':
                        for f in formats:
                            yield f
                    elif format_spec in ['best', 'worst', None]:
                        # Formats are assumed ordered worst-to-best, so -1 is 'best'.
                        format_idx = 0 if format_spec == 'worst' else -1
                        audiovideo_formats = [
                            f for f in formats
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            yield audiovideo_formats[format_idx]
                        # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
                        elif (all(f.get('acodec') != 'none' for f in formats) or
                              all(f.get('vcodec') != 'none' for f in formats)):
                            yield formats[format_idx]
                    elif format_spec == 'bestaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[-1]
                    elif format_spec == 'worstaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[0]
                    elif format_spec == 'bestvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[-1]
                    elif format_spec == 'worstvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[0]
                    else:
                        # A known extension selects the best format with that
                        # extension; anything else is treated as a format_id.
                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                        if format_spec in extensions:
                            filter_f = lambda f: f['ext'] == format_spec
                        else:
                            filter_f = lambda f: f['format_id'] == format_spec
                        matches = list(filter(filter_f, formats))
                        if matches:
                            yield matches[-1]
            elif selector.type == MERGE:
                def _merge(formats_info):
                    # Combine a (video, audio) pair into one synthetic format
                    # dict to be downloaded and muxed together.
                    format_1, format_2 = [f['format_id'] for f in formats_info]
                    # The first format must contain the video and the
                    # second the audio
                    if formats_info[0].get('vcodec') == 'none':
                        self.report_error('The first format must '
                                          'contain the video, try using '
                                          '"-f %s+%s"' % (format_2, format_1))
                        return
                    output_ext = (
                        formats_info[0]['ext']
                        if self.params.get('merge_output_format') is None
                        else self.params['merge_output_format'])
                    return {
                        'requested_formats': formats_info,
                        'format': '%s+%s' % (formats_info[0].get('format'),
                                             formats_info[1].get('format')),
                        'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                formats_info[1].get('format_id')),
                        'width': formats_info[0].get('width'),
                        'height': formats_info[0].get('height'),
                        'resolution': formats_info[0].get('resolution'),
                        'fps': formats_info[0].get('fps'),
                        'vcodec': formats_info[0].get('vcodec'),
                        'vbr': formats_info[0].get('vbr'),
                        'stretched_ratio': formats_info[0].get('stretched_ratio'),
                        'acodec': formats_info[1].get('acodec'),
                        'abr': formats_info[1].get('abr'),
                        'ext': output_ext,
                    }
                video_selector, audio_selector = map(_build_selector_function, selector.selector)

                def selector_function(formats):
                    formats = list(formats)
                    # Every (video, audio) combination is a merge candidate.
                    for pair in itertools.product(video_selector(formats), audio_selector(formats)):
                        yield _merge(pair)

            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(formats):
                # Apply the node's '[...]' filters before running the selector.
                for _filter in filters:
                    formats = list(filter(_filter, formats))
                return selector_function(formats)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Iterator over the token list with one-token push-back support,
            # as needed by _parse_format_selection's lookahead.
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            # Python 2 iterator protocol compatibility.
            next = __next__

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
1171
1172     def _calc_headers(self, info_dict):
1173         res = std_headers.copy()
1174
1175         add_headers = info_dict.get('http_headers')
1176         if add_headers:
1177             res.update(add_headers)
1178
1179         cookies = self._calc_cookies(info_dict)
1180         if cookies:
1181             res['Cookie'] = cookies
1182
1183         return res
1184
    def _calc_cookies(self, info_dict):
        # Build a dummy request for the video URL and let the cookie jar attach
        # whatever cookies match it; return the resulting 'Cookie' header value
        # (None when no cookies apply).
        pr = compat_urllib_request.Request(info_dict['url'])
        self.cookiejar.add_cookie_header(pr)
        return pr.get_header('Cookie')
1189
    def process_video_result(self, info_dict, download=True):
        # Resolve a single-video result: validate and normalize its metadata,
        # select the requested format(s) and, if 'download', download them.
        # Mutates info_dict in place and returns it.
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            # Promote a single 'thumbnail' URL into the 'thumbnails' list form.
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            # Sort worst-to-best so [-1] below picks the preferred thumbnail.
            thumbnails.sort(key=lambda t: (
                t.get('preference'), t.get('width'), t.get('height'),
                t.get('id'), t.get('url')))
            for i, t in enumerate(thumbnails):
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if thumbnails and 'thumbnail' not in info_dict:
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
            # see http://bugs.python.org/issue1646728)
            try:
                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
            except (ValueError, OverflowError, OSError):
                pass

        if self.params.get('listsubtitles', False):
            # --list-subs: print the available subtitles and stop processing.
            if 'automatic_captions' in info_dict:
                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
            self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
            return
        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], info_dict.get('subtitles'),
            info_dict.get('automatic_captions'))

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            if 'url' not in format:
                raise ExtractorError('Missing "url" key in result (index %d)' % i)

            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                # Disambiguate duplicates by appending '-<index>'.
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if 'ext' not in format:
                format['ext'] = determine_ext(format['url']).lower()
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return
        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            # Default format: prefer bestvideo+bestaudio for sites known to
            # serve separate streams, when writing to a file and ffmpeg/avconv
            # can merge; always fall back to 'best'.
            req_format_list = []
            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
                    info_dict['extractor'] in ['youtube', 'ted'] and
                    not info_dict.get('is_live')):
                merger = FFmpegMergerPP(self)
                if merger.available and merger.can_merge():
                    req_format_list.append('bestvideo+bestaudio')
            req_format_list.append('best')
            req_format = '/'.join(req_format_list)
        format_selector = self.build_format_selector(req_format)
        formats_to_download = list(format_selector(formats))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1330
1331     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1332         """Select the requested subtitles and their format"""
1333         available_subs = {}
1334         if normal_subtitles and self.params.get('writesubtitles'):
1335             available_subs.update(normal_subtitles)
1336         if automatic_captions and self.params.get('writeautomaticsub'):
1337             for lang, cap_info in automatic_captions.items():
1338                 if lang not in available_subs:
1339                     available_subs[lang] = cap_info
1340
1341         if (not self.params.get('writesubtitles') and not
1342                 self.params.get('writeautomaticsub') or not
1343                 available_subs):
1344             return None
1345
1346         if self.params.get('allsubtitles', False):
1347             requested_langs = available_subs.keys()
1348         else:
1349             if self.params.get('subtitleslangs', False):
1350                 requested_langs = self.params.get('subtitleslangs')
1351             elif 'en' in available_subs:
1352                 requested_langs = ['en']
1353             else:
1354                 requested_langs = [list(available_subs.keys())[0]]
1355
1356         formats_query = self.params.get('subtitlesformat', 'best')
1357         formats_preference = formats_query.split('/') if formats_query else []
1358         subs = {}
1359         for lang in requested_langs:
1360             formats = available_subs.get(lang)
1361             if formats is None:
1362                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1363                 continue
1364             for ext in formats_preference:
1365                 if ext == 'best':
1366                     f = formats[-1]
1367                     break
1368                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1369                 if matches:
1370                     f = matches[-1]
1371                     break
1372             else:
1373                 f = formats[-1]
1374                 self.report_warning(
1375                     'No subtitle format found matching "%s" for language %s, '
1376                     'using %s' % (formats_query, lang, f['ext']))
1377             subs[lang] = f
1378         return subs
1379
1380     def process_info(self, info_dict):
1381         """Process a single resolved IE result."""
1382
1383         assert info_dict.get('_type', 'video') == 'video'
1384
1385         max_downloads = self.params.get('max_downloads')
1386         if max_downloads is not None:
1387             if self._num_downloads >= int(max_downloads):
1388                 raise MaxDownloadsReached()
1389
1390         info_dict['fulltitle'] = info_dict['title']
1391         if len(info_dict['title']) > 200:
1392             info_dict['title'] = info_dict['title'][:197] + '...'
1393
1394         if 'format' not in info_dict:
1395             info_dict['format'] = info_dict['ext']
1396
1397         reason = self._match_entry(info_dict, incomplete=False)
1398         if reason is not None:
1399             self.to_screen('[download] ' + reason)
1400             return
1401
1402         self._num_downloads += 1
1403
1404         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1405
1406         # Forced printings
1407         if self.params.get('forcetitle', False):
1408             self.to_stdout(info_dict['fulltitle'])
1409         if self.params.get('forceid', False):
1410             self.to_stdout(info_dict['id'])
1411         if self.params.get('forceurl', False):
1412             if info_dict.get('requested_formats') is not None:
1413                 for f in info_dict['requested_formats']:
1414                     self.to_stdout(f['url'] + f.get('play_path', ''))
1415             else:
1416                 # For RTMP URLs, also include the playpath
1417                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1418         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1419             self.to_stdout(info_dict['thumbnail'])
1420         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1421             self.to_stdout(info_dict['description'])
1422         if self.params.get('forcefilename', False) and filename is not None:
1423             self.to_stdout(filename)
1424         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1425             self.to_stdout(formatSeconds(info_dict['duration']))
1426         if self.params.get('forceformat', False):
1427             self.to_stdout(info_dict['format'])
1428         if self.params.get('forcejson', False):
1429             self.to_stdout(json.dumps(info_dict))
1430
1431         # Do nothing else if in simulate mode
1432         if self.params.get('simulate', False):
1433             return
1434
1435         if filename is None:
1436             return
1437
1438         try:
1439             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1440             if dn and not os.path.exists(dn):
1441                 os.makedirs(dn)
1442         except (OSError, IOError) as err:
1443             self.report_error('unable to create directory ' + compat_str(err))
1444             return
1445
1446         if self.params.get('writedescription', False):
1447             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1448             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1449                 self.to_screen('[info] Video description is already present')
1450             elif info_dict.get('description') is None:
1451                 self.report_warning('There\'s no description to write.')
1452             else:
1453                 try:
1454                     self.to_screen('[info] Writing video description to: ' + descfn)
1455                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1456                         descfile.write(info_dict['description'])
1457                 except (OSError, IOError):
1458                     self.report_error('Cannot write description file ' + descfn)
1459                     return
1460
1461         if self.params.get('writeannotations', False):
1462             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1463             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1464                 self.to_screen('[info] Video annotations are already present')
1465             else:
1466                 try:
1467                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1468                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1469                         annofile.write(info_dict['annotations'])
1470                 except (KeyError, TypeError):
1471                     self.report_warning('There are no annotations to write.')
1472                 except (OSError, IOError):
1473                     self.report_error('Cannot write annotations file: ' + annofn)
1474                     return
1475
1476         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1477                                        self.params.get('writeautomaticsub')])
1478
1479         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1480             # subtitles download errors are already managed as troubles in relevant IE
1481             # that way it will silently go on when used with unsupporting IE
1482             subtitles = info_dict['requested_subtitles']
1483             ie = self.get_info_extractor(info_dict['extractor_key'])
1484             for sub_lang, sub_info in subtitles.items():
1485                 sub_format = sub_info['ext']
1486                 if sub_info.get('data') is not None:
1487                     sub_data = sub_info['data']
1488                 else:
1489                     try:
1490                         sub_data = ie._download_webpage(
1491                             sub_info['url'], info_dict['id'], note=False)
1492                     except ExtractorError as err:
1493                         self.report_warning('Unable to download subtitle for "%s": %s' %
1494                                             (sub_lang, compat_str(err.cause)))
1495                         continue
1496                 try:
1497                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1498                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1499                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1500                     else:
1501                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1502                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
1503                             subfile.write(sub_data)
1504                 except (OSError, IOError):
1505                     self.report_error('Cannot write subtitles file ' + sub_filename)
1506                     return
1507
1508         if self.params.get('writeinfojson', False):
1509             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1510             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1511                 self.to_screen('[info] Video description metadata is already present')
1512             else:
1513                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1514                 try:
1515                     write_json_file(self.filter_requested_info(info_dict), infofn)
1516                 except (OSError, IOError):
1517                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1518                     return
1519
1520         self._write_thumbnails(info_dict, filename)
1521
1522         if not self.params.get('skip_download', False):
1523             try:
1524                 def dl(name, info):
1525                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1526                     for ph in self._progress_hooks:
1527                         fd.add_progress_hook(ph)
1528                     if self.params.get('verbose'):
1529                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1530                     return fd.download(name, info)
1531
1532                 if info_dict.get('requested_formats') is not None:
1533                     downloaded = []
1534                     success = True
1535                     merger = FFmpegMergerPP(self)
1536                     if not merger.available:
1537                         postprocessors = []
1538                         self.report_warning('You have requested multiple '
1539                                             'formats but ffmpeg or avconv are not installed.'
1540                                             ' The formats won\'t be merged.')
1541                     else:
1542                         postprocessors = [merger]
1543
1544                     def compatible_formats(formats):
1545                         video, audio = formats
1546                         # Check extension
1547                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1548                         if video_ext and audio_ext:
1549                             COMPATIBLE_EXTS = (
1550                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
1551                                 ('webm')
1552                             )
1553                             for exts in COMPATIBLE_EXTS:
1554                                 if video_ext in exts and audio_ext in exts:
1555                                     return True
1556                         # TODO: Check acodec/vcodec
1557                         return False
1558
1559                     filename_real_ext = os.path.splitext(filename)[1][1:]
1560                     filename_wo_ext = (
1561                         os.path.splitext(filename)[0]
1562                         if filename_real_ext == info_dict['ext']
1563                         else filename)
1564                     requested_formats = info_dict['requested_formats']
1565                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1566                         info_dict['ext'] = 'mkv'
1567                         self.report_warning(
1568                             'Requested formats are incompatible for merge and will be merged into mkv.')
1569                     # Ensure filename always has a correct extension for successful merge
1570                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1571                     if os.path.exists(encodeFilename(filename)):
1572                         self.to_screen(
1573                             '[download] %s has already been downloaded and '
1574                             'merged' % filename)
1575                     else:
1576                         for f in requested_formats:
1577                             new_info = dict(info_dict)
1578                             new_info.update(f)
1579                             fname = self.prepare_filename(new_info)
1580                             fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1581                             downloaded.append(fname)
1582                             partial_success = dl(fname, new_info)
1583                             success = success and partial_success
1584                         info_dict['__postprocessors'] = postprocessors
1585                         info_dict['__files_to_merge'] = downloaded
1586                 else:
1587                     # Just a single file
1588                     success = dl(filename, info_dict)
1589             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1590                 self.report_error('unable to download video data: %s' % str(err))
1591                 return
1592             except (OSError, IOError) as err:
1593                 raise UnavailableVideoError(err)
1594             except (ContentTooShortError, ) as err:
1595                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1596                 return
1597
1598             if success:
1599                 # Fixup content
1600                 fixup_policy = self.params.get('fixup')
1601                 if fixup_policy is None:
1602                     fixup_policy = 'detect_or_warn'
1603
1604                 stretched_ratio = info_dict.get('stretched_ratio')
1605                 if stretched_ratio is not None and stretched_ratio != 1:
1606                     if fixup_policy == 'warn':
1607                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1608                             info_dict['id'], stretched_ratio))
1609                     elif fixup_policy == 'detect_or_warn':
1610                         stretched_pp = FFmpegFixupStretchedPP(self)
1611                         if stretched_pp.available:
1612                             info_dict.setdefault('__postprocessors', [])
1613                             info_dict['__postprocessors'].append(stretched_pp)
1614                         else:
1615                             self.report_warning(
1616                                 '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
1617                                     info_dict['id'], stretched_ratio))
1618                     else:
1619                         assert fixup_policy in ('ignore', 'never')
1620
1621                 if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
1622                     if fixup_policy == 'warn':
1623                         self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
1624                             info_dict['id']))
1625                     elif fixup_policy == 'detect_or_warn':
1626                         fixup_pp = FFmpegFixupM4aPP(self)
1627                         if fixup_pp.available:
1628                             info_dict.setdefault('__postprocessors', [])
1629                             info_dict['__postprocessors'].append(fixup_pp)
1630                         else:
1631                             self.report_warning(
1632                                 '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
1633                                     info_dict['id']))
1634                     else:
1635                         assert fixup_policy in ('ignore', 'never')
1636
1637                 try:
1638                     self.post_process(filename, info_dict)
1639                 except (PostProcessingError) as err:
1640                     self.report_error('postprocessing: %s' % str(err))
1641                     return
1642                 self.record_download_archive(info_dict)
1643
1644     def download(self, url_list):
1645         """Download a given list of URLs."""
1646         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1647         if (len(url_list) > 1 and
1648                 '%' not in outtmpl and
1649                 self.params.get('max_downloads') != 1):
1650             raise SameFileError(outtmpl)
1651
1652         for url in url_list:
1653             try:
1654                 # It also downloads the videos
1655                 res = self.extract_info(
1656                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1657             except UnavailableVideoError:
1658                 self.report_error('unable to download video')
1659             except MaxDownloadsReached:
1660                 self.to_screen('[info] Maximum number of downloaded files reached.')
1661                 raise
1662             else:
1663                 if self.params.get('dump_single_json', False):
1664                     self.to_stdout(json.dumps(res))
1665
1666         return self._download_retcode
1667
1668     def download_with_info_file(self, info_filename):
1669         with contextlib.closing(fileinput.FileInput(
1670                 [info_filename], mode='r',
1671                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1672             # FileInput doesn't have a read method, we can't call json.load
1673             info = self.filter_requested_info(json.loads('\n'.join(f)))
1674         try:
1675             self.process_ie_result(info, download=True)
1676         except DownloadError:
1677             webpage_url = info.get('webpage_url')
1678             if webpage_url is not None:
1679                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1680                 return self.download([webpage_url])
1681             else:
1682                 raise
1683         return self._download_retcode
1684
1685     @staticmethod
1686     def filter_requested_info(info_dict):
1687         return dict(
1688             (k, v) for k, v in info_dict.items()
1689             if k not in ['requested_formats', 'requested_subtitles'])
1690
1691     def post_process(self, filename, ie_info):
1692         """Run all the postprocessors on the given file."""
1693         info = dict(ie_info)
1694         info['filepath'] = filename
1695         pps_chain = []
1696         if ie_info.get('__postprocessors') is not None:
1697             pps_chain.extend(ie_info['__postprocessors'])
1698         pps_chain.extend(self._pps)
1699         for pp in pps_chain:
1700             files_to_delete = []
1701             try:
1702                 files_to_delete, info = pp.run(info)
1703             except PostProcessingError as e:
1704                 self.report_error(e.msg)
1705             if files_to_delete and not self.params.get('keepvideo', False):
1706                 for old_filename in files_to_delete:
1707                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1708                     try:
1709                         os.remove(encodeFilename(old_filename))
1710                     except (IOError, OSError):
1711                         self.report_warning('Unable to remove downloaded original file')
1712
1713     def _make_archive_id(self, info_dict):
1714         # Future-proof against any change in case
1715         # and backwards compatibility with prior versions
1716         extractor = info_dict.get('extractor_key')
1717         if extractor is None:
1718             if 'id' in info_dict:
1719                 extractor = info_dict.get('ie_key')  # key in a playlist
1720         if extractor is None:
1721             return None  # Incomplete video information
1722         return extractor.lower() + ' ' + info_dict['id']
1723
1724     def in_download_archive(self, info_dict):
1725         fn = self.params.get('download_archive')
1726         if fn is None:
1727             return False
1728
1729         vid_id = self._make_archive_id(info_dict)
1730         if vid_id is None:
1731             return False  # Incomplete video information
1732
1733         try:
1734             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1735                 for line in archive_file:
1736                     if line.strip() == vid_id:
1737                         return True
1738         except IOError as ioe:
1739             if ioe.errno != errno.ENOENT:
1740                 raise
1741         return False
1742
1743     def record_download_archive(self, info_dict):
1744         fn = self.params.get('download_archive')
1745         if fn is None:
1746             return
1747         vid_id = self._make_archive_id(info_dict)
1748         assert vid_id
1749         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1750             archive_file.write(vid_id + '\n')
1751
1752     @staticmethod
1753     def format_resolution(format, default='unknown'):
1754         if format.get('vcodec') == 'none':
1755             return 'audio only'
1756         if format.get('resolution') is not None:
1757             return format['resolution']
1758         if format.get('height') is not None:
1759             if format.get('width') is not None:
1760                 res = '%sx%s' % (format['width'], format['height'])
1761             else:
1762                 res = '%sp' % format['height']
1763         elif format.get('width') is not None:
1764             res = '?x%d' % format['width']
1765         else:
1766             res = default
1767         return res
1768
    def _format_note(self, fdict):
        """Build a short human-readable note for a format dict.

        Appends, in a fixed order: format_note, total bitrate, container,
        video codec/bitrate, fps, audio codec/bitrate, sample rate and
        filesize. A ', ' separator is only inserted once `res` is non-empty,
        so the exact order of these checks determines the output.
        """
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None and
                fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            # '@' glues the video bitrate (appended below) to the codec name
            if fdict.get('vbr') is not None:
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # No video codec known, but separate video/audio bitrates are:
            # label the upcoming vbr explicitly
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            res += ', %sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            # '~' marks the size as an estimate
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
1818
1819     def list_formats(self, info_dict):
1820         formats = info_dict.get('formats', [info_dict])
1821         table = [
1822             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
1823             for f in formats
1824             if f.get('preference') is None or f['preference'] >= -1000]
1825         if len(formats) > 1:
1826             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
1827
1828         header_line = ['format code', 'extension', 'resolution', 'note']
1829         self.to_screen(
1830             '[info] Available formats for %s:\n%s' %
1831             (info_dict['id'], render_table(header_line, table)))
1832
1833     def list_thumbnails(self, info_dict):
1834         thumbnails = info_dict.get('thumbnails')
1835         if not thumbnails:
1836             tn_url = info_dict.get('thumbnail')
1837             if tn_url:
1838                 thumbnails = [{'id': '0', 'url': tn_url}]
1839             else:
1840                 self.to_screen(
1841                     '[info] No thumbnails present for %s' % info_dict['id'])
1842                 return
1843
1844         self.to_screen(
1845             '[info] Thumbnails for %s:' % info_dict['id'])
1846         self.to_screen(render_table(
1847             ['ID', 'width', 'height', 'URL'],
1848             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1849
1850     def list_subtitles(self, video_id, subtitles, name='subtitles'):
1851         if not subtitles:
1852             self.to_screen('%s has no %s' % (video_id, name))
1853             return
1854         self.to_screen(
1855             'Available %s for %s:' % (name, video_id))
1856         self.to_screen(render_table(
1857             ['Language', 'formats'],
1858             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
1859                 for lang, formats in subtitles.items()]))
1860
1861     def urlopen(self, req):
1862         """ Start an HTTP download """
1863
1864         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1865         # always respected by websites, some tend to give out URLs with non percent-encoded
1866         # non-ASCII characters (see telemb.py, ard.py [#3412])
1867         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1868         # To work around aforementioned issue we will replace request's original URL with
1869         # percent-encoded one
1870         req_is_string = isinstance(req, compat_basestring)
1871         url = req if req_is_string else req.get_full_url()
1872         url_escaped = escape_url(url)
1873
1874         # Substitute URL if any change after escaping
1875         if url != url_escaped:
1876             if req_is_string:
1877                 req = url_escaped
1878             else:
1879                 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
1880                 req = req_type(
1881                     url_escaped, data=req.data, headers=req.headers,
1882                     origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1883
1884         return self._opener.open(req, timeout=self._socket_timeout)
1885
    def print_debug_header(self):
        """Write debug information (encodings, versions, proxies) via
        write_string/_write_string; no-op unless the 'verbose' option is set."""
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # sys.stdout may lack an 'encoding' attribute (e.g. when replaced)
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        try:
            # Report the git commit when running from a source checkout;
            # silently skipped when git or the repository is unavailable
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            try:
                # sys.exc_clear only exists on Python 2
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        # Versions of the external programs we may invoke (ffmpeg/avconv/
        # rtmpdump); only those actually found are listed
        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Collect the proxies from every handler installed on the opener
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # Opt-in: report the public IP and check for a newer release
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
1950
1951     def _setup_opener(self):
1952         timeout_val = self.params.get('socket_timeout')
1953         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
1954
1955         opts_cookiefile = self.params.get('cookiefile')
1956         opts_proxy = self.params.get('proxy')
1957
1958         if opts_cookiefile is None:
1959             self.cookiejar = compat_cookiejar.CookieJar()
1960         else:
1961             self.cookiejar = compat_cookiejar.MozillaCookieJar(
1962                 opts_cookiefile)
1963             if os.access(opts_cookiefile, os.R_OK):
1964                 self.cookiejar.load()
1965
1966         cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1967             self.cookiejar)
1968         if opts_proxy is not None:
1969             if opts_proxy == '':
1970                 proxies = {}
1971             else:
1972                 proxies = {'http': opts_proxy, 'https': opts_proxy}
1973         else:
1974             proxies = compat_urllib_request.getproxies()
1975             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1976             if 'http' in proxies and 'https' not in proxies:
1977                 proxies['https'] = proxies['http']
1978         proxy_handler = PerRequestProxyHandler(proxies)
1979
1980         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1981         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
1982         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
1983         opener = compat_urllib_request.build_opener(
1984             proxy_handler, https_handler, cookie_processor, ydlh)
1985
1986         # Delete the default user-agent header, which would otherwise apply in
1987         # cases where our custom HTTP handler doesn't come into play
1988         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1989         opener.addheaders = []
1990         self._opener = opener
1991
1992     def encode(self, s):
1993         if isinstance(s, bytes):
1994             return s  # Already encoded
1995
1996         try:
1997             return s.encode(self.get_encoding())
1998         except UnicodeEncodeError as err:
1999             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2000             raise
2001
2002     def get_encoding(self):
2003         encoding = self.params.get('encoding')
2004         if encoding is None:
2005             encoding = preferredencoding()
2006         return encoding
2007
2008     def _write_thumbnails(self, info_dict, filename):
2009         if self.params.get('writethumbnail', False):
2010             thumbnails = info_dict.get('thumbnails')
2011             if thumbnails:
2012                 thumbnails = [thumbnails[-1]]
2013         elif self.params.get('write_all_thumbnails', False):
2014             thumbnails = info_dict.get('thumbnails')
2015         else:
2016             return
2017
2018         if not thumbnails:
2019             # No thumbnails present, so return immediately
2020             return
2021
2022         for t in thumbnails:
2023             thumb_ext = determine_ext(t['url'], 'jpg')
2024             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2025             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2026             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2027
2028             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2029                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2030                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2031             else:
2032                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2033                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2034                 try:
2035                     uf = self.urlopen(t['url'])
2036                     with open(thumb_filename, 'wb') as thumbf:
2037                         shutil.copyfileobj(uf, thumbf)
2038                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2039                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2040                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2041                     self.report_warning('Unable to download thumbnail "%s": %s' %
2042                                         (t['url'], compat_str(err)))