Merge branch 'douyutv' of https://github.com/bonfy/youtube-dl into bonfy-douyutv
[youtube-dl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import datetime
9 import errno
10 import fileinput
11 import io
12 import itertools
13 import json
14 import locale
15 import operator
16 import os
17 import platform
18 import re
19 import shutil
20 import subprocess
21 import socket
22 import sys
23 import time
24 import traceback
25
26 if os.name == 'nt':
27     import ctypes
28
29 from .compat import (
30     compat_basestring,
31     compat_cookiejar,
32     compat_expanduser,
33     compat_get_terminal_size,
34     compat_http_client,
35     compat_kwargs,
36     compat_str,
37     compat_urllib_error,
38     compat_urllib_request,
39 )
40 from .utils import (
41     escape_url,
42     ContentTooShortError,
43     date_from_str,
44     DateRange,
45     DEFAULT_OUTTMPL,
46     determine_ext,
47     DownloadError,
48     encodeFilename,
49     ExtractorError,
50     format_bytes,
51     formatSeconds,
52     locked_file,
53     make_HTTPS_handler,
54     MaxDownloadsReached,
55     PagedList,
56     parse_filesize,
57     PerRequestProxyHandler,
58     PostProcessingError,
59     platform_name,
60     preferredencoding,
61     render_table,
62     SameFileError,
63     sanitize_filename,
64     sanitize_path,
65     std_headers,
66     subtitles_filename,
67     takewhile_inclusive,
68     UnavailableVideoError,
69     url_basename,
70     version_tuple,
71     write_json_file,
72     write_string,
73     YoutubeDLHandler,
74     prepend_extension,
75     args_to_str,
76     age_restricted,
77 )
78 from .cache import Cache
79 from .extractor import get_info_extractor, gen_extractors
80 from .downloader import get_suitable_downloader
81 from .downloader.rtmp import rtmpdump_version
82 from .postprocessor import (
83     FFmpegFixupM4aPP,
84     FFmpegFixupStretchedPP,
85     FFmpegMergerPP,
86     FFmpegPostProcessor,
87     get_postprocessor,
88 )
89 from .version import __version__
90
91
92 class YoutubeDL(object):
93     """YoutubeDL class.
94
95     YoutubeDL objects are the ones responsible for downloading the
96     actual video file and writing it to disk if the user has requested
97     it, among some other tasks. In most cases there should be one per
98     program. As, given a video URL, the downloader doesn't know how to
99     extract all the needed information, task that InfoExtractors do, it
100     has to pass the URL to one of them.
101
102     For this, YoutubeDL objects have a method that allows
103     InfoExtractors to be registered in a given order. When it is passed
104     a URL, the YoutubeDL object hands it to the first InfoExtractor it
105     finds that reports being able to handle it. The InfoExtractor extracts
106     all the information about the video or videos the URL refers to, and
107     YoutubeDL processes the extracted information, possibly using a File
108     Downloader to download the video.
109
110     YoutubeDL objects accept a lot of parameters. In order not to saturate
111     the object constructor with arguments, it receives a dictionary of
112     options instead. These options are available through the params
113     attribute for the InfoExtractors to use. The YoutubeDL also
114     registers itself as the downloader in charge for the InfoExtractors
115     that are added to it, so this is a "mutual registration".
116
117     Available options:
118
119     username:          Username for authentication purposes.
120     password:          Password for authentication purposes.
121     videopassword:     Password for accessing a video.
122     usenetrc:          Use netrc for authentication instead.
123     verbose:           Print additional info to stdout.
124     quiet:             Do not print messages to stdout.
125     no_warnings:       Do not print out anything for warnings.
126     forceurl:          Force printing final URL.
127     forcetitle:        Force printing title.
128     forceid:           Force printing ID.
129     forcethumbnail:    Force printing thumbnail URL.
130     forcedescription:  Force printing description.
131     forcefilename:     Force printing final filename.
132     forceduration:     Force printing duration.
133     forcejson:         Force printing info_dict as JSON.
134     dump_single_json:  Force printing the info_dict of the whole playlist
135                        (or video) as a single JSON line.
136     simulate:          Do not download the video files.
137     format:            Video format code. See options.py for more information.
138     format_limit:      Highest quality format to try.
139     outtmpl:           Template for output names.
140     restrictfilenames: Do not allow "&" and spaces in file names
141     ignoreerrors:      Do not stop on download errors.
142     nooverwrites:      Prevent overwriting files.
143     playliststart:     Playlist item to start at.
144     playlistend:       Playlist item to end at.
145     playlist_items:    Specific indices of playlist to download.
146     playlistreverse:   Download playlist items in reverse order.
147     matchtitle:        Download only matching titles.
148     rejecttitle:       Reject downloads for matching titles.
149     logger:            Log messages to a logging.Logger instance.
150     logtostderr:       Log messages to stderr instead of stdout.
151     writedescription:  Write the video description to a .description file
152     writeinfojson:     Write the video description to a .info.json file
153     writeannotations:  Write the video annotations to a .annotations.xml file
154     writethumbnail:    Write the thumbnail image to a file
155     write_all_thumbnails:  Write all thumbnail formats to files
156     writesubtitles:    Write the video subtitles to a file
157     writeautomaticsub: Write the automatic subtitles to a file
158     allsubtitles:      Downloads all the subtitles of the video
159                        (requires writesubtitles or writeautomaticsub)
160     listsubtitles:     Lists all available subtitles for the video
161     subtitlesformat:   The format code for subtitles
162     subtitleslangs:    List of languages of the subtitles to download
163     keepvideo:         Keep the video file after post-processing
164     daterange:         A DateRange object, download only if the upload_date is in the range.
165     skip_download:     Skip the actual download of the video file
166     cachedir:          Location of the cache files in the filesystem.
167                        False to disable filesystem cache.
168     noplaylist:        Download single video instead of a playlist if in doubt.
169     age_limit:         An integer representing the user's age in years.
170                        Unsuitable videos for the given age are skipped.
171     min_views:         An integer representing the minimum view count the video
172                        must have in order to not be skipped.
173                        Videos without view count information are always
174                        downloaded. None for no limit.
175     max_views:         An integer representing the maximum view count.
176                        Videos that are more popular than that are not
177                        downloaded.
178                        Videos without view count information are always
179                        downloaded. None for no limit.
180     download_archive:  File name of a file where all downloads are recorded.
181                        Videos already present in the file are not downloaded
182                        again.
183     cookiefile:        File name where cookies should be read from and dumped to.
184     nocheckcertificate:Do not verify SSL certificates
185     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
186                        At the moment, this is only supported by YouTube.
187     proxy:             URL of the proxy server to use
188     cn_verification_proxy:  URL of the proxy to use for IP address verification
189                        on Chinese sites. (Experimental)
190     socket_timeout:    Time to wait for unresponsive hosts, in seconds
191     bidi_workaround:   Work around buggy terminals without bidirectional text
192                        support, using fribidi
193     debug_printtraffic:Print out sent and received HTTP traffic
194     include_ads:       Download ads as well
195     default_search:    Prepend this string if an input url is not valid.
196                        'auto' for elaborate guessing
197     encoding:          Use this encoding instead of the system-specified.
198     extract_flat:      Do not resolve URLs, return the immediate result.
199                        Pass in 'in_playlist' to only show this behavior for
200                        playlist items.
201     postprocessors:    A list of dictionaries, each with an entry
202                        * key:  The name of the postprocessor. See
203                                youtube_dl/postprocessor/__init__.py for a list.
204                        as well as any further keyword arguments for the
205                        postprocessor.
206     progress_hooks:    A list of functions that get called on download
207                        progress, with a dictionary with the entries
208                        * status: One of "downloading", "error", or "finished".
209                                  Check this first and ignore unknown values.
210
211                        If status is one of "downloading", or "finished", the
212                        following properties may also be present:
213                        * filename: The final filename (always present)
214                        * tmpfilename: The filename we're currently writing to
215                        * downloaded_bytes: Bytes on disk
216                        * total_bytes: Size of the whole file, None if unknown
217                        * total_bytes_estimate: Guess of the eventual file size,
218                                                None if unavailable.
219                        * elapsed: The number of seconds since download started.
220                        * eta: The estimated time in seconds, None if unknown
221                        * speed: The download speed in bytes/second, None if
222                                 unknown
223                        * fragment_index: The counter of the currently
224                                          downloaded video fragment.
225                        * fragment_count: The number of fragments (= individual
226                                          files that will be merged)
227
228                        Progress hooks are guaranteed to be called at least once
229                        (with status "finished") if the download is successful.
230     merge_output_format: Extension to use when merging formats.
231     fixup:             Automatically correct known faults of the file.
232                        One of:
233                        - "never": do nothing
234                        - "warn": only emit a warning
235                        - "detect_or_warn": check whether we can do anything
236                                            about it, warn otherwise (default)
237     source_address:    (Experimental) Client-side IP address to bind to.
238     call_home:         Boolean, true iff we are allowed to contact the
239                        youtube-dl servers for debugging.
240     sleep_interval:    Number of seconds to sleep before each download.
241     listformats:       Print an overview of available video formats and exit.
242     list_thumbnails:   Print a table of all thumbnails and exit.
243     match_filter:      A function that gets called with the info_dict of
244                        every video.
245                        If it returns a message, the video is ignored.
246                        If it returns None, the video is downloaded.
247                        match_filter_func in utils.py is one example for this.
248     no_color:          Do not emit color codes in output.
249
250     The following options determine which downloader is picked:
251     external_downloader: Executable of the external downloader to call.
252                        None or unset for standard (built-in) downloader.
253     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv.
254
255     The following parameters are not used by YoutubeDL itself, they are used by
256     the downloader (see youtube_dl/downloader/common.py):
257     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
258     noresizebuffer, retries, continuedl, noprogress, consoletitle,
259     xattr_set_filesize, external_downloader_args.
260
261     The following options are used by the post processors:
262     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
263                        otherwise prefer avconv.
264     exec_cmd:          Arbitrary command to run after downloading
265     """
266
267     params = None
268     _ies = []
269     _pps = []
270     _download_retcode = None
271     _num_downloads = None
272     _screen_file = None
273
274     def __init__(self, params=None, auto_init=True):
275         """Create a FileDownloader object with the given options."""
276         if params is None:
277             params = {}
278         self._ies = []
279         self._ies_instances = {}
280         self._pps = []
281         self._progress_hooks = []
282         self._download_retcode = 0
283         self._num_downloads = 0
284         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
285         self._err_file = sys.stderr
286         self.params = params
287         self.cache = Cache(self)
288
289         if params.get('bidi_workaround', False):
290             try:
291                 import pty
292                 master, slave = pty.openpty()
293                 width = compat_get_terminal_size().columns
294                 if width is None:
295                     width_args = []
296                 else:
297                     width_args = ['-w', str(width)]
298                 sp_kwargs = dict(
299                     stdin=subprocess.PIPE,
300                     stdout=slave,
301                     stderr=self._err_file)
302                 try:
303                     self._output_process = subprocess.Popen(
304                         ['bidiv'] + width_args, **sp_kwargs
305                     )
306                 except OSError:
307                     self._output_process = subprocess.Popen(
308                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
309                 self._output_channel = os.fdopen(master, 'rb')
310             except OSError as ose:
311                 if ose.errno == 2:
312                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
313                 else:
314                     raise
315
316         if (sys.version_info >= (3,) and sys.platform != 'win32' and
317                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
318                 not params.get('restrictfilenames', False)):
319             # On Python 3, the Unicode filesystem API will throw errors (#1474)
320             self.report_warning(
321                 'Assuming --restrict-filenames since file system encoding '
322                 'cannot encode all characters. '
323                 'Set the LC_ALL environment variable to fix this.')
324             self.params['restrictfilenames'] = True
325
326         if isinstance(params.get('outtmpl'), bytes):
327             self.report_warning(
328                 'Parameter outtmpl is bytes, but should be a unicode string. '
329                 'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
330
331         if '%(stitle)s' in self.params.get('outtmpl', ''):
332             self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
333
334         self._setup_opener()
335
336         if auto_init:
337             self.print_debug_header()
338             self.add_default_info_extractors()
339
340         for pp_def_raw in self.params.get('postprocessors', []):
341             pp_class = get_postprocessor(pp_def_raw['key'])
342             pp_def = dict(pp_def_raw)
343             del pp_def['key']
344             pp = pp_class(self, **compat_kwargs(pp_def))
345             self.add_post_processor(pp)
346
347         for ph in self.params.get('progress_hooks', []):
348             self.add_progress_hook(ph)
349
350     def warn_if_short_id(self, argv):
351         # short YouTube ID starting with dash?
352         idxs = [
353             i for i, a in enumerate(argv)
354             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
355         if idxs:
356             correct_argv = (
357                 ['youtube-dl'] +
358                 [a for i, a in enumerate(argv) if i not in idxs] +
359                 ['--'] + [argv[i] for i in idxs]
360             )
361             self.report_warning(
362                 'Long argument string detected. '
363                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
364                 args_to_str(correct_argv))
365
366     def add_info_extractor(self, ie):
367         """Add an InfoExtractor object to the end of the list."""
368         self._ies.append(ie)
369         self._ies_instances[ie.ie_key()] = ie
370         ie.set_downloader(self)
371
372     def get_info_extractor(self, ie_key):
373         """
374         Get an instance of an IE with name ie_key, it will try to get one from
375         the _ies list, if there's no instance it will create a new one and add
376         it to the extractor list.
377         """
378         ie = self._ies_instances.get(ie_key)
379         if ie is None:
380             ie = get_info_extractor(ie_key)()
381             self.add_info_extractor(ie)
382         return ie
383
384     def add_default_info_extractors(self):
385         """
386         Add the InfoExtractors returned by gen_extractors to the end of the list
387         """
388         for ie in gen_extractors():
389             self.add_info_extractor(ie)
390
391     def add_post_processor(self, pp):
392         """Add a PostProcessor object to the end of the chain."""
393         self._pps.append(pp)
394         pp.set_downloader(self)
395
396     def add_progress_hook(self, ph):
397         """Add the progress hook (currently only for the file downloader)"""
398         self._progress_hooks.append(ph)
399
400     def _bidi_workaround(self, message):
401         if not hasattr(self, '_output_channel'):
402             return message
403
404         assert hasattr(self, '_output_process')
405         assert isinstance(message, compat_str)
406         line_count = message.count('\n') + 1
407         self._output_process.stdin.write((message + '\n').encode('utf-8'))
408         self._output_process.stdin.flush()
409         res = ''.join(self._output_channel.readline().decode('utf-8')
410                       for _ in range(line_count))
411         return res[:-len('\n')]
412
413     def to_screen(self, message, skip_eol=False):
414         """Print message to stdout if not in quiet mode."""
415         return self.to_stdout(message, skip_eol, check_quiet=True)
416
417     def _write_string(self, s, out=None):
418         write_string(s, out=out, encoding=self.params.get('encoding'))
419
420     def to_stdout(self, message, skip_eol=False, check_quiet=False):
421         """Print message to stdout if not in quiet mode."""
422         if self.params.get('logger'):
423             self.params['logger'].debug(message)
424         elif not check_quiet or not self.params.get('quiet', False):
425             message = self._bidi_workaround(message)
426             terminator = ['\n', ''][skip_eol]
427             output = message + terminator
428
429             self._write_string(output, self._screen_file)
430
431     def to_stderr(self, message):
432         """Print message to stderr."""
433         assert isinstance(message, compat_str)
434         if self.params.get('logger'):
435             self.params['logger'].error(message)
436         else:
437             message = self._bidi_workaround(message)
438             output = message + '\n'
439             self._write_string(output, self._err_file)
440
441     def to_console_title(self, message):
442         if not self.params.get('consoletitle', False):
443             return
444         if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
445             # c_wchar_p() might not be necessary if `message` is
446             # already of type unicode()
447             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
448         elif 'TERM' in os.environ:
449             self._write_string('\033]0;%s\007' % message, self._screen_file)
450
451     def save_console_title(self):
452         if not self.params.get('consoletitle', False):
453             return
454         if 'TERM' in os.environ:
455             # Save the title on stack
456             self._write_string('\033[22;0t', self._screen_file)
457
458     def restore_console_title(self):
459         if not self.params.get('consoletitle', False):
460             return
461         if 'TERM' in os.environ:
462             # Restore the title from stack
463             self._write_string('\033[23;0t', self._screen_file)
464
465     def __enter__(self):
466         self.save_console_title()
467         return self
468
469     def __exit__(self, *args):
470         self.restore_console_title()
471
472         if self.params.get('cookiefile') is not None:
473             self.cookiejar.save()
474
475     def trouble(self, message=None, tb=None):
476         """Determine action to take when a download problem appears.
477
478         Depending on if the downloader has been configured to ignore
479         download errors or not, this method may throw an exception or
480         not when errors are found, after printing the message.
481
482         tb, if given, is additional traceback information.
483         """
484         if message is not None:
485             self.to_stderr(message)
486         if self.params.get('verbose'):
487             if tb is None:
488                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
489                     tb = ''
490                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
491                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
492                     tb += compat_str(traceback.format_exc())
493                 else:
494                     tb_data = traceback.format_list(traceback.extract_stack())
495                     tb = ''.join(tb_data)
496             self.to_stderr(tb)
497         if not self.params.get('ignoreerrors', False):
498             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
499                 exc_info = sys.exc_info()[1].exc_info
500             else:
501                 exc_info = sys.exc_info()
502             raise DownloadError(message, exc_info)
503         self._download_retcode = 1
504
505     def report_warning(self, message):
506         '''
507         Print the message to stderr, it will be prefixed with 'WARNING:'
508         If stderr is a tty file the 'WARNING:' will be colored
509         '''
510         if self.params.get('logger') is not None:
511             self.params['logger'].warning(message)
512         else:
513             if self.params.get('no_warnings'):
514                 return
515             if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
516                 _msg_header = '\033[0;33mWARNING:\033[0m'
517             else:
518                 _msg_header = 'WARNING:'
519             warning_message = '%s %s' % (_msg_header, message)
520             self.to_stderr(warning_message)
521
522     def report_error(self, message, tb=None):
523         '''
524         Do the same as trouble, but prefixes the message with 'ERROR:', colored
525         in red if stderr is a tty file.
526         '''
527         if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
528             _msg_header = '\033[0;31mERROR:\033[0m'
529         else:
530             _msg_header = 'ERROR:'
531         error_message = '%s %s' % (_msg_header, message)
532         self.trouble(error_message, tb)
533
534     def report_file_already_downloaded(self, file_name):
535         """Report file has already been fully downloaded."""
536         try:
537             self.to_screen('[download] %s has already been downloaded' % file_name)
538         except UnicodeEncodeError:
539             self.to_screen('[download] The file has already been downloaded')
540
541     def prepare_filename(self, info_dict):
542         """Generate the output filename."""
543         try:
544             template_dict = dict(info_dict)
545
546             template_dict['epoch'] = int(time.time())
547             autonumber_size = self.params.get('autonumber_size')
548             if autonumber_size is None:
549                 autonumber_size = 5
550             autonumber_templ = '%0' + str(autonumber_size) + 'd'
551             template_dict['autonumber'] = autonumber_templ % self._num_downloads
552             if template_dict.get('playlist_index') is not None:
553                 template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
554             if template_dict.get('resolution') is None:
555                 if template_dict.get('width') and template_dict.get('height'):
556                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
557                 elif template_dict.get('height'):
558                     template_dict['resolution'] = '%sp' % template_dict['height']
559                 elif template_dict.get('width'):
560                     template_dict['resolution'] = '?x%d' % template_dict['width']
561
562             sanitize = lambda k, v: sanitize_filename(
563                 compat_str(v),
564                 restricted=self.params.get('restrictfilenames'),
565                 is_id=(k == 'id'))
566             template_dict = dict((k, sanitize(k, v))
567                                  for k, v in template_dict.items()
568                                  if v is not None)
569             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
570
571             outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL))
572             tmpl = compat_expanduser(outtmpl)
573             filename = tmpl % template_dict
574             # Temporary fix for #4787
575             # 'Treat' all problem characters by passing filename through preferredencoding
576             # to workaround encoding issues with subprocess on python2 @ Windows
577             if sys.version_info < (3, 0) and sys.platform == 'win32':
578                 filename = encodeFilename(filename, True).decode(preferredencoding())
579             return filename
580         except ValueError as err:
581             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
582             return None
583
584     def _match_entry(self, info_dict, incomplete):
585         """ Returns None iff the file should be downloaded """
586
587         video_title = info_dict.get('title', info_dict.get('id', 'video'))
588         if 'title' in info_dict:
589             # This can happen when we're just evaluating the playlist
590             title = info_dict['title']
591             matchtitle = self.params.get('matchtitle', False)
592             if matchtitle:
593                 if not re.search(matchtitle, title, re.IGNORECASE):
594                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
595             rejecttitle = self.params.get('rejecttitle', False)
596             if rejecttitle:
597                 if re.search(rejecttitle, title, re.IGNORECASE):
598                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
599         date = info_dict.get('upload_date', None)
600         if date is not None:
601             dateRange = self.params.get('daterange', DateRange())
602             if date not in dateRange:
603                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
604         view_count = info_dict.get('view_count', None)
605         if view_count is not None:
606             min_views = self.params.get('min_views')
607             if min_views is not None and view_count < min_views:
608                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
609             max_views = self.params.get('max_views')
610             if max_views is not None and view_count > max_views:
611                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
612         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
613             return 'Skipping "%s" because it is age restricted' % video_title
614         if self.in_download_archive(info_dict):
615             return '%s has already been recorded in archive' % video_title
616
617         if not incomplete:
618             match_filter = self.params.get('match_filter')
619             if match_filter is not None:
620                 ret = match_filter(info_dict)
621                 if ret is not None:
622                     return ret
623
624         return None
625
626     @staticmethod
627     def add_extra_info(info_dict, extra_info):
628         '''Set the keys from extra_info in info dict if they are missing'''
629         for key, value in extra_info.items():
630             info_dict.setdefault(key, value)
631
632     def extract_info(self, url, download=True, ie_key=None, extra_info={},
633                      process=True):
634         '''
635         Returns a list with a dictionary for each video we find.
636         If 'download', also downloads the videos.
637         extra_info is a dict containing the extra values to add to each result
638         '''
639
640         if ie_key:
641             ies = [self.get_info_extractor(ie_key)]
642         else:
643             ies = self._ies
644
645         for ie in ies:
646             if not ie.suitable(url):
647                 continue
648
649             if not ie.working():
650                 self.report_warning('The program functionality for this site has been marked as broken, '
651                                     'and will probably not work.')
652
653             try:
654                 ie_result = ie.extract(url)
655                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
656                     break
657                 if isinstance(ie_result, list):
658                     # Backwards compatibility: old IE result format
659                     ie_result = {
660                         '_type': 'compat_list',
661                         'entries': ie_result,
662                     }
663                 self.add_default_extra_info(ie_result, ie, url)
664                 if process:
665                     return self.process_ie_result(ie_result, download, extra_info)
666                 else:
667                     return ie_result
668             except ExtractorError as de:  # An error we somewhat expected
669                 self.report_error(compat_str(de), de.format_traceback())
670                 break
671             except MaxDownloadsReached:
672                 raise
673             except Exception as e:
674                 if self.params.get('ignoreerrors', False):
675                     self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
676                     break
677                 else:
678                     raise
679         else:
680             self.report_error('no suitable InfoExtractor for URL %s' % url)
681
682     def add_default_extra_info(self, ie_result, ie, url):
683         self.add_extra_info(ie_result, {
684             'extractor': ie.IE_NAME,
685             'webpage_url': url,
686             'webpage_url_basename': url_basename(url),
687             'extractor_key': ie.ie_key(),
688         })
689
690     def process_ie_result(self, ie_result, download=True, extra_info={}):
691         """
692         Take the result of the ie(may be modified) and resolve all unresolved
693         references (URLs, playlist items).
694
695         It will also download the videos if 'download'.
696         Returns the resolved ie_result.
697         """
698
699         result_type = ie_result.get('_type', 'video')
700
701         if result_type in ('url', 'url_transparent'):
702             extract_flat = self.params.get('extract_flat', False)
703             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
704                     extract_flat is True):
705                 if self.params.get('forcejson', False):
706                     self.to_stdout(json.dumps(ie_result))
707                 return ie_result
708
709         if result_type == 'video':
710             self.add_extra_info(ie_result, extra_info)
711             return self.process_video_result(ie_result, download=download)
712         elif result_type == 'url':
713             # We have to add extra_info to the results because it may be
714             # contained in a playlist
715             return self.extract_info(ie_result['url'],
716                                      download,
717                                      ie_key=ie_result.get('ie_key'),
718                                      extra_info=extra_info)
719         elif result_type == 'url_transparent':
720             # Use the information from the embedding page
721             info = self.extract_info(
722                 ie_result['url'], ie_key=ie_result.get('ie_key'),
723                 extra_info=extra_info, download=False, process=False)
724
725             force_properties = dict(
726                 (k, v) for k, v in ie_result.items() if v is not None)
727             for f in ('_type', 'url'):
728                 if f in force_properties:
729                     del force_properties[f]
730             new_result = info.copy()
731             new_result.update(force_properties)
732
733             assert new_result.get('_type') != 'url_transparent'
734
735             return self.process_ie_result(
736                 new_result, download=download, extra_info=extra_info)
737         elif result_type == 'playlist' or result_type == 'multi_video':
738             # We process each entry in the playlist
739             playlist = ie_result.get('title', None) or ie_result.get('id', None)
740             self.to_screen('[download] Downloading playlist: %s' % playlist)
741
742             playlist_results = []
743
744             playliststart = self.params.get('playliststart', 1) - 1
745             playlistend = self.params.get('playlistend', None)
746             # For backwards compatibility, interpret -1 as whole list
747             if playlistend == -1:
748                 playlistend = None
749
750             playlistitems_str = self.params.get('playlist_items', None)
751             playlistitems = None
752             if playlistitems_str is not None:
753                 def iter_playlistitems(format):
754                     for string_segment in format.split(','):
755                         if '-' in string_segment:
756                             start, end = string_segment.split('-')
757                             for item in range(int(start), int(end) + 1):
758                                 yield int(item)
759                         else:
760                             yield int(string_segment)
761                 playlistitems = iter_playlistitems(playlistitems_str)
762
763             ie_entries = ie_result['entries']
764             if isinstance(ie_entries, list):
765                 n_all_entries = len(ie_entries)
766                 if playlistitems:
767                     entries = [ie_entries[i - 1] for i in playlistitems]
768                 else:
769                     entries = ie_entries[playliststart:playlistend]
770                 n_entries = len(entries)
771                 self.to_screen(
772                     "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
773                     (ie_result['extractor'], playlist, n_all_entries, n_entries))
774             elif isinstance(ie_entries, PagedList):
775                 if playlistitems:
776                     entries = []
777                     for item in playlistitems:
778                         entries.extend(ie_entries.getslice(
779                             item - 1, item
780                         ))
781                 else:
782                     entries = ie_entries.getslice(
783                         playliststart, playlistend)
784                 n_entries = len(entries)
785                 self.to_screen(
786                     "[%s] playlist %s: Downloading %d videos" %
787                     (ie_result['extractor'], playlist, n_entries))
788             else:  # iterable
789                 if playlistitems:
790                     entry_list = list(ie_entries)
791                     entries = [entry_list[i - 1] for i in playlistitems]
792                 else:
793                     entries = list(itertools.islice(
794                         ie_entries, playliststart, playlistend))
795                 n_entries = len(entries)
796                 self.to_screen(
797                     "[%s] playlist %s: Downloading %d videos" %
798                     (ie_result['extractor'], playlist, n_entries))
799
800             if self.params.get('playlistreverse', False):
801                 entries = entries[::-1]
802
803             for i, entry in enumerate(entries, 1):
804                 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
805                 extra = {
806                     'n_entries': n_entries,
807                     'playlist': playlist,
808                     'playlist_id': ie_result.get('id'),
809                     'playlist_title': ie_result.get('title'),
810                     'playlist_index': i + playliststart,
811                     'extractor': ie_result['extractor'],
812                     'webpage_url': ie_result['webpage_url'],
813                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
814                     'extractor_key': ie_result['extractor_key'],
815                 }
816
817                 reason = self._match_entry(entry, incomplete=True)
818                 if reason is not None:
819                     self.to_screen('[download] ' + reason)
820                     continue
821
822                 entry_result = self.process_ie_result(entry,
823                                                       download=download,
824                                                       extra_info=extra)
825                 playlist_results.append(entry_result)
826             ie_result['entries'] = playlist_results
827             return ie_result
828         elif result_type == 'compat_list':
829             self.report_warning(
830                 'Extractor %s returned a compat_list result. '
831                 'It needs to be updated.' % ie_result.get('extractor'))
832
833             def _fixup(r):
834                 self.add_extra_info(
835                     r,
836                     {
837                         'extractor': ie_result['extractor'],
838                         'webpage_url': ie_result['webpage_url'],
839                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
840                         'extractor_key': ie_result['extractor_key'],
841                     }
842                 )
843                 return r
844             ie_result['entries'] = [
845                 self.process_ie_result(_fixup(r), download, extra_info)
846                 for r in ie_result['entries']
847             ]
848             return ie_result
849         else:
850             raise Exception('Invalid result type: %s' % result_type)
851
852     def _apply_format_filter(self, format_spec, available_formats):
853         " Returns a tuple of the remaining format_spec and filtered formats "
854
855         OPERATORS = {
856             '<': operator.lt,
857             '<=': operator.le,
858             '>': operator.gt,
859             '>=': operator.ge,
860             '=': operator.eq,
861             '!=': operator.ne,
862         }
863         operator_rex = re.compile(r'''(?x)\s*\[
864             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
865             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
866             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
867             \]$
868             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
869         m = operator_rex.search(format_spec)
870         if m:
871             try:
872                 comparison_value = int(m.group('value'))
873             except ValueError:
874                 comparison_value = parse_filesize(m.group('value'))
875                 if comparison_value is None:
876                     comparison_value = parse_filesize(m.group('value') + 'B')
877                 if comparison_value is None:
878                     raise ValueError(
879                         'Invalid value %r in format specification %r' % (
880                             m.group('value'), format_spec))
881             op = OPERATORS[m.group('op')]
882
883         if not m:
884             STR_OPERATORS = {
885                 '=': operator.eq,
886                 '!=': operator.ne,
887             }
888             str_operator_rex = re.compile(r'''(?x)\s*\[
889                 \s*(?P<key>ext|acodec|vcodec|container|protocol)
890                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
891                 \s*(?P<value>[a-zA-Z0-9_-]+)
892                 \s*\]$
893                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
894             m = str_operator_rex.search(format_spec)
895             if m:
896                 comparison_value = m.group('value')
897                 op = STR_OPERATORS[m.group('op')]
898
899         if not m:
900             raise ValueError('Invalid format specification %r' % format_spec)
901
902         def _filter(f):
903             actual_value = f.get(m.group('key'))
904             if actual_value is None:
905                 return m.group('none_inclusive')
906             return op(actual_value, comparison_value)
907         new_formats = [f for f in available_formats if _filter(f)]
908
909         new_format_spec = format_spec[:-len(m.group(0))]
910         if not new_format_spec:
911             new_format_spec = 'best'
912
913         return (new_format_spec, new_formats)
914
915     def select_format(self, format_spec, available_formats):
916         while format_spec.endswith(']'):
917             format_spec, available_formats = self._apply_format_filter(
918                 format_spec, available_formats)
919         if not available_formats:
920             return None
921
922         if format_spec == 'best' or format_spec is None:
923             return available_formats[-1]
924         elif format_spec == 'worst':
925             return available_formats[0]
926         elif format_spec == 'bestaudio':
927             audio_formats = [
928                 f for f in available_formats
929                 if f.get('vcodec') == 'none']
930             if audio_formats:
931                 return audio_formats[-1]
932         elif format_spec == 'worstaudio':
933             audio_formats = [
934                 f for f in available_formats
935                 if f.get('vcodec') == 'none']
936             if audio_formats:
937                 return audio_formats[0]
938         elif format_spec == 'bestvideo':
939             video_formats = [
940                 f for f in available_formats
941                 if f.get('acodec') == 'none']
942             if video_formats:
943                 return video_formats[-1]
944         elif format_spec == 'worstvideo':
945             video_formats = [
946                 f for f in available_formats
947                 if f.get('acodec') == 'none']
948             if video_formats:
949                 return video_formats[0]
950         else:
951             extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
952             if format_spec in extensions:
953                 filter_f = lambda f: f['ext'] == format_spec
954             else:
955                 filter_f = lambda f: f['format_id'] == format_spec
956             matches = list(filter(filter_f, available_formats))
957             if matches:
958                 return matches[-1]
959         return None
960
961     def _calc_headers(self, info_dict):
962         res = std_headers.copy()
963
964         add_headers = info_dict.get('http_headers')
965         if add_headers:
966             res.update(add_headers)
967
968         cookies = self._calc_cookies(info_dict)
969         if cookies:
970             res['Cookie'] = cookies
971
972         return res
973
974     def _calc_cookies(self, info_dict):
975         pr = compat_urllib_request.Request(info_dict['url'])
976         self.cookiejar.add_cookie_header(pr)
977         return pr.get_header('Cookie')
978
979     def process_video_result(self, info_dict, download=True):
980         assert info_dict.get('_type', 'video') == 'video'
981
982         if 'id' not in info_dict:
983             raise ExtractorError('Missing "id" field in extractor result')
984         if 'title' not in info_dict:
985             raise ExtractorError('Missing "title" field in extractor result')
986
987         if 'playlist' not in info_dict:
988             # It isn't part of a playlist
989             info_dict['playlist'] = None
990             info_dict['playlist_index'] = None
991
992         thumbnails = info_dict.get('thumbnails')
993         if thumbnails is None:
994             thumbnail = info_dict.get('thumbnail')
995             if thumbnail:
996                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
997         if thumbnails:
998             thumbnails.sort(key=lambda t: (
999                 t.get('preference'), t.get('width'), t.get('height'),
1000                 t.get('id'), t.get('url')))
1001             for i, t in enumerate(thumbnails):
1002                 if 'width' in t and 'height' in t:
1003                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
1004                 if t.get('id') is None:
1005                     t['id'] = '%d' % i
1006
1007         if thumbnails and 'thumbnail' not in info_dict:
1008             info_dict['thumbnail'] = thumbnails[-1]['url']
1009
1010         if 'display_id' not in info_dict and 'id' in info_dict:
1011             info_dict['display_id'] = info_dict['id']
1012
1013         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1014             # Working around negative timestamps in Windows
1015             # (see http://bugs.python.org/issue1646728)
1016             if info_dict['timestamp'] < 0 and os.name == 'nt':
1017                 info_dict['timestamp'] = 0
1018             upload_date = datetime.datetime.utcfromtimestamp(
1019                 info_dict['timestamp'])
1020             info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1021
1022         if self.params.get('listsubtitles', False):
1023             if 'automatic_captions' in info_dict:
1024                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1025             self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
1026             return
1027         info_dict['requested_subtitles'] = self.process_subtitles(
1028             info_dict['id'], info_dict.get('subtitles'),
1029             info_dict.get('automatic_captions'))
1030
1031         # This extractors handle format selection themselves
1032         if info_dict['extractor'] in ['Youku']:
1033             if download:
1034                 self.process_info(info_dict)
1035             return info_dict
1036
1037         # We now pick which formats have to be downloaded
1038         if info_dict.get('formats') is None:
1039             # There's only one format available
1040             formats = [info_dict]
1041         else:
1042             formats = info_dict['formats']
1043
1044         if not formats:
1045             raise ExtractorError('No video formats found!')
1046
1047         # We check that all the formats have the format and format_id fields
1048         for i, format in enumerate(formats):
1049             if 'url' not in format:
1050                 raise ExtractorError('Missing "url" key in result (index %d)' % i)
1051
1052             if format.get('format_id') is None:
1053                 format['format_id'] = compat_str(i)
1054             if format.get('format') is None:
1055                 format['format'] = '{id} - {res}{note}'.format(
1056                     id=format['format_id'],
1057                     res=self.format_resolution(format),
1058                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1059                 )
1060             # Automatically determine file extension if missing
1061             if 'ext' not in format:
1062                 format['ext'] = determine_ext(format['url']).lower()
1063             # Add HTTP headers, so that external programs can use them from the
1064             # json output
1065             full_format_info = info_dict.copy()
1066             full_format_info.update(format)
1067             format['http_headers'] = self._calc_headers(full_format_info)
1068
1069         format_limit = self.params.get('format_limit', None)
1070         if format_limit:
1071             formats = list(takewhile_inclusive(
1072                 lambda f: f['format_id'] != format_limit, formats
1073             ))
1074
1075         # TODO Central sorting goes here
1076
1077         if formats[0] is not info_dict:
1078             # only set the 'formats' fields if the original info_dict list them
1079             # otherwise we end up with a circular reference, the first (and unique)
1080             # element in the 'formats' field in info_dict is info_dict itself,
1081             # wich can't be exported to json
1082             info_dict['formats'] = formats
1083         if self.params.get('listformats'):
1084             self.list_formats(info_dict)
1085             return
1086         if self.params.get('list_thumbnails'):
1087             self.list_thumbnails(info_dict)
1088             return
1089
1090         req_format = self.params.get('format')
1091         if req_format is None:
1092             req_format = 'best'
1093         formats_to_download = []
1094         if req_format == 'all':
1095             formats_to_download = formats
1096         else:
1097             for rfstr in req_format.split(','):
1098                 # We can accept formats requested in the format: 34/5/best, we pick
1099                 # the first that is available, starting from left
1100                 req_formats = rfstr.split('/')
1101                 for rf in req_formats:
1102                     if re.match(r'.+?\+.+?', rf) is not None:
1103                         # Two formats have been requested like '137+139'
1104                         format_1, format_2 = rf.split('+')
1105                         formats_info = (self.select_format(format_1, formats),
1106                                         self.select_format(format_2, formats))
1107                         if all(formats_info):
1108                             # The first format must contain the video and the
1109                             # second the audio
1110                             if formats_info[0].get('vcodec') == 'none':
1111                                 self.report_error('The first format must '
1112                                                   'contain the video, try using '
1113                                                   '"-f %s+%s"' % (format_2, format_1))
1114                                 return
1115                             output_ext = (
1116                                 formats_info[0]['ext']
1117                                 if self.params.get('merge_output_format') is None
1118                                 else self.params['merge_output_format'])
1119                             selected_format = {
1120                                 'requested_formats': formats_info,
1121                                 'format': '%s+%s' % (formats_info[0].get('format'),
1122                                                      formats_info[1].get('format')),
1123                                 'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1124                                                         formats_info[1].get('format_id')),
1125                                 'width': formats_info[0].get('width'),
1126                                 'height': formats_info[0].get('height'),
1127                                 'resolution': formats_info[0].get('resolution'),
1128                                 'fps': formats_info[0].get('fps'),
1129                                 'vcodec': formats_info[0].get('vcodec'),
1130                                 'vbr': formats_info[0].get('vbr'),
1131                                 'stretched_ratio': formats_info[0].get('stretched_ratio'),
1132                                 'acodec': formats_info[1].get('acodec'),
1133                                 'abr': formats_info[1].get('abr'),
1134                                 'ext': output_ext,
1135                             }
1136                         else:
1137                             selected_format = None
1138                     else:
1139                         selected_format = self.select_format(rf, formats)
1140                     if selected_format is not None:
1141                         formats_to_download.append(selected_format)
1142                         break
1143         if not formats_to_download:
1144             raise ExtractorError('requested format not available',
1145                                  expected=True)
1146
1147         if download:
1148             if len(formats_to_download) > 1:
1149                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1150             for format in formats_to_download:
1151                 new_info = dict(info_dict)
1152                 new_info.update(format)
1153                 self.process_info(new_info)
1154         # We update the info dict with the best quality format (backwards compatibility)
1155         info_dict.update(formats_to_download[-1])
1156         return info_dict
1157
1158     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1159         """Select the requested subtitles and their format"""
1160         available_subs = {}
1161         if normal_subtitles and self.params.get('writesubtitles'):
1162             available_subs.update(normal_subtitles)
1163         if automatic_captions and self.params.get('writeautomaticsub'):
1164             for lang, cap_info in automatic_captions.items():
1165                 if lang not in available_subs:
1166                     available_subs[lang] = cap_info
1167
1168         if (not self.params.get('writesubtitles') and not
1169                 self.params.get('writeautomaticsub') or not
1170                 available_subs):
1171             return None
1172
1173         if self.params.get('allsubtitles', False):
1174             requested_langs = available_subs.keys()
1175         else:
1176             if self.params.get('subtitleslangs', False):
1177                 requested_langs = self.params.get('subtitleslangs')
1178             elif 'en' in available_subs:
1179                 requested_langs = ['en']
1180             else:
1181                 requested_langs = [list(available_subs.keys())[0]]
1182
1183         formats_query = self.params.get('subtitlesformat', 'best')
1184         formats_preference = formats_query.split('/') if formats_query else []
1185         subs = {}
1186         for lang in requested_langs:
1187             formats = available_subs.get(lang)
1188             if formats is None:
1189                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1190                 continue
1191             for ext in formats_preference:
1192                 if ext == 'best':
1193                     f = formats[-1]
1194                     break
1195                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1196                 if matches:
1197                     f = matches[-1]
1198                     break
1199             else:
1200                 f = formats[-1]
1201                 self.report_warning(
1202                     'No subtitle format found matching "%s" for language %s, '
1203                     'using %s' % (formats_query, lang, f['ext']))
1204             subs[lang] = f
1205         return subs
1206
1207     def process_info(self, info_dict):
1208         """Process a single resolved IE result."""
1209
1210         assert info_dict.get('_type', 'video') == 'video'
1211
1212         max_downloads = self.params.get('max_downloads')
1213         if max_downloads is not None:
1214             if self._num_downloads >= int(max_downloads):
1215                 raise MaxDownloadsReached()
1216
1217         info_dict['fulltitle'] = info_dict['title']
1218         if len(info_dict['title']) > 200:
1219             info_dict['title'] = info_dict['title'][:197] + '...'
1220
1221         # Keep for backwards compatibility
1222         info_dict['stitle'] = info_dict['title']
1223
1224         if 'format' not in info_dict:
1225             info_dict['format'] = info_dict['ext']
1226
1227         reason = self._match_entry(info_dict, incomplete=False)
1228         if reason is not None:
1229             self.to_screen('[download] ' + reason)
1230             return
1231
1232         self._num_downloads += 1
1233
1234         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1235
1236         # Forced printings
1237         if self.params.get('forcetitle', False):
1238             self.to_stdout(info_dict['fulltitle'])
1239         if self.params.get('forceid', False):
1240             self.to_stdout(info_dict['id'])
1241         if self.params.get('forceurl', False):
1242             if info_dict.get('requested_formats') is not None:
1243                 for f in info_dict['requested_formats']:
1244                     self.to_stdout(f['url'] + f.get('play_path', ''))
1245             else:
1246                 # For RTMP URLs, also include the playpath
1247                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1248         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1249             self.to_stdout(info_dict['thumbnail'])
1250         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1251             self.to_stdout(info_dict['description'])
1252         if self.params.get('forcefilename', False) and filename is not None:
1253             self.to_stdout(filename)
1254         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1255             self.to_stdout(formatSeconds(info_dict['duration']))
1256         if self.params.get('forceformat', False):
1257             self.to_stdout(info_dict['format'])
1258         if self.params.get('forcejson', False):
1259             self.to_stdout(json.dumps(info_dict))
1260
1261         # Do nothing else if in simulate mode
1262         if self.params.get('simulate', False):
1263             return
1264
1265         if filename is None:
1266             return
1267
1268         try:
1269             dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1270             if dn and not os.path.exists(dn):
1271                 os.makedirs(dn)
1272         except (OSError, IOError) as err:
1273             self.report_error('unable to create directory ' + compat_str(err))
1274             return
1275
1276         if self.params.get('writedescription', False):
1277             descfn = filename + '.description'
1278             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1279                 self.to_screen('[info] Video description is already present')
1280             elif info_dict.get('description') is None:
1281                 self.report_warning('There\'s no description to write.')
1282             else:
1283                 try:
1284                     self.to_screen('[info] Writing video description to: ' + descfn)
1285                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1286                         descfile.write(info_dict['description'])
1287                 except (OSError, IOError):
1288                     self.report_error('Cannot write description file ' + descfn)
1289                     return
1290
1291         if self.params.get('writeannotations', False):
1292             annofn = filename + '.annotations.xml'
1293             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1294                 self.to_screen('[info] Video annotations are already present')
1295             else:
1296                 try:
1297                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1298                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1299                         annofile.write(info_dict['annotations'])
1300                 except (KeyError, TypeError):
1301                     self.report_warning('There are no annotations to write.')
1302                 except (OSError, IOError):
1303                     self.report_error('Cannot write annotations file: ' + annofn)
1304                     return
1305
1306         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1307                                        self.params.get('writeautomaticsub')])
1308
1309         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1310             # subtitles download errors are already managed as troubles in relevant IE
1311             # that way it will silently go on when used with unsupporting IE
1312             subtitles = info_dict['requested_subtitles']
1313             ie = self.get_info_extractor(info_dict['extractor_key'])
1314             for sub_lang, sub_info in subtitles.items():
1315                 sub_format = sub_info['ext']
1316                 if sub_info.get('data') is not None:
1317                     sub_data = sub_info['data']
1318                 else:
1319                     try:
1320                         sub_data = ie._download_webpage(
1321                             sub_info['url'], info_dict['id'], note=False)
1322                     except ExtractorError as err:
1323                         self.report_warning('Unable to download subtitle for "%s": %s' %
1324                                             (sub_lang, compat_str(err.cause)))
1325                         continue
1326                 try:
1327                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1328                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1329                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1330                     else:
1331                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1332                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
1333                             subfile.write(sub_data)
1334                 except (OSError, IOError):
1335                     self.report_error('Cannot write subtitles file ' + sub_filename)
1336                     return
1337
1338         if self.params.get('writeinfojson', False):
1339             infofn = os.path.splitext(filename)[0] + '.info.json'
1340             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1341                 self.to_screen('[info] Video description metadata is already present')
1342             else:
1343                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1344                 try:
1345                     write_json_file(info_dict, infofn)
1346                 except (OSError, IOError):
1347                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1348                     return
1349
1350         self._write_thumbnails(info_dict, filename)
1351
1352         if not self.params.get('skip_download', False):
1353             try:
1354                 def dl(name, info):
1355                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1356                     for ph in self._progress_hooks:
1357                         fd.add_progress_hook(ph)
1358                     if self.params.get('verbose'):
1359                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1360                     return fd.download(name, info)
1361
1362                 if info_dict.get('requested_formats') is not None:
1363                     downloaded = []
1364                     success = True
1365                     merger = FFmpegMergerPP(self, not self.params.get('keepvideo'))
1366                     if not merger.available:
1367                         postprocessors = []
1368                         self.report_warning('You have requested multiple '
1369                                             'formats but ffmpeg or avconv are not installed.'
1370                                             ' The formats won\'t be merged')
1371                     else:
1372                         postprocessors = [merger]
1373                     for f in info_dict['requested_formats']:
1374                         new_info = dict(info_dict)
1375                         new_info.update(f)
1376                         fname = self.prepare_filename(new_info)
1377                         fname = prepend_extension(fname, 'f%s' % f['format_id'])
1378                         downloaded.append(fname)
1379                         partial_success = dl(fname, new_info)
1380                         success = success and partial_success
1381                     info_dict['__postprocessors'] = postprocessors
1382                     info_dict['__files_to_merge'] = downloaded
1383                 else:
1384                     # Just a single file
1385                     success = dl(filename, info_dict)
1386             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1387                 self.report_error('unable to download video data: %s' % str(err))
1388                 return
1389             except (OSError, IOError) as err:
1390                 raise UnavailableVideoError(err)
1391             except (ContentTooShortError, ) as err:
1392                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1393                 return
1394
1395             if success:
1396                 # Fixup content
1397                 fixup_policy = self.params.get('fixup')
1398                 if fixup_policy is None:
1399                     fixup_policy = 'detect_or_warn'
1400
1401                 stretched_ratio = info_dict.get('stretched_ratio')
1402                 if stretched_ratio is not None and stretched_ratio != 1:
1403                     if fixup_policy == 'warn':
1404                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1405                             info_dict['id'], stretched_ratio))
1406                     elif fixup_policy == 'detect_or_warn':
1407                         stretched_pp = FFmpegFixupStretchedPP(self)
1408                         if stretched_pp.available:
1409                             info_dict.setdefault('__postprocessors', [])
1410                             info_dict['__postprocessors'].append(stretched_pp)
1411                         else:
1412                             self.report_warning(
1413                                 '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
1414                                     info_dict['id'], stretched_ratio))
1415                     else:
1416                         assert fixup_policy in ('ignore', 'never')
1417
1418                 if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
1419                     if fixup_policy == 'warn':
1420                         self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
1421                             info_dict['id']))
1422                     elif fixup_policy == 'detect_or_warn':
1423                         fixup_pp = FFmpegFixupM4aPP(self)
1424                         if fixup_pp.available:
1425                             info_dict.setdefault('__postprocessors', [])
1426                             info_dict['__postprocessors'].append(fixup_pp)
1427                         else:
1428                             self.report_warning(
1429                                 '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
1430                                     info_dict['id']))
1431                     else:
1432                         assert fixup_policy in ('ignore', 'never')
1433
1434                 try:
1435                     self.post_process(filename, info_dict)
1436                 except (PostProcessingError) as err:
1437                     self.report_error('postprocessing: %s' % str(err))
1438                     return
1439                 self.record_download_archive(info_dict)
1440
1441     def download(self, url_list):
1442         """Download a given list of URLs."""
1443         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1444         if (len(url_list) > 1 and
1445                 '%' not in outtmpl and
1446                 self.params.get('max_downloads') != 1):
1447             raise SameFileError(outtmpl)
1448
1449         for url in url_list:
1450             try:
1451                 # It also downloads the videos
1452                 res = self.extract_info(url)
1453             except UnavailableVideoError:
1454                 self.report_error('unable to download video')
1455             except MaxDownloadsReached:
1456                 self.to_screen('[info] Maximum number of downloaded files reached.')
1457                 raise
1458             else:
1459                 if self.params.get('dump_single_json', False):
1460                     self.to_stdout(json.dumps(res))
1461
1462         return self._download_retcode
1463
1464     def download_with_info_file(self, info_filename):
1465         with contextlib.closing(fileinput.FileInput(
1466                 [info_filename], mode='r',
1467                 openhook=fileinput.hook_encoded('utf-8'))) as f:
1468             # FileInput doesn't have a read method, we can't call json.load
1469             info = json.loads('\n'.join(f))
1470         try:
1471             self.process_ie_result(info, download=True)
1472         except DownloadError:
1473             webpage_url = info.get('webpage_url')
1474             if webpage_url is not None:
1475                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1476                 return self.download([webpage_url])
1477             else:
1478                 raise
1479         return self._download_retcode
1480
1481     def post_process(self, filename, ie_info):
1482         """Run all the postprocessors on the given file."""
1483         info = dict(ie_info)
1484         info['filepath'] = filename
1485         pps_chain = []
1486         if ie_info.get('__postprocessors') is not None:
1487             pps_chain.extend(ie_info['__postprocessors'])
1488         pps_chain.extend(self._pps)
1489         for pp in pps_chain:
1490             keep_video = None
1491             old_filename = info['filepath']
1492             try:
1493                 keep_video_wish, info = pp.run(info)
1494                 if keep_video_wish is not None:
1495                     if keep_video_wish:
1496                         keep_video = keep_video_wish
1497                     elif keep_video is None:
1498                         # No clear decision yet, let IE decide
1499                         keep_video = keep_video_wish
1500             except PostProcessingError as e:
1501                 self.report_error(e.msg)
1502             if keep_video is False and not self.params.get('keepvideo', False):
1503                 try:
1504                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1505                     os.remove(encodeFilename(old_filename))
1506                 except (IOError, OSError):
1507                     self.report_warning('Unable to remove downloaded video file')
1508
1509     def _make_archive_id(self, info_dict):
1510         # Future-proof against any change in case
1511         # and backwards compatibility with prior versions
1512         extractor = info_dict.get('extractor_key')
1513         if extractor is None:
1514             if 'id' in info_dict:
1515                 extractor = info_dict.get('ie_key')  # key in a playlist
1516         if extractor is None:
1517             return None  # Incomplete video information
1518         return extractor.lower() + ' ' + info_dict['id']
1519
1520     def in_download_archive(self, info_dict):
1521         fn = self.params.get('download_archive')
1522         if fn is None:
1523             return False
1524
1525         vid_id = self._make_archive_id(info_dict)
1526         if vid_id is None:
1527             return False  # Incomplete video information
1528
1529         try:
1530             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1531                 for line in archive_file:
1532                     if line.strip() == vid_id:
1533                         return True
1534         except IOError as ioe:
1535             if ioe.errno != errno.ENOENT:
1536                 raise
1537         return False
1538
1539     def record_download_archive(self, info_dict):
1540         fn = self.params.get('download_archive')
1541         if fn is None:
1542             return
1543         vid_id = self._make_archive_id(info_dict)
1544         assert vid_id
1545         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1546             archive_file.write(vid_id + '\n')
1547
1548     @staticmethod
1549     def format_resolution(format, default='unknown'):
1550         if format.get('vcodec') == 'none':
1551             return 'audio only'
1552         if format.get('resolution') is not None:
1553             return format['resolution']
1554         if format.get('height') is not None:
1555             if format.get('width') is not None:
1556                 res = '%sx%s' % (format['width'], format['height'])
1557             else:
1558                 res = '%sp' % format['height']
1559         elif format.get('width') is not None:
1560             res = '?x%d' % format['width']
1561         else:
1562             res = default
1563         return res
1564
1565     def _format_note(self, fdict):
1566         res = ''
1567         if fdict.get('ext') in ['f4f', 'f4m']:
1568             res += '(unsupported) '
1569         if fdict.get('format_note') is not None:
1570             res += fdict['format_note'] + ' '
1571         if fdict.get('tbr') is not None:
1572             res += '%4dk ' % fdict['tbr']
1573         if fdict.get('container') is not None:
1574             if res:
1575                 res += ', '
1576             res += '%s container' % fdict['container']
1577         if (fdict.get('vcodec') is not None and
1578                 fdict.get('vcodec') != 'none'):
1579             if res:
1580                 res += ', '
1581             res += fdict['vcodec']
1582             if fdict.get('vbr') is not None:
1583                 res += '@'
1584         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1585             res += 'video@'
1586         if fdict.get('vbr') is not None:
1587             res += '%4dk' % fdict['vbr']
1588         if fdict.get('fps') is not None:
1589             res += ', %sfps' % fdict['fps']
1590         if fdict.get('acodec') is not None:
1591             if res:
1592                 res += ', '
1593             if fdict['acodec'] == 'none':
1594                 res += 'video only'
1595             else:
1596                 res += '%-5s' % fdict['acodec']
1597         elif fdict.get('abr') is not None:
1598             if res:
1599                 res += ', '
1600             res += 'audio'
1601         if fdict.get('abr') is not None:
1602             res += '@%3dk' % fdict['abr']
1603         if fdict.get('asr') is not None:
1604             res += ' (%5dHz)' % fdict['asr']
1605         if fdict.get('filesize') is not None:
1606             if res:
1607                 res += ', '
1608             res += format_bytes(fdict['filesize'])
1609         elif fdict.get('filesize_approx') is not None:
1610             if res:
1611                 res += ', '
1612             res += '~' + format_bytes(fdict['filesize_approx'])
1613         return res
1614
1615     def list_formats(self, info_dict):
1616         formats = info_dict.get('formats', [info_dict])
1617         table = [
1618             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
1619             for f in formats
1620             if f.get('preference') is None or f['preference'] >= -1000]
1621         if len(formats) > 1:
1622             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
1623
1624         header_line = ['format code', 'extension', 'resolution', 'note']
1625         self.to_screen(
1626             '[info] Available formats for %s:\n%s' %
1627             (info_dict['id'], render_table(header_line, table)))
1628
1629     def list_thumbnails(self, info_dict):
1630         thumbnails = info_dict.get('thumbnails')
1631         if not thumbnails:
1632             tn_url = info_dict.get('thumbnail')
1633             if tn_url:
1634                 thumbnails = [{'id': '0', 'url': tn_url}]
1635             else:
1636                 self.to_screen(
1637                     '[info] No thumbnails present for %s' % info_dict['id'])
1638                 return
1639
1640         self.to_screen(
1641             '[info] Thumbnails for %s:' % info_dict['id'])
1642         self.to_screen(render_table(
1643             ['ID', 'width', 'height', 'URL'],
1644             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1645
1646     def list_subtitles(self, video_id, subtitles, name='subtitles'):
1647         if not subtitles:
1648             self.to_screen('%s has no %s' % (video_id, name))
1649             return
1650         self.to_screen(
1651             'Available %s for %s:' % (name, video_id))
1652         self.to_screen(render_table(
1653             ['Language', 'formats'],
1654             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
1655                 for lang, formats in subtitles.items()]))
1656
1657     def urlopen(self, req):
1658         """ Start an HTTP download """
1659
1660         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1661         # always respected by websites, some tend to give out URLs with non percent-encoded
1662         # non-ASCII characters (see telemb.py, ard.py [#3412])
1663         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1664         # To work around aforementioned issue we will replace request's original URL with
1665         # percent-encoded one
1666         req_is_string = isinstance(req, compat_basestring)
1667         url = req if req_is_string else req.get_full_url()
1668         url_escaped = escape_url(url)
1669
1670         # Substitute URL if any change after escaping
1671         if url != url_escaped:
1672             if req_is_string:
1673                 req = url_escaped
1674             else:
1675                 req = compat_urllib_request.Request(
1676                     url_escaped, data=req.data, headers=req.headers,
1677                     origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1678
1679         return self._opener.open(req, timeout=self._socket_timeout)
1680
    def print_debug_header(self):
        """Write the '[debug]' diagnostic header; does nothing unless 'verbose' is set.

        In order, this writes: the encodings in use, the youtube-dl version,
        the git HEAD (when it can be determined), the Python version and
        platform, the versions of the external executables, and the proxy
        map. When the 'call_home' param is set it also fetches and prints
        the public IP address and warns if a newer version is available.
        """
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # sys.stdout may lack an 'encoding' attribute; report its type instead
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        # Best effort: ask git for the short HEAD hash of the directory that
        # contains this file; any failure is silently ignored.
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            # Only print output that starts like a hexadecimal hash
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except:
            # NOTE(review): the inner guard presumably exists because
            # sys.exc_clear is not available on every Python version - confirm
            try:
                sys.exc_clear()
            except:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        # List only the executables for which a (truthy) version was found
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Merge the 'proxies' mapping of every opener handler that has one
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
1745
1746     def _setup_opener(self):
1747         timeout_val = self.params.get('socket_timeout')
1748         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
1749
1750         opts_cookiefile = self.params.get('cookiefile')
1751         opts_proxy = self.params.get('proxy')
1752
1753         if opts_cookiefile is None:
1754             self.cookiejar = compat_cookiejar.CookieJar()
1755         else:
1756             self.cookiejar = compat_cookiejar.MozillaCookieJar(
1757                 opts_cookiefile)
1758             if os.access(opts_cookiefile, os.R_OK):
1759                 self.cookiejar.load()
1760
1761         cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1762             self.cookiejar)
1763         if opts_proxy is not None:
1764             if opts_proxy == '':
1765                 proxies = {}
1766             else:
1767                 proxies = {'http': opts_proxy, 'https': opts_proxy}
1768         else:
1769             proxies = compat_urllib_request.getproxies()
1770             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1771             if 'http' in proxies and 'https' not in proxies:
1772                 proxies['https'] = proxies['http']
1773         proxy_handler = PerRequestProxyHandler(proxies)
1774
1775         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1776         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
1777         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
1778         opener = compat_urllib_request.build_opener(
1779             proxy_handler, https_handler, cookie_processor, ydlh)
1780
1781         # Delete the default user-agent header, which would otherwise apply in
1782         # cases where our custom HTTP handler doesn't come into play
1783         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1784         opener.addheaders = []
1785         self._opener = opener
1786
1787     def encode(self, s):
1788         if isinstance(s, bytes):
1789             return s  # Already encoded
1790
1791         try:
1792             return s.encode(self.get_encoding())
1793         except UnicodeEncodeError as err:
1794             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
1795             raise
1796
1797     def get_encoding(self):
1798         encoding = self.params.get('encoding')
1799         if encoding is None:
1800             encoding = preferredencoding()
1801         return encoding
1802
1803     def _write_thumbnails(self, info_dict, filename):
1804         if self.params.get('writethumbnail', False):
1805             thumbnails = info_dict.get('thumbnails')
1806             if thumbnails:
1807                 thumbnails = [thumbnails[-1]]
1808         elif self.params.get('write_all_thumbnails', False):
1809             thumbnails = info_dict.get('thumbnails')
1810         else:
1811             return
1812
1813         if not thumbnails:
1814             # No thumbnails present, so return immediately
1815             return
1816
1817         for t in thumbnails:
1818             thumb_ext = determine_ext(t['url'], 'jpg')
1819             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
1820             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
1821             thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
1822
1823             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
1824                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
1825                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
1826             else:
1827                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
1828                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
1829                 try:
1830                     uf = self.urlopen(t['url'])
1831                     with open(thumb_filename, 'wb') as thumbf:
1832                         shutil.copyfileobj(uf, thumbf)
1833                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
1834                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
1835                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1836                     self.report_warning('Unable to download thumbnail "%s": %s' %
1837                                         (t['url'], compat_str(err)))