_ Git - youtube-dl/blob - youtube_dl/YoutubeDL.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import, unicode_literals
   5
   6 import collections
   7 import datetime
   8 import errno
   9 import io
  10 import itertools
  11 import json
  12 import locale
  13 import operator
  14 import os
  15 import platform
  16 import re
  17 import shutil
  18 import subprocess
  19 import socket
  20 import sys
  21 import time
  22 import traceback
  23
  24 if os.name == 'nt':
  25     import ctypes
  26
  27 from .compat import (
  28     compat_cookiejar,
  29     compat_expanduser,
  30     compat_http_client,
  31     compat_kwargs,
  32     compat_str,
  33     compat_urllib_error,
  34     compat_urllib_request,
  35 )
  36 from .utils import (
  37     escape_url,
  38     ContentTooShortError,
  39     date_from_str,
  40     DateRange,
  41     DEFAULT_OUTTMPL,
  42     determine_ext,
  43     DownloadError,
  44     encodeFilename,
  45     ExtractorError,
  46     format_bytes,
  47     formatSeconds,
  48     get_term_width,
  49     locked_file,
  50     make_HTTPS_handler,
  51     MaxDownloadsReached,
  52     PagedList,
  53     parse_filesize,
  54     PostProcessingError,
  55     platform_name,
  56     preferredencoding,
  57     render_table,
  58     SameFileError,
  59     sanitize_filename,
  60     std_headers,
  61     subtitles_filename,
  62     takewhile_inclusive,
  63     UnavailableVideoError,
  64     url_basename,
  65     version_tuple,
  66     write_json_file,
  67     write_string,
  68     YoutubeDLHandler,
  69     prepend_extension,
  70     args_to_str,
  71     age_restricted,
  72 )
  73 from .cache import Cache
  74 from .extractor import get_info_extractor, gen_extractors
  75 from .downloader import get_suitable_downloader
  76 from .downloader.rtmp import rtmpdump_version
  77 from .postprocessor import (
  78     FFmpegFixupM4aPP,
  79     FFmpegFixupStretchedPP,
  80     FFmpegMergerPP,
  81     FFmpegPostProcessor,
  82     get_postprocessor,
  83 )
  84 from .version import __version__
  85
  86
  87 class YoutubeDL(object):
  88     """YoutubeDL class.
  89
   90     YoutubeDL objects are the ones responsible for downloading the
  91     actual video file and writing it to disk if the user has requested
  92     it, among some other tasks. In most cases there should be one per
  93     program. As, given a video URL, the downloader doesn't know how to
  94     extract all the needed information, task that InfoExtractors do, it
  95     has to pass the URL to one of them.
  96
  97     For this, YoutubeDL objects have a method that allows
  98     InfoExtractors to be registered in a given order. When it is passed
   99     a URL, the YoutubeDL object hands it to the first InfoExtractor it
 100     finds that reports being able to handle it. The InfoExtractor extracts
 101     all the information about the video or videos the URL refers to, and
  102     YoutubeDL processes the extracted information, possibly using a File
 103     Downloader to download the video.
 104
 105     YoutubeDL objects accept a lot of parameters. In order not to saturate
 106     the object constructor with arguments, it receives a dictionary of
 107     options instead. These options are available through the params
 108     attribute for the InfoExtractors to use. The YoutubeDL also
 109     registers itself as the downloader in charge for the InfoExtractors
 110     that are added to it, so this is a "mutual registration".
 111
 112     Available options:
 113
 114     username:          Username for authentication purposes.
 115     password:          Password for authentication purposes.
  116     videopassword:     Password for accessing a video.
 117     usenetrc:          Use netrc for authentication instead.
 118     verbose:           Print additional info to stdout.
 119     quiet:             Do not print messages to stdout.
 120     no_warnings:       Do not print out anything for warnings.
 121     forceurl:          Force printing final URL.
 122     forcetitle:        Force printing title.
 123     forceid:           Force printing ID.
 124     forcethumbnail:    Force printing thumbnail URL.
 125     forcedescription:  Force printing description.
 126     forcefilename:     Force printing final filename.
 127     forceduration:     Force printing duration.
 128     forcejson:         Force printing info_dict as JSON.
 129     dump_single_json:  Force printing the info_dict of the whole playlist
 130                        (or video) as a single JSON line.
 131     simulate:          Do not download the video files.
 132     format:            Video format code. See options.py for more information.
 133     format_limit:      Highest quality format to try.
 134     outtmpl:           Template for output names.
 135     restrictfilenames: Do not allow "&" and spaces in file names
 136     ignoreerrors:      Do not stop on download errors.
 137     nooverwrites:      Prevent overwriting files.
 138     playliststart:     Playlist item to start at.
 139     playlistend:       Playlist item to end at.
 140     playlist_items:    Specific indices of playlist to download.
 141     playlistreverse:   Download playlist items in reverse order.
 142     matchtitle:        Download only matching titles.
 143     rejecttitle:       Reject downloads for matching titles.
 144     logger:            Log messages to a logging.Logger instance.
 145     logtostderr:       Log messages to stderr instead of stdout.
 146     writedescription:  Write the video description to a .description file
 147     writeinfojson:     Write the video description to a .info.json file
 148     writeannotations:  Write the video annotations to a .annotations.xml file
 149     writethumbnail:    Write the thumbnail image to a file
 150     write_all_thumbnails:  Write all thumbnail formats to files
 151     writesubtitles:    Write the video subtitles to a file
 152     writeautomaticsub: Write the automatic subtitles to a file
 153     allsubtitles:      Downloads all the subtitles of the video
 154                        (requires writesubtitles or writeautomaticsub)
 155     listsubtitles:     Lists all available subtitles for the video
 156     subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
 157     subtitleslangs:    List of languages of the subtitles to download
 158     keepvideo:         Keep the video file after post-processing
 159     daterange:         A DateRange object, download only if the upload_date is in the range.
 160     skip_download:     Skip the actual download of the video file
 161     cachedir:          Location of the cache files in the filesystem.
 162                        False to disable filesystem cache.
 163     noplaylist:        Download single video instead of a playlist if in doubt.
 164     age_limit:         An integer representing the user's age in years.
 165                        Unsuitable videos for the given age are skipped.
 166     min_views:         An integer representing the minimum view count the video
 167                        must have in order to not be skipped.
 168                        Videos without view count information are always
 169                        downloaded. None for no limit.
 170     max_views:         An integer representing the maximum view count.
 171                        Videos that are more popular than that are not
 172                        downloaded.
 173                        Videos without view count information are always
 174                        downloaded. None for no limit.
 175     download_archive:  File name of a file where all downloads are recorded.
 176                        Videos already present in the file are not downloaded
 177                        again.
 178     cookiefile:        File name where cookies should be read from and dumped to.
 179     nocheckcertificate:Do not verify SSL certificates
 180     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
 181                        At the moment, this is only supported by YouTube.
 182     proxy:             URL of the proxy server to use
 183     socket_timeout:    Time to wait for unresponsive hosts, in seconds
 184     bidi_workaround:   Work around buggy terminals without bidirectional text
  185                        support, using fribidi
 186     debug_printtraffic:Print out sent and received HTTP traffic
 187     include_ads:       Download ads as well
 188     default_search:    Prepend this string if an input url is not valid.
 189                        'auto' for elaborate guessing
 190     encoding:          Use this encoding instead of the system-specified.
 191     extract_flat:      Do not resolve URLs, return the immediate result.
 192                        Pass in 'in_playlist' to only show this behavior for
 193                        playlist items.
 194     postprocessors:    A list of dictionaries, each with an entry
 195                        * key:  The name of the postprocessor. See
 196                                youtube_dl/postprocessor/__init__.py for a list.
 197                        as well as any further keyword arguments for the
 198                        postprocessor.
 199     progress_hooks:    A list of functions that get called on download
 200                        progress, with a dictionary with the entries
 201                        * status: One of "downloading" and "finished".
 202                                  Check this first and ignore unknown values.
 203
 204                        If status is one of "downloading" or "finished", the
 205                        following properties may also be present:
 206                        * filename: The final filename (always present)
 207                        * downloaded_bytes: Bytes on disk
 208                        * total_bytes: Size of the whole file, None if unknown
 209                        * tmpfilename: The filename we're currently writing to
 210                        * eta: The estimated time in seconds, None if unknown
 211                        * speed: The download speed in bytes/second, None if
 212                                 unknown
 213
 214                        Progress hooks are guaranteed to be called at least once
 215                        (with status "finished") if the download is successful.
 216     merge_output_format: Extension to use when merging formats.
 217     fixup:             Automatically correct known faults of the file.
 218                        One of:
 219                        - "never": do nothing
 220                        - "warn": only emit a warning
 221                        - "detect_or_warn": check whether we can do anything
 222                                            about it, warn otherwise (default)
 223     source_address:    (Experimental) Client-side IP address to bind to.
 224     call_home:         Boolean, true iff we are allowed to contact the
 225                        youtube-dl servers for debugging.
 226     sleep_interval:    Number of seconds to sleep before each download.
 227     external_downloader:  Executable of the external downloader to call.
 228     listformats:       Print an overview of available video formats and exit.
 229     list_thumbnails:   Print a table of all thumbnails and exit.
 230
 231
 232     The following parameters are not used by YoutubeDL itself, they are used by
 233     the FileDownloader:
 234     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
 235     noresizebuffer, retries, continuedl, noprogress, consoletitle,
 236     xattr_set_filesize.
 237
 238     The following options are used by the post processors:
 239     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
 240                        otherwise prefer avconv.
 241     exec_cmd:          Arbitrary command to run after downloading
 242     """
 243
 244     params = None
 245     _ies = []
 246     _pps = []
 247     _download_retcode = None
 248     _num_downloads = None
 249     _screen_file = None
 250
 251     def __init__(self, params=None, auto_init=True):
 252         """Create a FileDownloader object with the given options."""
 253         if params is None:
 254             params = {}
 255         self._ies = []
 256         self._ies_instances = {}
 257         self._pps = []
 258         self._progress_hooks = []
 259         self._download_retcode = 0
 260         self._num_downloads = 0
 261         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 262         self._err_file = sys.stderr
 263         self.params = params
 264         self.cache = Cache(self)
 265
 266         if params.get('bidi_workaround', False):
 267             try:
 268                 import pty
 269                 master, slave = pty.openpty()
 270                 width = get_term_width()
 271                 if width is None:
 272                     width_args = []
 273                 else:
 274                     width_args = ['-w', str(width)]
 275                 sp_kwargs = dict(
 276                     stdin=subprocess.PIPE,
 277                     stdout=slave,
 278                     stderr=self._err_file)
 279                 try:
 280                     self._output_process = subprocess.Popen(
 281                         ['bidiv'] + width_args, **sp_kwargs
 282                     )
 283                 except OSError:
 284                     self._output_process = subprocess.Popen(
 285                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
 286                 self._output_channel = os.fdopen(master, 'rb')
 287             except OSError as ose:
 288                 if ose.errno == 2:
 289                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
 290                 else:
 291                     raise
 292
 293         if (sys.version_info >= (3,) and sys.platform != 'win32' and
 294                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
 295                 and not params.get('restrictfilenames', False)):
 296             # On Python 3, the Unicode filesystem API will throw errors (#1474)
 297             self.report_warning(
 298                 'Assuming --restrict-filenames since file system encoding '
 299                 'cannot encode all characters. '
 300                 'Set the LC_ALL environment variable to fix this.')
 301             self.params['restrictfilenames'] = True
 302
 303         if '%(stitle)s' in self.params.get('outtmpl', ''):
 304             self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
 305
 306         self._setup_opener()
 307
 308         if auto_init:
 309             self.print_debug_header()
 310             self.add_default_info_extractors()
 311
 312         for pp_def_raw in self.params.get('postprocessors', []):
 313             pp_class = get_postprocessor(pp_def_raw['key'])
 314             pp_def = dict(pp_def_raw)
 315             del pp_def['key']
 316             pp = pp_class(self, **compat_kwargs(pp_def))
 317             self.add_post_processor(pp)
 318
 319         for ph in self.params.get('progress_hooks', []):
 320             self.add_progress_hook(ph)
 321
 322     def warn_if_short_id(self, argv):
 323         # short YouTube ID starting with dash?
 324         idxs = [
 325             i for i, a in enumerate(argv)
 326             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
 327         if idxs:
 328             correct_argv = (
 329                 ['youtube-dl'] +
 330                 [a for i, a in enumerate(argv) if i not in idxs] +
 331                 ['--'] + [argv[i] for i in idxs]
 332             )
 333             self.report_warning(
 334                 'Long argument string detected. '
 335                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
 336                 args_to_str(correct_argv))
 337
 338     def add_info_extractor(self, ie):
 339         """Add an InfoExtractor object to the end of the list."""
 340         self._ies.append(ie)
 341         self._ies_instances[ie.ie_key()] = ie
 342         ie.set_downloader(self)
 343
 344     def get_info_extractor(self, ie_key):
 345         """
 346         Get an instance of an IE with name ie_key, it will try to get one from
 347         the _ies list, if there's no instance it will create a new one and add
 348         it to the extractor list.
 349         """
 350         ie = self._ies_instances.get(ie_key)
 351         if ie is None:
 352             ie = get_info_extractor(ie_key)()
 353             self.add_info_extractor(ie)
 354         return ie
 355
 356     def add_default_info_extractors(self):
 357         """
 358         Add the InfoExtractors returned by gen_extractors to the end of the list
 359         """
 360         for ie in gen_extractors():
 361             self.add_info_extractor(ie)
 362
 363     def add_post_processor(self, pp):
 364         """Add a PostProcessor object to the end of the chain."""
 365         self._pps.append(pp)
 366         pp.set_downloader(self)
 367
 368     def add_progress_hook(self, ph):
 369         """Add the progress hook (currently only for the file downloader)"""
 370         self._progress_hooks.append(ph)
 371
 372     def _bidi_workaround(self, message):
 373         if not hasattr(self, '_output_channel'):
 374             return message
 375
 376         assert hasattr(self, '_output_process')
 377         assert isinstance(message, compat_str)
 378         line_count = message.count('\n') + 1
 379         self._output_process.stdin.write((message + '\n').encode('utf-8'))
 380         self._output_process.stdin.flush()
 381         res = ''.join(self._output_channel.readline().decode('utf-8')
 382                       for _ in range(line_count))
 383         return res[:-len('\n')]
 384
 385     def to_screen(self, message, skip_eol=False):
 386         """Print message to stdout if not in quiet mode."""
 387         return self.to_stdout(message, skip_eol, check_quiet=True)
 388
 389     def _write_string(self, s, out=None):
 390         write_string(s, out=out, encoding=self.params.get('encoding'))
 391
 392     def to_stdout(self, message, skip_eol=False, check_quiet=False):
 393         """Print message to stdout if not in quiet mode."""
 394         if self.params.get('logger'):
 395             self.params['logger'].debug(message)
 396         elif not check_quiet or not self.params.get('quiet', False):
 397             message = self._bidi_workaround(message)
 398             terminator = ['\n', ''][skip_eol]
 399             output = message + terminator
 400
 401             self._write_string(output, self._screen_file)
 402
 403     def to_stderr(self, message):
 404         """Print message to stderr."""
 405         assert isinstance(message, compat_str)
 406         if self.params.get('logger'):
 407             self.params['logger'].error(message)
 408         else:
 409             message = self._bidi_workaround(message)
 410             output = message + '\n'
 411             self._write_string(output, self._err_file)
 412
 413     def to_console_title(self, message):
 414         if not self.params.get('consoletitle', False):
 415             return
 416         if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 417             # c_wchar_p() might not be necessary if `message` is
 418             # already of type unicode()
 419             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 420         elif 'TERM' in os.environ:
 421             self._write_string('\033]0;%s\007' % message, self._screen_file)
 422
 423     def save_console_title(self):
 424         if not self.params.get('consoletitle', False):
 425             return
 426         if 'TERM' in os.environ:
 427             # Save the title on stack
 428             self._write_string('\033[22;0t', self._screen_file)
 429
 430     def restore_console_title(self):
 431         if not self.params.get('consoletitle', False):
 432             return
 433         if 'TERM' in os.environ:
 434             # Restore the title from stack
 435             self._write_string('\033[23;0t', self._screen_file)
 436
 437     def __enter__(self):
 438         self.save_console_title()
 439         return self
 440
 441     def __exit__(self, *args):
 442         self.restore_console_title()
 443
 444         if self.params.get('cookiefile') is not None:
 445             self.cookiejar.save()
 446
 447     def trouble(self, message=None, tb=None):
 448         """Determine action to take when a download problem appears.
 449
 450         Depending on if the downloader has been configured to ignore
 451         download errors or not, this method may throw an exception or
 452         not when errors are found, after printing the message.
 453
 454         tb, if given, is additional traceback information.
 455         """
 456         if message is not None:
 457             self.to_stderr(message)
 458         if self.params.get('verbose'):
 459             if tb is None:
 460                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
 461                     tb = ''
 462                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 463                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
 464                     tb += compat_str(traceback.format_exc())
 465                 else:
 466                     tb_data = traceback.format_list(traceback.extract_stack())
 467                     tb = ''.join(tb_data)
 468             self.to_stderr(tb)
 469         if not self.params.get('ignoreerrors', False):
 470             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 471                 exc_info = sys.exc_info()[1].exc_info
 472             else:
 473                 exc_info = sys.exc_info()
 474             raise DownloadError(message, exc_info)
 475         self._download_retcode = 1
 476
 477     def report_warning(self, message):
 478         '''
 479         Print the message to stderr, it will be prefixed with 'WARNING:'
 480         If stderr is a tty file the 'WARNING:' will be colored
 481         '''
 482         if self.params.get('logger') is not None:
 483             self.params['logger'].warning(message)
 484         else:
 485             if self.params.get('no_warnings'):
 486                 return
 487             if self._err_file.isatty() and os.name != 'nt':
 488                 _msg_header = '\033[0;33mWARNING:\033[0m'
 489             else:
 490                 _msg_header = 'WARNING:'
 491             warning_message = '%s %s' % (_msg_header, message)
 492             self.to_stderr(warning_message)
 493
 494     def report_error(self, message, tb=None):
 495         '''
 496         Do the same as trouble, but prefixes the message with 'ERROR:', colored
 497         in red if stderr is a tty file.
 498         '''
 499         if self._err_file.isatty() and os.name != 'nt':
 500             _msg_header = '\033[0;31mERROR:\033[0m'
 501         else:
 502             _msg_header = 'ERROR:'
 503         error_message = '%s %s' % (_msg_header, message)
 504         self.trouble(error_message, tb)
 505
 506     def report_file_already_downloaded(self, file_name):
 507         """Report file has already been fully downloaded."""
 508         try:
 509             self.to_screen('[download] %s has already been downloaded' % file_name)
 510         except UnicodeEncodeError:
 511             self.to_screen('[download] The file has already been downloaded')
 512
 513     def prepare_filename(self, info_dict):
 514         """Generate the output filename."""
 515         try:
 516             template_dict = dict(info_dict)
 517
 518             template_dict['epoch'] = int(time.time())
 519             autonumber_size = self.params.get('autonumber_size')
 520             if autonumber_size is None:
 521                 autonumber_size = 5
 522             autonumber_templ = '%0' + str(autonumber_size) + 'd'
 523             template_dict['autonumber'] = autonumber_templ % self._num_downloads
 524             if template_dict.get('playlist_index') is not None:
 525                 template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
 526             if template_dict.get('resolution') is None:
 527                 if template_dict.get('width') and template_dict.get('height'):
 528                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
 529                 elif template_dict.get('height'):
 530                     template_dict['resolution'] = '%sp' % template_dict['height']
 531                 elif template_dict.get('width'):
 532                     template_dict['resolution'] = '?x%d' % template_dict['width']
 533
 534             sanitize = lambda k, v: sanitize_filename(
 535                 compat_str(v),
 536                 restricted=self.params.get('restrictfilenames'),
 537                 is_id=(k == 'id'))
 538             template_dict = dict((k, sanitize(k, v))
 539                                  for k, v in template_dict.items()
 540                                  if v is not None)
 541             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
 542
 543             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
 544             tmpl = compat_expanduser(outtmpl)
 545             filename = tmpl % template_dict
 546             # Temporary fix for #4787
 547             # 'Treat' all problem characters by passing filename through preferredencoding
 548             # to workaround encoding issues with subprocess on python2 @ Windows
 549             if sys.version_info < (3, 0) and sys.platform == 'win32':
 550                 filename = encodeFilename(filename, True).decode(preferredencoding())
 551             return filename
 552         except ValueError as err:
 553             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
 554             return None
 555
 556     def _match_entry(self, info_dict):
 557         """ Returns None iff the file should be downloaded """
 558
 559         video_title = info_dict.get('title', info_dict.get('id', 'video'))
 560         if 'title' in info_dict:
 561             # This can happen when we're just evaluating the playlist
 562             title = info_dict['title']
 563             matchtitle = self.params.get('matchtitle', False)
 564             if matchtitle:
 565                 if not re.search(matchtitle, title, re.IGNORECASE):
 566                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
 567             rejecttitle = self.params.get('rejecttitle', False)
 568             if rejecttitle:
 569                 if re.search(rejecttitle, title, re.IGNORECASE):
 570                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 571         date = info_dict.get('upload_date', None)
 572         if date is not None:
 573             dateRange = self.params.get('daterange', DateRange())
 574             if date not in dateRange:
 575                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
 576         view_count = info_dict.get('view_count', None)
 577         if view_count is not None:
 578             min_views = self.params.get('min_views')
 579             if min_views is not None and view_count < min_views:
 580                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
 581             max_views = self.params.get('max_views')
 582             if max_views is not None and view_count > max_views:
 583                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
 584         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
 585             return 'Skipping "%s" because it is age restricted' % title
 586         if self.in_download_archive(info_dict):
 587             return '%s has already been recorded in archive' % video_title
 588         return None
 589
 590     @staticmethod
 591     def add_extra_info(info_dict, extra_info):
 592         '''Set the keys from extra_info in info dict if they are missing'''
 593         for key, value in extra_info.items():
 594             info_dict.setdefault(key, value)
 595
 596     def extract_info(self, url, download=True, ie_key=None, extra_info={},
 597                      process=True):
 598         '''
 599         Returns a list with a dictionary for each video we find.
 600         If 'download', also downloads the videos.
 601         extra_info is a dict containing the extra values to add to each result
 602          '''
 603
 604         if ie_key:
 605             ies = [self.get_info_extractor(ie_key)]
 606         else:
 607             ies = self._ies
 608
 609         for ie in ies:
 610             if not ie.suitable(url):
 611                 continue
 612
 613             if not ie.working():
 614                 self.report_warning('The program functionality for this site has been marked as broken, '
 615                                     'and will probably not work.')
 616
 617             try:
 618                 ie_result = ie.extract(url)
 619                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
 620                     break
 621                 if isinstance(ie_result, list):
 622                     # Backwards compatibility: old IE result format
 623                     ie_result = {
 624                         '_type': 'compat_list',
 625                         'entries': ie_result,
 626                     }
 627                 self.add_default_extra_info(ie_result, ie, url)
 628                 if process:
 629                     return self.process_ie_result(ie_result, download, extra_info)
 630                 else:
 631                     return ie_result
 632             except ExtractorError as de:  # An error we somewhat expected
 633                 self.report_error(compat_str(de), de.format_traceback())
 634                 break
 635             except MaxDownloadsReached:
 636                 raise
 637             except Exception as e:
 638                 if self.params.get('ignoreerrors', False):
 639                     self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
 640                     break
 641                 else:
 642                     raise
 643         else:
 644             self.report_error('no suitable InfoExtractor for URL %s' % url)
 645
 646     def add_default_extra_info(self, ie_result, ie, url):
 647         self.add_extra_info(ie_result, {
 648             'extractor': ie.IE_NAME,
 649             'webpage_url': url,
 650             'webpage_url_basename': url_basename(url),
 651             'extractor_key': ie.ie_key(),
 652         })
 653
 654     def process_ie_result(self, ie_result, download=True, extra_info={}):
 655         """
 656         Take the result of the ie(may be modified) and resolve all unresolved
 657         references (URLs, playlist items).
 658
 659         It will also download the videos if 'download'.
 660         Returns the resolved ie_result.
 661         """
 662
 663         result_type = ie_result.get('_type', 'video')
 664
 665         if result_type in ('url', 'url_transparent'):
 666             extract_flat = self.params.get('extract_flat', False)
 667             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
 668                     extract_flat is True):
 669                 if self.params.get('forcejson', False):
 670                     self.to_stdout(json.dumps(ie_result))
 671                 return ie_result
 672
 673         if result_type == 'video':
 674             self.add_extra_info(ie_result, extra_info)
 675             return self.process_video_result(ie_result, download=download)
 676         elif result_type == 'url':
 677             # We have to add extra_info to the results because it may be
 678             # contained in a playlist
 679             return self.extract_info(ie_result['url'],
 680                                      download,
 681                                      ie_key=ie_result.get('ie_key'),
 682                                      extra_info=extra_info)
 683         elif result_type == 'url_transparent':
 684             # Use the information from the embedding page
 685             info = self.extract_info(
 686                 ie_result['url'], ie_key=ie_result.get('ie_key'),
 687                 extra_info=extra_info, download=False, process=False)
 688
 689             force_properties = dict(
 690                 (k, v) for k, v in ie_result.items() if v is not None)
 691             for f in ('_type', 'url'):
 692                 if f in force_properties:
 693                     del force_properties[f]
 694             new_result = info.copy()
 695             new_result.update(force_properties)
 696
 697             assert new_result.get('_type') != 'url_transparent'
 698
 699             return self.process_ie_result(
 700                 new_result, download=download, extra_info=extra_info)
 701         elif result_type == 'playlist' or result_type == 'multi_video':
 702             # We process each entry in the playlist
 703             playlist = ie_result.get('title', None) or ie_result.get('id', None)
 704             self.to_screen('[download] Downloading playlist: %s' % playlist)
 705
 706             playlist_results = []
 707
 708             playliststart = self.params.get('playliststart', 1) - 1
 709             playlistend = self.params.get('playlistend', None)
 710             # For backwards compatibility, interpret -1 as whole list
 711             if playlistend == -1:
 712                 playlistend = None
 713
 714             playlistitems_str = self.params.get('playlist_items', None)
 715             playlistitems = None
 716             if playlistitems_str is not None:
 717                 def iter_playlistitems(format):
 718                     for string_segment in format.split(','):
 719                         if '-' in string_segment:
 720                             start, end = string_segment.split('-')
 721                             for item in range(int(start), int(end) + 1):
 722                                 yield int(item)
 723                         else:
 724                             yield int(string_segment)
 725                 playlistitems = iter_playlistitems(playlistitems_str)
 726
 727             ie_entries = ie_result['entries']
 728             if isinstance(ie_entries, list):
 729                 n_all_entries = len(ie_entries)
 730                 if playlistitems:
 731                     entries = [ie_entries[i - 1] for i in playlistitems]
 732                 else:
 733                     entries = ie_entries[playliststart:playlistend]
 734                 n_entries = len(entries)
 735                 self.to_screen(
 736                     "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
 737                     (ie_result['extractor'], playlist, n_all_entries, n_entries))
 738             elif isinstance(ie_entries, PagedList):
 739                 if playlistitems:
 740                     entries = []
 741                     for item in playlistitems:
 742                         entries.extend(ie_entries.getslice(
 743                             item - 1, item
 744                         ))
 745                 else:
 746                     entries = ie_entries.getslice(
 747                         playliststart, playlistend)
 748                 n_entries = len(entries)
 749                 self.to_screen(
 750                     "[%s] playlist %s: Downloading %d videos" %
 751                     (ie_result['extractor'], playlist, n_entries))
 752             else:  # iterable
 753                 if playlistitems:
 754                     entry_list = list(ie_entries)
 755                     entries = [entry_list[i - 1] for i in playlistitems]
 756                 else:
 757                     entries = list(itertools.islice(
 758                         ie_entries, playliststart, playlistend))
 759                 n_entries = len(entries)
 760                 self.to_screen(
 761                     "[%s] playlist %s: Downloading %d videos" %
 762                     (ie_result['extractor'], playlist, n_entries))
 763
 764             if self.params.get('playlistreverse', False):
 765                 entries = entries[::-1]
 766
 767             for i, entry in enumerate(entries, 1):
 768                 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
 769                 extra = {
 770                     'n_entries': n_entries,
 771                     'playlist': playlist,
 772                     'playlist_id': ie_result.get('id'),
 773                     'playlist_title': ie_result.get('title'),
 774                     'playlist_index': i + playliststart,
 775                     'extractor': ie_result['extractor'],
 776                     'webpage_url': ie_result['webpage_url'],
 777                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
 778                     'extractor_key': ie_result['extractor_key'],
 779                 }
 780
 781                 reason = self._match_entry(entry)
 782                 if reason is not None:
 783                     self.to_screen('[download] ' + reason)
 784                     continue
 785
 786                 entry_result = self.process_ie_result(entry,
 787                                                       download=download,
 788                                                       extra_info=extra)
 789                 playlist_results.append(entry_result)
 790             ie_result['entries'] = playlist_results
 791             return ie_result
 792         elif result_type == 'compat_list':
 793             self.report_warning(
 794                 'Extractor %s returned a compat_list result. '
 795                 'It needs to be updated.' % ie_result.get('extractor'))
 796
 797             def _fixup(r):
 798                 self.add_extra_info(
 799                     r,
 800                     {
 801                         'extractor': ie_result['extractor'],
 802                         'webpage_url': ie_result['webpage_url'],
 803                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
 804                         'extractor_key': ie_result['extractor_key'],
 805                     }
 806                 )
 807                 return r
 808             ie_result['entries'] = [
 809                 self.process_ie_result(_fixup(r), download, extra_info)
 810                 for r in ie_result['entries']
 811             ]
 812             return ie_result
 813         else:
 814             raise Exception('Invalid result type: %s' % result_type)
 815
 816     def _apply_format_filter(self, format_spec, available_formats):
 817         " Returns a tuple of the remaining format_spec and filtered formats "
 818
 819         OPERATORS = {
 820             '<': operator.lt,
 821             '<=': operator.le,
 822             '>': operator.gt,
 823             '>=': operator.ge,
 824             '=': operator.eq,
 825             '!=': operator.ne,
 826         }
 827         operator_rex = re.compile(r'''(?x)\s*\[
 828             (?P<key>width|height|tbr|abr|vbr|filesize|fps)
 829             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
 830             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
 831             \]$
 832             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
 833         m = operator_rex.search(format_spec)
 834         if not m:
 835             raise ValueError('Invalid format specification %r' % format_spec)
 836
 837         try:
 838             comparison_value = int(m.group('value'))
 839         except ValueError:
 840             comparison_value = parse_filesize(m.group('value'))
 841             if comparison_value is None:
 842                 comparison_value = parse_filesize(m.group('value') + 'B')
 843             if comparison_value is None:
 844                 raise ValueError(
 845                     'Invalid value %r in format specification %r' % (
 846                         m.group('value'), format_spec))
 847         op = OPERATORS[m.group('op')]
 848
 849         def _filter(f):
 850             actual_value = f.get(m.group('key'))
 851             if actual_value is None:
 852                 return m.group('none_inclusive')
 853             return op(actual_value, comparison_value)
 854         new_formats = [f for f in available_formats if _filter(f)]
 855
 856         new_format_spec = format_spec[:-len(m.group(0))]
 857         if not new_format_spec:
 858             new_format_spec = 'best'
 859
 860         return (new_format_spec, new_formats)
 861
 862     def select_format(self, format_spec, available_formats):
 863         while format_spec.endswith(']'):
 864             format_spec, available_formats = self._apply_format_filter(
 865                 format_spec, available_formats)
 866         if not available_formats:
 867             return None
 868
 869         if format_spec == 'best' or format_spec is None:
 870             return available_formats[-1]
 871         elif format_spec == 'worst':
 872             return available_formats[0]
 873         elif format_spec == 'bestaudio':
 874             audio_formats = [
 875                 f for f in available_formats
 876                 if f.get('vcodec') == 'none']
 877             if audio_formats:
 878                 return audio_formats[-1]
 879         elif format_spec == 'worstaudio':
 880             audio_formats = [
 881                 f for f in available_formats
 882                 if f.get('vcodec') == 'none']
 883             if audio_formats:
 884                 return audio_formats[0]
 885         elif format_spec == 'bestvideo':
 886             video_formats = [
 887                 f for f in available_formats
 888                 if f.get('acodec') == 'none']
 889             if video_formats:
 890                 return video_formats[-1]
 891         elif format_spec == 'worstvideo':
 892             video_formats = [
 893                 f for f in available_formats
 894                 if f.get('acodec') == 'none']
 895             if video_formats:
 896                 return video_formats[0]
 897         else:
 898             extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
 899             if format_spec in extensions:
 900                 filter_f = lambda f: f['ext'] == format_spec
 901             else:
 902                 filter_f = lambda f: f['format_id'] == format_spec
 903             matches = list(filter(filter_f, available_formats))
 904             if matches:
 905                 return matches[-1]
 906         return None
 907
 908     def _calc_headers(self, info_dict):
 909         res = std_headers.copy()
 910
 911         add_headers = info_dict.get('http_headers')
 912         if add_headers:
 913             res.update(add_headers)
 914
 915         cookies = self._calc_cookies(info_dict)
 916         if cookies:
 917             res['Cookie'] = cookies
 918
 919         return res
 920
 921     def _calc_cookies(self, info_dict):
 922         class _PseudoRequest(object):
 923             def __init__(self, url):
 924                 self.url = url
 925                 self.headers = {}
 926                 self.unverifiable = False
 927
 928             def add_unredirected_header(self, k, v):
 929                 self.headers[k] = v
 930
 931             def get_full_url(self):
 932                 return self.url
 933
 934             def is_unverifiable(self):
 935                 return self.unverifiable
 936
 937             def has_header(self, h):
 938                 return h in self.headers
 939
 940         pr = _PseudoRequest(info_dict['url'])
 941         self.cookiejar.add_cookie_header(pr)
 942         return pr.headers.get('Cookie')
 943
 944     def process_video_result(self, info_dict, download=True):
 945         assert info_dict.get('_type', 'video') == 'video'
 946
 947         if 'id' not in info_dict:
 948             raise ExtractorError('Missing "id" field in extractor result')
 949         if 'title' not in info_dict:
 950             raise ExtractorError('Missing "title" field in extractor result')
 951
 952         if 'playlist' not in info_dict:
 953             # It isn't part of a playlist
 954             info_dict['playlist'] = None
 955             info_dict['playlist_index'] = None
 956
 957         thumbnails = info_dict.get('thumbnails')
 958         if thumbnails is None:
 959             thumbnail = info_dict.get('thumbnail')
 960             if thumbnail:
 961                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
 962         if thumbnails:
 963             thumbnails.sort(key=lambda t: (
 964                 t.get('preference'), t.get('width'), t.get('height'),
 965                 t.get('id'), t.get('url')))
 966             for t in thumbnails:
 967                 if 'width' in t and 'height' in t:
 968                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
 969
 970         if thumbnails and 'thumbnail' not in info_dict:
 971             info_dict['thumbnail'] = thumbnails[-1]['url']
 972
 973         if 'display_id' not in info_dict and 'id' in info_dict:
 974             info_dict['display_id'] = info_dict['id']
 975
 976         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
 977             # Working around negative timestamps in Windows
 978             # (see http://bugs.python.org/issue1646728)
 979             if info_dict['timestamp'] < 0 and os.name == 'nt':
 980                 info_dict['timestamp'] = 0
 981             upload_date = datetime.datetime.utcfromtimestamp(
 982                 info_dict['timestamp'])
 983             info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
 984
 985         # This extractors handle format selection themselves
 986         if info_dict['extractor'] in ['Youku']:
 987             if download:
 988                 self.process_info(info_dict)
 989             return info_dict
 990
 991         # We now pick which formats have to be downloaded
 992         if info_dict.get('formats') is None:
 993             # There's only one format available
 994             formats = [info_dict]
 995         else:
 996             formats = info_dict['formats']
 997
 998         if not formats:
 999             raise ExtractorError('No video formats found!')
1000
1001         # We check that all the formats have the format and format_id fields
1002         for i, format in enumerate(formats):
1003             if 'url' not in format:
1004                 raise ExtractorError('Missing "url" key in result (index %d)' % i)
1005
1006             if format.get('format_id') is None:
1007                 format['format_id'] = compat_str(i)
1008             if format.get('format') is None:
1009                 format['format'] = '{id} - {res}{note}'.format(
1010                     id=format['format_id'],
1011                     res=self.format_resolution(format),
1012                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1013                 )
1014             # Automatically determine file extension if missing
1015             if 'ext' not in format:
1016                 format['ext'] = determine_ext(format['url']).lower()
1017             # Add HTTP headers, so that external programs can use them from the
1018             # json output
1019             full_format_info = info_dict.copy()
1020             full_format_info.update(format)
1021             format['http_headers'] = self._calc_headers(full_format_info)
1022
1023         format_limit = self.params.get('format_limit', None)
1024         if format_limit:
1025             formats = list(takewhile_inclusive(
1026                 lambda f: f['format_id'] != format_limit, formats
1027             ))
1028
1029         # TODO Central sorting goes here
1030
1031         if formats[0] is not info_dict:
1032             # only set the 'formats' fields if the original info_dict list them
1033             # otherwise we end up with a circular reference, the first (and unique)
1034             # element in the 'formats' field in info_dict is info_dict itself,
1035             # wich can't be exported to json
1036             info_dict['formats'] = formats
1037         if self.params.get('listformats'):
1038             self.list_formats(info_dict)
1039             return
1040         if self.params.get('list_thumbnails'):
1041             self.list_thumbnails(info_dict)
1042             return
1043
1044         req_format = self.params.get('format')
1045         if req_format is None:
1046             req_format = 'best'
1047         formats_to_download = []
1048         # The -1 is for supporting YoutubeIE
1049         if req_format in ('-1', 'all'):
1050             formats_to_download = formats
1051         else:
1052             for rfstr in req_format.split(','):
1053                 # We can accept formats requested in the format: 34/5/best, we pick
1054                 # the first that is available, starting from left
1055                 req_formats = rfstr.split('/')
1056                 for rf in req_formats:
1057                     if re.match(r'.+?\+.+?', rf) is not None:
1058                         # Two formats have been requested like '137+139'
1059                         format_1, format_2 = rf.split('+')
1060                         formats_info = (self.select_format(format_1, formats),
1061                                         self.select_format(format_2, formats))
1062                         if all(formats_info):
1063                             # The first format must contain the video and the
1064                             # second the audio
1065                             if formats_info[0].get('vcodec') == 'none':
1066                                 self.report_error('The first format must '
1067                                                   'contain the video, try using '
1068                                                   '"-f %s+%s"' % (format_2, format_1))
1069                                 return
1070                             output_ext = (
1071                                 formats_info[0]['ext']
1072                                 if self.params.get('merge_output_format') is None
1073                                 else self.params['merge_output_format'])
1074                             selected_format = {
1075                                 'requested_formats': formats_info,
1076                                 'format': rf,
1077                                 'format_id': rf,
1078                                 'ext': formats_info[0]['ext'],
1079                                 'width': formats_info[0].get('width'),
1080                                 'height': formats_info[0].get('height'),
1081                                 'resolution': formats_info[0].get('resolution'),
1082                                 'fps': formats_info[0].get('fps'),
1083                                 'vcodec': formats_info[0].get('vcodec'),
1084                                 'vbr': formats_info[0].get('vbr'),
1085                                 'stretched_ratio': formats_info[0].get('stretched_ratio'),
1086                                 'acodec': formats_info[1].get('acodec'),
1087                                 'abr': formats_info[1].get('abr'),
1088                                 'ext': output_ext,
1089                             }
1090                         else:
1091                             selected_format = None
1092                     else:
1093                         selected_format = self.select_format(rf, formats)
1094                     if selected_format is not None:
1095                         formats_to_download.append(selected_format)
1096                         break
1097         if not formats_to_download:
1098             raise ExtractorError('requested format not available',
1099                                  expected=True)
1100
1101         if download:
1102             if len(formats_to_download) > 1:
1103                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1104             for format in formats_to_download:
1105                 new_info = dict(info_dict)
1106                 new_info.update(format)
1107                 self.process_info(new_info)
1108         # We update the info dict with the best quality format (backwards compatibility)
1109         info_dict.update(formats_to_download[-1])
1110         return info_dict
1111
1112     def process_info(self, info_dict):
1113         """Process a single resolved IE result."""
1114
1115         assert info_dict.get('_type', 'video') == 'video'
1116
1117         max_downloads = self.params.get('max_downloads')
1118         if max_downloads is not None:
1119             if self._num_downloads >= int(max_downloads):
1120                 raise MaxDownloadsReached()
1121
1122         info_dict['fulltitle'] = info_dict['title']
1123         if len(info_dict['title']) > 200:
1124             info_dict['title'] = info_dict['title'][:197] + '...'
1125
1126         # Keep for backwards compatibility
1127         info_dict['stitle'] = info_dict['title']
1128
1129         if 'format' not in info_dict:
1130             info_dict['format'] = info_dict['ext']
1131
1132         reason = self._match_entry(info_dict)
1133         if reason is not None:
1134             self.to_screen('[download] ' + reason)
1135             return
1136
1137         self._num_downloads += 1
1138
1139         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1140
1141         # Forced printings
1142         if self.params.get('forcetitle', False):
1143             self.to_stdout(info_dict['fulltitle'])
1144         if self.params.get('forceid', False):
1145             self.to_stdout(info_dict['id'])
1146         if self.params.get('forceurl', False):
1147             if info_dict.get('requested_formats') is not None:
1148                 for f in info_dict['requested_formats']:
1149                     self.to_stdout(f['url'] + f.get('play_path', ''))
1150             else:
1151                 # For RTMP URLs, also include the playpath
1152                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1153         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1154             self.to_stdout(info_dict['thumbnail'])
1155         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1156             self.to_stdout(info_dict['description'])
1157         if self.params.get('forcefilename', False) and filename is not None:
1158             self.to_stdout(filename)
1159         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1160             self.to_stdout(formatSeconds(info_dict['duration']))
1161         if self.params.get('forceformat', False):
1162             self.to_stdout(info_dict['format'])
1163         if self.params.get('forcejson', False):
1164             self.to_stdout(json.dumps(info_dict))
1165
1166         # Do nothing else if in simulate mode
1167         if self.params.get('simulate', False):
1168             return
1169
1170         if filename is None:
1171             return
1172
1173         try:
1174             dn = os.path.dirname(encodeFilename(filename))
1175             if dn and not os.path.exists(dn):
1176                 os.makedirs(dn)
1177         except (OSError, IOError) as err:
1178             self.report_error('unable to create directory ' + compat_str(err))
1179             return
1180
1181         if self.params.get('writedescription', False):
1182             descfn = filename + '.description'
1183             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1184                 self.to_screen('[info] Video description is already present')
1185             elif info_dict.get('description') is None:
1186                 self.report_warning('There\'s no description to write.')
1187             else:
1188                 try:
1189                     self.to_screen('[info] Writing video description to: ' + descfn)
1190                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1191                         descfile.write(info_dict['description'])
1192                 except (OSError, IOError):
1193                     self.report_error('Cannot write description file ' + descfn)
1194                     return
1195
1196         if self.params.get('writeannotations', False):
1197             annofn = filename + '.annotations.xml'
1198             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1199                 self.to_screen('[info] Video annotations are already present')
1200             else:
1201                 try:
1202                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1203                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1204                         annofile.write(info_dict['annotations'])
1205                 except (KeyError, TypeError):
1206                     self.report_warning('There are no annotations to write.')
1207                 except (OSError, IOError):
1208                     self.report_error('Cannot write annotations file: ' + annofn)
1209                     return
1210
1211         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1212                                        self.params.get('writeautomaticsub')])
1213
1214         if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
1215             # subtitles download errors are already managed as troubles in relevant IE
1216             # that way it will silently go on when used with unsupporting IE
1217             subtitles = info_dict['subtitles']
1218             sub_format = self.params.get('subtitlesformat', 'srt')
1219             for sub_lang in subtitles.keys():
1220                 sub = subtitles[sub_lang]
1221                 if sub is None:
1222                     continue
1223                 try:
1224                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1225                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1226                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1227                     else:
1228                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1229                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
1230                             subfile.write(sub)
1231                 except (OSError, IOError):
1232                     self.report_error('Cannot write subtitles file ' + sub_filename)
1233                     return
1234
1235         if self.params.get('writeinfojson', False):
1236             infofn = os.path.splitext(filename)[0] + '.info.json'
1237             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1238                 self.to_screen('[info] Video description metadata is already present')
1239             else:
1240                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1241                 try:
1242                     write_json_file(info_dict, infofn)
1243                 except (OSError, IOError):
1244                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1245                     return
1246
1247         self._write_thumbnails(info_dict, filename)
1248
1249         if not self.params.get('skip_download', False):
1250             try:
1251                 def dl(name, info):
1252                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1253                     for ph in self._progress_hooks:
1254                         fd.add_progress_hook(ph)
1255                     if self.params.get('verbose'):
1256                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1257                     return fd.download(name, info)
1258
1259                 if info_dict.get('requested_formats') is not None:
1260                     downloaded = []
1261                     success = True
1262                     merger = FFmpegMergerPP(self, not self.params.get('keepvideo'))
1263                     if not merger._executable:
1264                         postprocessors = []
1265                         self.report_warning('You have requested multiple '
1266                                             'formats but ffmpeg or avconv are not installed.'
1267                                             ' The formats won\'t be merged')
1268                     else:
1269                         postprocessors = [merger]
1270                     for f in info_dict['requested_formats']:
1271                         new_info = dict(info_dict)
1272                         new_info.update(f)
1273                         fname = self.prepare_filename(new_info)
1274                         fname = prepend_extension(fname, 'f%s' % f['format_id'])
1275                         downloaded.append(fname)
1276                         partial_success = dl(fname, new_info)
1277                         success = success and partial_success
1278                     info_dict['__postprocessors'] = postprocessors
1279                     info_dict['__files_to_merge'] = downloaded
1280                 else:
1281                     # Just a single file
1282                     success = dl(filename, info_dict)
1283             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1284                 self.report_error('unable to download video data: %s' % str(err))
1285                 return
1286             except (OSError, IOError) as err:
1287                 raise UnavailableVideoError(err)
1288             except (ContentTooShortError, ) as err:
1289                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1290                 return
1291
1292             if success:
1293                 # Fixup content
1294                 fixup_policy = self.params.get('fixup')
1295                 if fixup_policy is None:
1296                     fixup_policy = 'detect_or_warn'
1297
1298                 stretched_ratio = info_dict.get('stretched_ratio')
1299                 if stretched_ratio is not None and stretched_ratio != 1:
1300                     if fixup_policy == 'warn':
1301                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1302                             info_dict['id'], stretched_ratio))
1303                     elif fixup_policy == 'detect_or_warn':
1304                         stretched_pp = FFmpegFixupStretchedPP(self)
1305                         if stretched_pp.available:
1306                             info_dict.setdefault('__postprocessors', [])
1307                             info_dict['__postprocessors'].append(stretched_pp)
1308                         else:
1309                             self.report_warning(
1310                                 '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
1311                                     info_dict['id'], stretched_ratio))
1312                     else:
1313                         assert fixup_policy in ('ignore', 'never')
1314
1315                 if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
1316                     if fixup_policy == 'warn':
1317                         self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
1318                             info_dict['id']))
1319                     elif fixup_policy == 'detect_or_warn':
1320                         fixup_pp = FFmpegFixupM4aPP(self)
1321                         if fixup_pp.available:
1322                             info_dict.setdefault('__postprocessors', [])
1323                             info_dict['__postprocessors'].append(fixup_pp)
1324                         else:
1325                             self.report_warning(
1326                                 '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
1327                                     info_dict['id']))
1328                     else:
1329                         assert fixup_policy in ('ignore', 'never')
1330
1331                 try:
1332                     self.post_process(filename, info_dict)
1333                 except (PostProcessingError) as err:
1334                     self.report_error('postprocessing: %s' % str(err))
1335                     return
1336                 self.record_download_archive(info_dict)
1337
1338     def download(self, url_list):
1339         """Download a given list of URLs."""
1340         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1341         if (len(url_list) > 1 and
1342                 '%' not in outtmpl
1343                 and self.params.get('max_downloads') != 1):
1344             raise SameFileError(outtmpl)
1345
1346         for url in url_list:
1347             try:
1348                 # It also downloads the videos
1349                 res = self.extract_info(url)
1350             except UnavailableVideoError:
1351                 self.report_error('unable to download video')
1352             except MaxDownloadsReached:
1353                 self.to_screen('[info] Maximum number of downloaded files reached.')
1354                 raise
1355             else:
1356                 if self.params.get('dump_single_json', False):
1357                     self.to_stdout(json.dumps(res))
1358
1359         return self._download_retcode
1360
1361     def download_with_info_file(self, info_filename):
1362         with io.open(info_filename, 'r', encoding='utf-8') as f:
1363             info = json.load(f)
1364         try:
1365             self.process_ie_result(info, download=True)
1366         except DownloadError:
1367             webpage_url = info.get('webpage_url')
1368             if webpage_url is not None:
1369                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1370                 return self.download([webpage_url])
1371             else:
1372                 raise
1373         return self._download_retcode
1374
1375     def post_process(self, filename, ie_info):
1376         """Run all the postprocessors on the given file."""
1377         info = dict(ie_info)
1378         info['filepath'] = filename
1379         pps_chain = []
1380         if ie_info.get('__postprocessors') is not None:
1381             pps_chain.extend(ie_info['__postprocessors'])
1382         pps_chain.extend(self._pps)
1383         for pp in pps_chain:
1384             keep_video = None
1385             old_filename = info['filepath']
1386             try:
1387                 keep_video_wish, info = pp.run(info)
1388                 if keep_video_wish is not None:
1389                     if keep_video_wish:
1390                         keep_video = keep_video_wish
1391                     elif keep_video is None:
1392                         # No clear decision yet, let IE decide
1393                         keep_video = keep_video_wish
1394             except PostProcessingError as e:
1395                 self.report_error(e.msg)
1396             if keep_video is False and not self.params.get('keepvideo', False):
1397                 try:
1398                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1399                     os.remove(encodeFilename(old_filename))
1400                 except (IOError, OSError):
1401                     self.report_warning('Unable to remove downloaded video file')
1402
1403     def _make_archive_id(self, info_dict):
1404         # Future-proof against any change in case
1405         # and backwards compatibility with prior versions
1406         extractor = info_dict.get('extractor_key')
1407         if extractor is None:
1408             if 'id' in info_dict:
1409                 extractor = info_dict.get('ie_key')  # key in a playlist
1410         if extractor is None:
1411             return None  # Incomplete video information
1412         return extractor.lower() + ' ' + info_dict['id']
1413
1414     def in_download_archive(self, info_dict):
1415         fn = self.params.get('download_archive')
1416         if fn is None:
1417             return False
1418
1419         vid_id = self._make_archive_id(info_dict)
1420         if vid_id is None:
1421             return False  # Incomplete video information
1422
1423         try:
1424             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1425                 for line in archive_file:
1426                     if line.strip() == vid_id:
1427                         return True
1428         except IOError as ioe:
1429             if ioe.errno != errno.ENOENT:
1430                 raise
1431         return False
1432
1433     def record_download_archive(self, info_dict):
1434         fn = self.params.get('download_archive')
1435         if fn is None:
1436             return
1437         vid_id = self._make_archive_id(info_dict)
1438         assert vid_id
1439         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1440             archive_file.write(vid_id + '\n')
1441
1442     @staticmethod
1443     def format_resolution(format, default='unknown'):
1444         if format.get('vcodec') == 'none':
1445             return 'audio only'
1446         if format.get('resolution') is not None:
1447             return format['resolution']
1448         if format.get('height') is not None:
1449             if format.get('width') is not None:
1450                 res = '%sx%s' % (format['width'], format['height'])
1451             else:
1452                 res = '%sp' % format['height']
1453         elif format.get('width') is not None:
1454             res = '?x%d' % format['width']
1455         else:
1456             res = default
1457         return res
1458
1459     def _format_note(self, fdict):
1460         res = ''
1461         if fdict.get('ext') in ['f4f', 'f4m']:
1462             res += '(unsupported) '
1463         if fdict.get('format_note') is not None:
1464             res += fdict['format_note'] + ' '
1465         if fdict.get('tbr') is not None:
1466             res += '%4dk ' % fdict['tbr']
1467         if fdict.get('container') is not None:
1468             if res:
1469                 res += ', '
1470             res += '%s container' % fdict['container']
1471         if (fdict.get('vcodec') is not None and
1472                 fdict.get('vcodec') != 'none'):
1473             if res:
1474                 res += ', '
1475             res += fdict['vcodec']
1476             if fdict.get('vbr') is not None:
1477                 res += '@'
1478         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1479             res += 'video@'
1480         if fdict.get('vbr') is not None:
1481             res += '%4dk' % fdict['vbr']
1482         if fdict.get('fps') is not None:
1483             res += ', %sfps' % fdict['fps']
1484         if fdict.get('acodec') is not None:
1485             if res:
1486                 res += ', '
1487             if fdict['acodec'] == 'none':
1488                 res += 'video only'
1489             else:
1490                 res += '%-5s' % fdict['acodec']
1491         elif fdict.get('abr') is not None:
1492             if res:
1493                 res += ', '
1494             res += 'audio'
1495         if fdict.get('abr') is not None:
1496             res += '@%3dk' % fdict['abr']
1497         if fdict.get('asr') is not None:
1498             res += ' (%5dHz)' % fdict['asr']
1499         if fdict.get('filesize') is not None:
1500             if res:
1501                 res += ', '
1502             res += format_bytes(fdict['filesize'])
1503         elif fdict.get('filesize_approx') is not None:
1504             if res:
1505                 res += ', '
1506             res += '~' + format_bytes(fdict['filesize_approx'])
1507         return res
1508
1509     def list_formats(self, info_dict):
1510         def line(format, idlen=20):
1511             return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
1512                 format['format_id'],
1513                 format['ext'],
1514                 self.format_resolution(format),
1515                 self._format_note(format),
1516             ))
1517
1518         formats = info_dict.get('formats', [info_dict])
1519         idlen = max(len('format code'),
1520                     max(len(f['format_id']) for f in formats))
1521         formats_s = [
1522             line(f, idlen) for f in formats
1523             if f.get('preference') is None or f['preference'] >= -1000]
1524         if len(formats) > 1:
1525             formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)'
1526             formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
1527
1528         header_line = line({
1529             'format_id': 'format code', 'ext': 'extension',
1530             'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
1531         self.to_screen(
1532             '[info] Available formats for %s:\n%s\n%s' %
1533             (info_dict['id'], header_line, '\n'.join(formats_s)))
1534
1535     def list_thumbnails(self, info_dict):
1536         thumbnails = info_dict.get('thumbnails')
1537         if not thumbnails:
1538             tn_url = info_dict.get('thumbnail')
1539             if tn_url:
1540                 thumbnails = [{'id': '0', 'url': tn_url}]
1541             else:
1542                 self.to_screen(
1543                     '[info] No thumbnails present for %s' % info_dict['id'])
1544                 return
1545
1546         self.to_screen(
1547             '[info] Thumbnails for %s:' % info_dict['id'])
1548         self.to_screen(render_table(
1549             ['ID', 'width', 'height', 'URL'],
1550             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1551
1552     def urlopen(self, req):
1553         """ Start an HTTP download """
1554
1555         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1556         # always respected by websites, some tend to give out URLs with non percent-encoded
1557         # non-ASCII characters (see telemb.py, ard.py [#3412])
1558         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1559         # To work around aforementioned issue we will replace request's original URL with
1560         # percent-encoded one
1561         req_is_string = isinstance(req, basestring if sys.version_info < (3, 0) else compat_str)
1562         url = req if req_is_string else req.get_full_url()
1563         url_escaped = escape_url(url)
1564
1565         # Substitute URL if any change after escaping
1566         if url != url_escaped:
1567             if req_is_string:
1568                 req = url_escaped
1569             else:
1570                 req = compat_urllib_request.Request(
1571                     url_escaped, data=req.data, headers=req.headers,
1572                     origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1573
1574         return self._opener.open(req, timeout=self._socket_timeout)
1575
1576     def print_debug_header(self):
1577         if not self.params.get('verbose'):
1578             return
1579
1580         if type('') is not compat_str:
1581             # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
1582             self.report_warning(
1583                 'Your Python is broken! Update to a newer and supported version')
1584
1585         stdout_encoding = getattr(
1586             sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
1587         encoding_str = (
1588             '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
1589                 locale.getpreferredencoding(),
1590                 sys.getfilesystemencoding(),
1591                 stdout_encoding,
1592                 self.get_encoding()))
1593         write_string(encoding_str, encoding=None)
1594
1595         self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
1596         try:
1597             sp = subprocess.Popen(
1598                 ['git', 'rev-parse', '--short', 'HEAD'],
1599                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1600                 cwd=os.path.dirname(os.path.abspath(__file__)))
1601             out, err = sp.communicate()
1602             out = out.decode().strip()
1603             if re.match('[0-9a-f]+', out):
1604                 self._write_string('[debug] Git HEAD: ' + out + '\n')
1605         except:
1606             try:
1607                 sys.exc_clear()
1608             except:
1609                 pass
1610         self._write_string('[debug] Python version %s - %s\n' % (
1611             platform.python_version(), platform_name()))
1612
1613         exe_versions = FFmpegPostProcessor.get_versions()
1614         exe_versions['rtmpdump'] = rtmpdump_version()
1615         exe_str = ', '.join(
1616             '%s %s' % (exe, v)
1617             for exe, v in sorted(exe_versions.items())
1618             if v
1619         )
1620         if not exe_str:
1621             exe_str = 'none'
1622         self._write_string('[debug] exe versions: %s\n' % exe_str)
1623
1624         proxy_map = {}
1625         for handler in self._opener.handlers:
1626             if hasattr(handler, 'proxies'):
1627                 proxy_map.update(handler.proxies)
1628         self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1629
1630         if self.params.get('call_home', False):
1631             ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
1632             self._write_string('[debug] Public IP address: %s\n' % ipaddr)
1633             latest_version = self.urlopen(
1634                 'https://yt-dl.org/latest/version').read().decode('utf-8')
1635             if version_tuple(latest_version) > version_tuple(__version__):
1636                 self.report_warning(
1637                     'You are using an outdated version (newest version: %s)! '
1638                     'See https://yt-dl.org/update if you need help updating.' %
1639                     latest_version)
1640
1641     def _setup_opener(self):
1642         timeout_val = self.params.get('socket_timeout')
1643         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
1644
1645         opts_cookiefile = self.params.get('cookiefile')
1646         opts_proxy = self.params.get('proxy')
1647
1648         if opts_cookiefile is None:
1649             self.cookiejar = compat_cookiejar.CookieJar()
1650         else:
1651             self.cookiejar = compat_cookiejar.MozillaCookieJar(
1652                 opts_cookiefile)
1653             if os.access(opts_cookiefile, os.R_OK):
1654                 self.cookiejar.load()
1655
1656         cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1657             self.cookiejar)
1658         if opts_proxy is not None:
1659             if opts_proxy == '':
1660                 proxies = {}
1661             else:
1662                 proxies = {'http': opts_proxy, 'https': opts_proxy}
1663         else:
1664             proxies = compat_urllib_request.getproxies()
1665             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1666             if 'http' in proxies and 'https' not in proxies:
1667                 proxies['https'] = proxies['http']
1668         proxy_handler = compat_urllib_request.ProxyHandler(proxies)
1669
1670         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1671         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
1672         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
1673         opener = compat_urllib_request.build_opener(
1674             https_handler, proxy_handler, cookie_processor, ydlh)
1675         # Delete the default user-agent header, which would otherwise apply in
1676         # cases where our custom HTTP handler doesn't come into play
1677         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1678         opener.addheaders = []
1679         self._opener = opener
1680
1681     def encode(self, s):
1682         if isinstance(s, bytes):
1683             return s  # Already encoded
1684
1685         try:
1686             return s.encode(self.get_encoding())
1687         except UnicodeEncodeError as err:
1688             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
1689             raise
1690
1691     def get_encoding(self):
1692         encoding = self.params.get('encoding')
1693         if encoding is None:
1694             encoding = preferredencoding()
1695         return encoding
1696
1697     def _write_thumbnails(self, info_dict, filename):
1698         if self.params.get('writethumbnail', False):
1699             thumbnails = info_dict.get('thumbnails')
1700             if thumbnails:
1701                 thumbnails = [thumbnails[-1]]
1702         elif self.params.get('write_all_thumbnails', False):
1703             thumbnails = info_dict.get('thumbnails')
1704         else:
1705             return
1706
1707         if not thumbnails:
1708             # No thumbnails present, so return immediately
1709             return
1710
1711         for t in thumbnails:
1712             thumb_ext = determine_ext(t['url'], 'jpg')
1713             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
1714             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
1715             thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
1716
1717             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
1718                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
1719                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
1720             else:
1721                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
1722                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
1723                 try:
1724                     uf = self.urlopen(t['url'])
1725                     with open(thumb_filename, 'wb') as thumbf:
1726                         shutil.copyfileobj(uf, thumbf)
1727                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
1728                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
1729                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1730                     self.report_warning('Unable to download thumbnail "%s": %s' %
1731                                         (t['url'], compat_str(err)))