_ Git - youtube-dl/blob - youtube_dl/YoutubeDL.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import, unicode_literals
   5
   6 import collections
   7 import datetime
   8 import errno
   9 import io
  10 import itertools
  11 import json
  12 import locale
  13 import operator
  14 import os
  15 import platform
  16 import re
  17 import shutil
  18 import subprocess
  19 import socket
  20 import sys
  21 import time
  22 import traceback
  23
  24 if os.name == 'nt':
  25     import ctypes
  26
  27 from .compat import (
  28     compat_cookiejar,
  29     compat_expanduser,
  30     compat_http_client,
  31     compat_kwargs,
  32     compat_str,
  33     compat_urllib_error,
  34     compat_urllib_request,
  35 )
  36 from .utils import (
  37     escape_url,
  38     ContentTooShortError,
  39     date_from_str,
  40     DateRange,
  41     DEFAULT_OUTTMPL,
  42     determine_ext,
  43     DownloadError,
  44     encodeFilename,
  45     ExtractorError,
  46     format_bytes,
  47     formatSeconds,
  48     get_term_width,
  49     locked_file,
  50     make_HTTPS_handler,
  51     MaxDownloadsReached,
  52     PagedList,
  53     parse_filesize,
  54     PostProcessingError,
  55     platform_name,
  56     preferredencoding,
  57     render_table,
  58     SameFileError,
  59     sanitize_filename,
  60     std_headers,
  61     subtitles_filename,
  62     takewhile_inclusive,
  63     UnavailableVideoError,
  64     url_basename,
  65     version_tuple,
  66     write_json_file,
  67     write_string,
  68     YoutubeDLHandler,
  69     prepend_extension,
  70     args_to_str,
  71     age_restricted,
  72 )
  73 from .cache import Cache
  74 from .extractor import get_info_extractor, gen_extractors
  75 from .downloader import get_suitable_downloader
  76 from .downloader.rtmp import rtmpdump_version
  77 from .postprocessor import (
  78     FFmpegFixupM4aPP,
  79     FFmpegFixupStretchedPP,
  80     FFmpegMergerPP,
  81     FFmpegPostProcessor,
  82     get_postprocessor,
  83 )
  84 from .version import __version__
  85
  86
  87 class YoutubeDL(object):
  88     """YoutubeDL class.
  89
  90     YoutubeDL objects are the ones responsible for downloading the
  91     actual video file and writing it to disk if the user has requested
  92     it, among some other tasks. In most cases there should be one per
  93     program. As, given a video URL, the downloader doesn't know how to
  94     extract all the needed information, task that InfoExtractors do, it
  95     has to pass the URL to one of them.
  96
  97     For this, YoutubeDL objects have a method that allows
  98     InfoExtractors to be registered in a given order. When it is passed
  99     a URL, the YoutubeDL object hands it to the first InfoExtractor it
 100     finds that reports being able to handle it. The InfoExtractor extracts
 101     all the information about the video or videos the URL refers to, and
 102     YoutubeDL processes the extracted information, possibly using a File
 103     Downloader to download the video.
 104
 105     YoutubeDL objects accept a lot of parameters. In order not to saturate
 106     the object constructor with arguments, it receives a dictionary of
 107     options instead. These options are available through the params
 108     attribute for the InfoExtractors to use. The YoutubeDL also
 109     registers itself as the downloader in charge for the InfoExtractors
 110     that are added to it, so this is a "mutual registration".
 111
 112     Available options:
 113
 114     username:          Username for authentication purposes.
 115     password:          Password for authentication purposes.
 116     videopassword:     Password for accessing a video.
 117     usenetrc:          Use netrc for authentication instead.
 118     verbose:           Print additional info to stdout.
 119     quiet:             Do not print messages to stdout.
 120     no_warnings:       Do not print out anything for warnings.
 121     forceurl:          Force printing final URL.
 122     forcetitle:        Force printing title.
 123     forceid:           Force printing ID.
 124     forcethumbnail:    Force printing thumbnail URL.
 125     forcedescription:  Force printing description.
 126     forcefilename:     Force printing final filename.
 127     forceduration:     Force printing duration.
 128     forcejson:         Force printing info_dict as JSON.
 129     dump_single_json:  Force printing the info_dict of the whole playlist
 130                        (or video) as a single JSON line.
 131     simulate:          Do not download the video files.
 132     format:            Video format code. See options.py for more information.
 133     format_limit:      Highest quality format to try.
 134     outtmpl:           Template for output names.
 135     restrictfilenames: Do not allow "&" and spaces in file names
 136     ignoreerrors:      Do not stop on download errors.
 137     nooverwrites:      Prevent overwriting files.
 138     playliststart:     Playlist item to start at.
 139     playlistend:       Playlist item to end at.
 140     playlist_items:    Specific indices of playlist to download.
 141     playlistreverse:   Download playlist items in reverse order.
 142     matchtitle:        Download only matching titles.
 143     rejecttitle:       Reject downloads for matching titles.
 144     logger:            Log messages to a logging.Logger instance.
 145     logtostderr:       Log messages to stderr instead of stdout.
 146     writedescription:  Write the video description to a .description file
 147     writeinfojson:     Write the video description to a .info.json file
 148     writeannotations:  Write the video annotations to a .annotations.xml file
 149     writethumbnail:    Write the thumbnail image to a file
 150     write_all_thumbnails:  Write all thumbnail formats to files
 151     writesubtitles:    Write the video subtitles to a file
 152     writeautomaticsub: Write the automatic subtitles to a file
 153     allsubtitles:      Downloads all the subtitles of the video
 154                        (requires writesubtitles or writeautomaticsub)
 155     listsubtitles:     Lists all available subtitles for the video
 156     subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
 157     subtitleslangs:    List of languages of the subtitles to download
 158     keepvideo:         Keep the video file after post-processing
 159     daterange:         A DateRange object, download only if the upload_date is in the range.
 160     skip_download:     Skip the actual download of the video file
 161     cachedir:          Location of the cache files in the filesystem.
 162                        False to disable filesystem cache.
 163     noplaylist:        Download single video instead of a playlist if in doubt.
 164     age_limit:         An integer representing the user's age in years.
 165                        Unsuitable videos for the given age are skipped.
 166     min_views:         An integer representing the minimum view count the video
 167                        must have in order to not be skipped.
 168                        Videos without view count information are always
 169                        downloaded. None for no limit.
 170     max_views:         An integer representing the maximum view count.
 171                        Videos that are more popular than that are not
 172                        downloaded.
 173                        Videos without view count information are always
 174                        downloaded. None for no limit.
 175     download_archive:  File name of a file where all downloads are recorded.
 176                        Videos already present in the file are not downloaded
 177                        again.
 178     cookiefile:        File name where cookies should be read from and dumped to.
 179     nocheckcertificate:Do not verify SSL certificates
 180     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
 181                        At the moment, this is only supported by YouTube.
 182     proxy:             URL of the proxy server to use
 183     socket_timeout:    Time to wait for unresponsive hosts, in seconds
 184     bidi_workaround:   Work around buggy terminals without bidirectional text
 185                        support, using fribidi
 186     debug_printtraffic:Print out sent and received HTTP traffic
 187     include_ads:       Download ads as well
 188     default_search:    Prepend this string if an input url is not valid.
 189                        'auto' for elaborate guessing
 190     encoding:          Use this encoding instead of the system-specified.
 191     extract_flat:      Do not resolve URLs, return the immediate result.
 192                        Pass in 'in_playlist' to only show this behavior for
 193                        playlist items.
 194     postprocessors:    A list of dictionaries, each with an entry
 195                        * key:  The name of the postprocessor. See
 196                                youtube_dl/postprocessor/__init__.py for a list.
 197                        as well as any further keyword arguments for the
 198                        postprocessor.
 199     progress_hooks:    A list of functions that get called on download
 200                        progress, with a dictionary with the entries
 201                        * status: One of "downloading" and "finished".
 202                                  Check this first and ignore unknown values.
 203
 204                        If status is one of "downloading" or "finished", the
 205                        following properties may also be present:
 206                        * filename: The final filename (always present)
 207                        * downloaded_bytes: Bytes on disk
 208                        * total_bytes: Size of the whole file, None if unknown
 209                        * tmpfilename: The filename we're currently writing to
 210                        * eta: The estimated time in seconds, None if unknown
 211                        * speed: The download speed in bytes/second, None if
 212                                 unknown
 213
 214                        Progress hooks are guaranteed to be called at least once
 215                        (with status "finished") if the download is successful.
 216     merge_output_format: Extension to use when merging formats.
 217     fixup:             Automatically correct known faults of the file.
 218                        One of:
 219                        - "never": do nothing
 220                        - "warn": only emit a warning
 221                        - "detect_or_warn": check whether we can do anything
 222                                            about it, warn otherwise (default)
 223     source_address:    (Experimental) Client-side IP address to bind to.
 224     call_home:         Boolean, true iff we are allowed to contact the
 225                        youtube-dl servers for debugging.
 226     sleep_interval:    Number of seconds to sleep before each download.
 227     external_downloader:  Executable of the external downloader to call.
 228     listformats:       Print an overview of available video formats and exit.
 229     list_thumbnails:   Print a table of all thumbnails and exit.
 230
 231
 232     The following parameters are not used by YoutubeDL itself, they are used by
 233     the FileDownloader:
 234     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
 235     noresizebuffer, retries, continuedl, noprogress, consoletitle,
 236     xattr_set_filesize.
 237
 238     The following options are used by the post processors:
 239     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
 240                        otherwise prefer avconv.
 241     exec_cmd:          Arbitrary command to run after downloading
 242     """
 243
 244     params = None
 245     _ies = []
 246     _pps = []
 247     _download_retcode = None
 248     _num_downloads = None
 249     _screen_file = None
 250
 251     def __init__(self, params=None, auto_init=True):
 252         """Create a FileDownloader object with the given options."""
 253         if params is None:
 254             params = {}
 255         self._ies = []
 256         self._ies_instances = {}
 257         self._pps = []
 258         self._progress_hooks = []
 259         self._download_retcode = 0
 260         self._num_downloads = 0
 261         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 262         self._err_file = sys.stderr
 263         self.params = params
 264         self.cache = Cache(self)
 265
 266         if params.get('bidi_workaround', False):
 267             try:
 268                 import pty
 269                 master, slave = pty.openpty()
 270                 width = get_term_width()
 271                 if width is None:
 272                     width_args = []
 273                 else:
 274                     width_args = ['-w', str(width)]
 275                 sp_kwargs = dict(
 276                     stdin=subprocess.PIPE,
 277                     stdout=slave,
 278                     stderr=self._err_file)
 279                 try:
 280                     self._output_process = subprocess.Popen(
 281                         ['bidiv'] + width_args, **sp_kwargs
 282                     )
 283                 except OSError:
 284                     self._output_process = subprocess.Popen(
 285                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
 286                 self._output_channel = os.fdopen(master, 'rb')
 287             except OSError as ose:
 288                 if ose.errno == 2:
 289                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
 290                 else:
 291                     raise
 292
 293         if (sys.version_info >= (3,) and sys.platform != 'win32' and
 294                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
 295                 and not params.get('restrictfilenames', False)):
 296             # On Python 3, the Unicode filesystem API will throw errors (#1474)
 297             self.report_warning(
 298                 'Assuming --restrict-filenames since file system encoding '
 299                 'cannot encode all characters. '
 300                 'Set the LC_ALL environment variable to fix this.')
 301             self.params['restrictfilenames'] = True
 302
 303         if '%(stitle)s' in self.params.get('outtmpl', ''):
 304             self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
 305
 306         self._setup_opener()
 307
 308         if auto_init:
 309             self.print_debug_header()
 310             self.add_default_info_extractors()
 311
 312         for pp_def_raw in self.params.get('postprocessors', []):
 313             pp_class = get_postprocessor(pp_def_raw['key'])
 314             pp_def = dict(pp_def_raw)
 315             del pp_def['key']
 316             pp = pp_class(self, **compat_kwargs(pp_def))
 317             self.add_post_processor(pp)
 318
 319         for ph in self.params.get('progress_hooks', []):
 320             self.add_progress_hook(ph)
 321
 322     def warn_if_short_id(self, argv):
 323         # short YouTube ID starting with dash?
 324         idxs = [
 325             i for i, a in enumerate(argv)
 326             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
 327         if idxs:
 328             correct_argv = (
 329                 ['youtube-dl'] +
 330                 [a for i, a in enumerate(argv) if i not in idxs] +
 331                 ['--'] + [argv[i] for i in idxs]
 332             )
 333             self.report_warning(
 334                 'Long argument string detected. '
 335                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
 336                 args_to_str(correct_argv))
 337
 338     def add_info_extractor(self, ie):
 339         """Add an InfoExtractor object to the end of the list."""
 340         self._ies.append(ie)
 341         self._ies_instances[ie.ie_key()] = ie
 342         ie.set_downloader(self)
 343
 344     def get_info_extractor(self, ie_key):
 345         """
 346         Get an instance of an IE with name ie_key, it will try to get one from
 347         the _ies list, if there's no instance it will create a new one and add
 348         it to the extractor list.
 349         """
 350         ie = self._ies_instances.get(ie_key)
 351         if ie is None:
 352             ie = get_info_extractor(ie_key)()
 353             self.add_info_extractor(ie)
 354         return ie
 355
 356     def add_default_info_extractors(self):
 357         """
 358         Add the InfoExtractors returned by gen_extractors to the end of the list
 359         """
 360         for ie in gen_extractors():
 361             self.add_info_extractor(ie)
 362
 363     def add_post_processor(self, pp):
 364         """Add a PostProcessor object to the end of the chain."""
 365         self._pps.append(pp)
 366         pp.set_downloader(self)
 367
 368     def add_progress_hook(self, ph):
 369         """Add the progress hook (currently only for the file downloader)"""
 370         self._progress_hooks.append(ph)
 371
 372     def _bidi_workaround(self, message):
 373         if not hasattr(self, '_output_channel'):
 374             return message
 375
 376         assert hasattr(self, '_output_process')
 377         assert isinstance(message, compat_str)
 378         line_count = message.count('\n') + 1
 379         self._output_process.stdin.write((message + '\n').encode('utf-8'))
 380         self._output_process.stdin.flush()
 381         res = ''.join(self._output_channel.readline().decode('utf-8')
 382                       for _ in range(line_count))
 383         return res[:-len('\n')]
 384
 385     def to_screen(self, message, skip_eol=False):
 386         """Print message to stdout if not in quiet mode."""
 387         return self.to_stdout(message, skip_eol, check_quiet=True)
 388
 389     def _write_string(self, s, out=None):
 390         write_string(s, out=out, encoding=self.params.get('encoding'))
 391
 392     def to_stdout(self, message, skip_eol=False, check_quiet=False):
 393         """Print message to stdout if not in quiet mode."""
 394         if self.params.get('logger'):
 395             self.params['logger'].debug(message)
 396         elif not check_quiet or not self.params.get('quiet', False):
 397             message = self._bidi_workaround(message)
 398             terminator = ['\n', ''][skip_eol]
 399             output = message + terminator
 400
 401             self._write_string(output, self._screen_file)
 402
 403     def to_stderr(self, message):
 404         """Print message to stderr."""
 405         assert isinstance(message, compat_str)
 406         if self.params.get('logger'):
 407             self.params['logger'].error(message)
 408         else:
 409             message = self._bidi_workaround(message)
 410             output = message + '\n'
 411             self._write_string(output, self._err_file)
 412
 413     def to_console_title(self, message):
 414         if not self.params.get('consoletitle', False):
 415             return
 416         if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 417             # c_wchar_p() might not be necessary if `message` is
 418             # already of type unicode()
 419             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 420         elif 'TERM' in os.environ:
 421             self._write_string('\033]0;%s\007' % message, self._screen_file)
 422
 423     def save_console_title(self):
 424         if not self.params.get('consoletitle', False):
 425             return
 426         if 'TERM' in os.environ:
 427             # Save the title on stack
 428             self._write_string('\033[22;0t', self._screen_file)
 429
 430     def restore_console_title(self):
 431         if not self.params.get('consoletitle', False):
 432             return
 433         if 'TERM' in os.environ:
 434             # Restore the title from stack
 435             self._write_string('\033[23;0t', self._screen_file)
 436
 437     def __enter__(self):
 438         self.save_console_title()
 439         return self
 440
 441     def __exit__(self, *args):
 442         self.restore_console_title()
 443
 444         if self.params.get('cookiefile') is not None:
 445             self.cookiejar.save()
 446
 447     def trouble(self, message=None, tb=None):
 448         """Determine action to take when a download problem appears.
 449
 450         Depending on if the downloader has been configured to ignore
 451         download errors or not, this method may throw an exception or
 452         not when errors are found, after printing the message.
 453
 454         tb, if given, is additional traceback information.
 455         """
 456         if message is not None:
 457             self.to_stderr(message)
 458         if self.params.get('verbose'):
 459             if tb is None:
 460                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
 461                     tb = ''
 462                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 463                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
 464                     tb += compat_str(traceback.format_exc())
 465                 else:
 466                     tb_data = traceback.format_list(traceback.extract_stack())
 467                     tb = ''.join(tb_data)
 468             self.to_stderr(tb)
 469         if not self.params.get('ignoreerrors', False):
 470             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 471                 exc_info = sys.exc_info()[1].exc_info
 472             else:
 473                 exc_info = sys.exc_info()
 474             raise DownloadError(message, exc_info)
 475         self._download_retcode = 1
 476
 477     def report_warning(self, message):
 478         '''
 479         Print the message to stderr, it will be prefixed with 'WARNING:'
 480         If stderr is a tty file the 'WARNING:' will be colored
 481         '''
 482         if self.params.get('logger') is not None:
 483             self.params['logger'].warning(message)
 484         else:
 485             if self.params.get('no_warnings'):
 486                 return
 487             if self._err_file.isatty() and os.name != 'nt':
 488                 _msg_header = '\033[0;33mWARNING:\033[0m'
 489             else:
 490                 _msg_header = 'WARNING:'
 491             warning_message = '%s %s' % (_msg_header, message)
 492             self.to_stderr(warning_message)
 493
 494     def report_error(self, message, tb=None):
 495         '''
 496         Do the same as trouble, but prefixes the message with 'ERROR:', colored
 497         in red if stderr is a tty file.
 498         '''
 499         if self._err_file.isatty() and os.name != 'nt':
 500             _msg_header = '\033[0;31mERROR:\033[0m'
 501         else:
 502             _msg_header = 'ERROR:'
 503         error_message = '%s %s' % (_msg_header, message)
 504         self.trouble(error_message, tb)
 505
 506     def report_file_already_downloaded(self, file_name):
 507         """Report file has already been fully downloaded."""
 508         try:
 509             self.to_screen('[download] %s has already been downloaded' % file_name)
 510         except UnicodeEncodeError:
 511             self.to_screen('[download] The file has already been downloaded')
 512
 513     def prepare_filename(self, info_dict):
 514         """Generate the output filename."""
 515         try:
 516             template_dict = dict(info_dict)
 517
 518             template_dict['epoch'] = int(time.time())
 519             autonumber_size = self.params.get('autonumber_size')
 520             if autonumber_size is None:
 521                 autonumber_size = 5
 522             autonumber_templ = '%0' + str(autonumber_size) + 'd'
 523             template_dict['autonumber'] = autonumber_templ % self._num_downloads
 524             if template_dict.get('playlist_index') is not None:
 525                 template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
 526             if template_dict.get('resolution') is None:
 527                 if template_dict.get('width') and template_dict.get('height'):
 528                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
 529                 elif template_dict.get('height'):
 530                     template_dict['resolution'] = '%sp' % template_dict['height']
 531                 elif template_dict.get('width'):
 532                     template_dict['resolution'] = '?x%d' % template_dict['width']
 533
 534             sanitize = lambda k, v: sanitize_filename(
 535                 compat_str(v),
 536                 restricted=self.params.get('restrictfilenames'),
 537                 is_id=(k == 'id'))
 538             template_dict = dict((k, sanitize(k, v))
 539                                  for k, v in template_dict.items()
 540                                  if v is not None)
 541             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
 542
 543             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
 544             tmpl = compat_expanduser(outtmpl)
 545             filename = tmpl % template_dict
 546             # Temporary fix for #4787
 547             # 'Treat' all problem characters by passing filename through preferredencoding
 548             # to workaround encoding issues with subprocess on python2 @ Windows
 549             if sys.version_info < (3, 0) and sys.platform == 'win32':
 550                 filename = encodeFilename(filename, True).decode(preferredencoding())
 551             return filename
 552         except ValueError as err:
 553             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
 554             return None
 555
 556     def _match_entry(self, info_dict):
 557         """ Returns None iff the file should be downloaded """
 558
 559         video_title = info_dict.get('title', info_dict.get('id', 'video'))
 560         if 'title' in info_dict:
 561             # This can happen when we're just evaluating the playlist
 562             title = info_dict['title']
 563             matchtitle = self.params.get('matchtitle', False)
 564             if matchtitle:
 565                 if not re.search(matchtitle, title, re.IGNORECASE):
 566                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
 567             rejecttitle = self.params.get('rejecttitle', False)
 568             if rejecttitle:
 569                 if re.search(rejecttitle, title, re.IGNORECASE):
 570                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 571         date = info_dict.get('upload_date', None)
 572         if date is not None:
 573             dateRange = self.params.get('daterange', DateRange())
 574             if date not in dateRange:
 575                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
 576         view_count = info_dict.get('view_count', None)
 577         if view_count is not None:
 578             min_views = self.params.get('min_views')
 579             if min_views is not None and view_count < min_views:
 580                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
 581             max_views = self.params.get('max_views')
 582             if max_views is not None and view_count > max_views:
 583                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
 584         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
 585             return 'Skipping "%s" because it is age restricted' % title
 586         if self.in_download_archive(info_dict):
 587             return '%s has already been recorded in archive' % video_title
 588         return None
 589
 590     @staticmethod
 591     def add_extra_info(info_dict, extra_info):
 592         '''Set the keys from extra_info in info dict if they are missing'''
 593         for key, value in extra_info.items():
 594             info_dict.setdefault(key, value)
 595
 596     def extract_info(self, url, download=True, ie_key=None, extra_info={},
 597                      process=True):
 598         '''
 599         Returns a list with a dictionary for each video we find.
 600         If 'download', also downloads the videos.
 601         extra_info is a dict containing the extra values to add to each result
 602          '''
 603
 604         if ie_key:
 605             ies = [self.get_info_extractor(ie_key)]
 606         else:
 607             ies = self._ies
 608
 609         for ie in ies:
 610             if not ie.suitable(url):
 611                 continue
 612
 613             if not ie.working():
 614                 self.report_warning('The program functionality for this site has been marked as broken, '
 615                                     'and will probably not work.')
 616
 617             try:
 618                 ie_result = ie.extract(url)
 619                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
 620                     break
 621                 if isinstance(ie_result, list):
 622                     # Backwards compatibility: old IE result format
 623                     ie_result = {
 624                         '_type': 'compat_list',
 625                         'entries': ie_result,
 626                     }
 627                 self.add_default_extra_info(ie_result, ie, url)
 628                 if process:
 629                     return self.process_ie_result(ie_result, download, extra_info)
 630                 else:
 631                     return ie_result
 632             except ExtractorError as de:  # An error we somewhat expected
 633                 self.report_error(compat_str(de), de.format_traceback())
 634                 break
 635             except MaxDownloadsReached:
 636                 raise
 637             except Exception as e:
 638                 if self.params.get('ignoreerrors', False):
 639                     self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
 640                     break
 641                 else:
 642                     raise
 643         else:
 644             self.report_error('no suitable InfoExtractor for URL %s' % url)
 645
 646     def add_default_extra_info(self, ie_result, ie, url):
 647         self.add_extra_info(ie_result, {
 648             'extractor': ie.IE_NAME,
 649             'webpage_url': url,
 650             'webpage_url_basename': url_basename(url),
 651             'extractor_key': ie.ie_key(),
 652         })
 653
 654     def process_ie_result(self, ie_result, download=True, extra_info={}):
 655         """
 656         Take the result of the ie(may be modified) and resolve all unresolved
 657         references (URLs, playlist items).
 658
 659         It will also download the videos if 'download'.
 660         Returns the resolved ie_result.
 661         """
 662
 663         result_type = ie_result.get('_type', 'video')
 664
 665         if result_type in ('url', 'url_transparent'):
 666             extract_flat = self.params.get('extract_flat', False)
 667             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
 668                     extract_flat is True):
 669                 if self.params.get('forcejson', False):
 670                     self.to_stdout(json.dumps(ie_result))
 671                 return ie_result
 672
 673         if result_type == 'video':
 674             self.add_extra_info(ie_result, extra_info)
 675             return self.process_video_result(ie_result, download=download)
 676         elif result_type == 'url':
 677             # We have to add extra_info to the results because it may be
 678             # contained in a playlist
 679             return self.extract_info(ie_result['url'],
 680                                      download,
 681                                      ie_key=ie_result.get('ie_key'),
 682                                      extra_info=extra_info)
 683         elif result_type == 'url_transparent':
 684             # Use the information from the embedding page
 685             info = self.extract_info(
 686                 ie_result['url'], ie_key=ie_result.get('ie_key'),
 687                 extra_info=extra_info, download=False, process=False)
 688
 689             force_properties = dict(
 690                 (k, v) for k, v in ie_result.items() if v is not None)
 691             for f in ('_type', 'url'):
 692                 if f in force_properties:
 693                     del force_properties[f]
 694             new_result = info.copy()
 695             new_result.update(force_properties)
 696
 697             assert new_result.get('_type') != 'url_transparent'
 698
 699             return self.process_ie_result(
 700                 new_result, download=download, extra_info=extra_info)
 701         elif result_type == 'playlist' or result_type == 'multi_video':
 702             # We process each entry in the playlist
 703             playlist = ie_result.get('title', None) or ie_result.get('id', None)
 704             self.to_screen('[download] Downloading playlist: %s' % playlist)
 705
 706             playlist_results = []
 707
 708             playliststart = self.params.get('playliststart', 1) - 1
 709             playlistend = self.params.get('playlistend', None)
 710             # For backwards compatibility, interpret -1 as whole list
 711             if playlistend == -1:
 712                 playlistend = None
 713
 714             playlistitems_str = self.params.get('playlist_items', None)
 715             playlistitems = None
 716             if playlistitems_str is not None:
 717                 def iter_playlistitems(format):
 718                     for string_segment in format.split(','):
 719                         if '-' in string_segment:
 720                             start, end = string_segment.split('-')
 721                             for item in range(int(start), int(end) + 1):
 722                                 yield int(item)
 723                         else:
 724                             yield int(string_segment)
 725                 playlistitems = iter_playlistitems(playlistitems_str)
 726
 727             ie_entries = ie_result['entries']
 728             if isinstance(ie_entries, list):
 729                 n_all_entries = len(ie_entries)
 730                 if playlistitems:
 731                     entries = [ie_entries[i - 1] for i in playlistitems]
 732                 else:
 733                     entries = ie_entries[playliststart:playlistend]
 734                 n_entries = len(entries)
 735                 self.to_screen(
 736                     "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
 737                     (ie_result['extractor'], playlist, n_all_entries, n_entries))
 738             elif isinstance(ie_entries, PagedList):
 739                 if playlistitems:
 740                     entries = []
 741                     for item in playlistitems:
 742                         entries.extend(ie_entries.getslice(
 743                             item - 1, item
 744                         ))
 745                 else:
 746                     entries = ie_entries.getslice(
 747                         playliststart, playlistend)
 748                 n_entries = len(entries)
 749                 self.to_screen(
 750                     "[%s] playlist %s: Downloading %d videos" %
 751                     (ie_result['extractor'], playlist, n_entries))
 752             else:  # iterable
 753                 if playlistitems:
 754                     entry_list = list(ie_entries)
 755                     entries = [entry_list[i - 1] for i in playlistitems]
 756                 else:
 757                     entries = list(itertools.islice(
 758                         ie_entries, playliststart, playlistend))
 759                 n_entries = len(entries)
 760                 self.to_screen(
 761                     "[%s] playlist %s: Downloading %d videos" %
 762                     (ie_result['extractor'], playlist, n_entries))
 763
 764             if self.params.get('playlistreverse', False):
 765                 entries = entries[::-1]
 766
 767             for i, entry in enumerate(entries, 1):
 768                 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
 769                 extra = {
 770                     'n_entries': n_entries,
 771                     'playlist': playlist,
 772                     'playlist_id': ie_result.get('id'),
 773                     'playlist_title': ie_result.get('title'),
 774                     'playlist_index': i + playliststart,
 775                     'extractor': ie_result['extractor'],
 776                     'webpage_url': ie_result['webpage_url'],
 777                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
 778                     'extractor_key': ie_result['extractor_key'],
 779                 }
 780
 781                 reason = self._match_entry(entry)
 782                 if reason is not None:
 783                     self.to_screen('[download] ' + reason)
 784                     continue
 785
 786                 entry_result = self.process_ie_result(entry,
 787                                                       download=download,
 788                                                       extra_info=extra)
 789                 playlist_results.append(entry_result)
 790             ie_result['entries'] = playlist_results
 791             return ie_result
 792         elif result_type == 'compat_list':
 793             self.report_warning(
 794                 'Extractor %s returned a compat_list result. '
 795                 'It needs to be updated.' % ie_result.get('extractor'))
 796
 797             def _fixup(r):
 798                 self.add_extra_info(
 799                     r,
 800                     {
 801                         'extractor': ie_result['extractor'],
 802                         'webpage_url': ie_result['webpage_url'],
 803                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
 804                         'extractor_key': ie_result['extractor_key'],
 805                     }
 806                 )
 807                 return r
 808             ie_result['entries'] = [
 809                 self.process_ie_result(_fixup(r), download, extra_info)
 810                 for r in ie_result['entries']
 811             ]
 812             return ie_result
 813         else:
 814             raise Exception('Invalid result type: %s' % result_type)
 815
 816     def _apply_format_filter(self, format_spec, available_formats):
 817         " Returns a tuple of the remaining format_spec and filtered formats "
 818
 819         OPERATORS = {
 820             '<': operator.lt,
 821             '<=': operator.le,
 822             '>': operator.gt,
 823             '>=': operator.ge,
 824             '=': operator.eq,
 825             '!=': operator.ne,
 826         }
 827         operator_rex = re.compile(r'''(?x)\s*\[
 828             (?P<key>width|height|tbr|abr|vbr|filesize|fps)
 829             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
 830             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
 831             \]$
 832             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
 833         m = operator_rex.search(format_spec)
 834         if not m:
 835             raise ValueError('Invalid format specification %r' % format_spec)
 836
 837         try:
 838             comparison_value = int(m.group('value'))
 839         except ValueError:
 840             comparison_value = parse_filesize(m.group('value'))
 841             if comparison_value is None:
 842                 comparison_value = parse_filesize(m.group('value') + 'B')
 843             if comparison_value is None:
 844                 raise ValueError(
 845                     'Invalid value %r in format specification %r' % (
 846                         m.group('value'), format_spec))
 847         op = OPERATORS[m.group('op')]
 848
 849         def _filter(f):
 850             actual_value = f.get(m.group('key'))
 851             if actual_value is None:
 852                 return m.group('none_inclusive')
 853             return op(actual_value, comparison_value)
 854         new_formats = [f for f in available_formats if _filter(f)]
 855
 856         new_format_spec = format_spec[:-len(m.group(0))]
 857         if not new_format_spec:
 858             new_format_spec = 'best'
 859
 860         return (new_format_spec, new_formats)
 861
 862     def select_format(self, format_spec, available_formats):
 863         while format_spec.endswith(']'):
 864             format_spec, available_formats = self._apply_format_filter(
 865                 format_spec, available_formats)
 866         if not available_formats:
 867             return None
 868
 869         if format_spec == 'best' or format_spec is None:
 870             return available_formats[-1]
 871         elif format_spec == 'worst':
 872             return available_formats[0]
 873         elif format_spec == 'bestaudio':
 874             audio_formats = [
 875                 f for f in available_formats
 876                 if f.get('vcodec') == 'none']
 877             if audio_formats:
 878                 return audio_formats[-1]
 879         elif format_spec == 'worstaudio':
 880             audio_formats = [
 881                 f for f in available_formats
 882                 if f.get('vcodec') == 'none']
 883             if audio_formats:
 884                 return audio_formats[0]
 885         elif format_spec == 'bestvideo':
 886             video_formats = [
 887                 f for f in available_formats
 888                 if f.get('acodec') == 'none']
 889             if video_formats:
 890                 return video_formats[-1]
 891         elif format_spec == 'worstvideo':
 892             video_formats = [
 893                 f for f in available_formats
 894                 if f.get('acodec') == 'none']
 895             if video_formats:
 896                 return video_formats[0]
 897         else:
 898             extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
 899             if format_spec in extensions:
 900                 filter_f = lambda f: f['ext'] == format_spec
 901             else:
 902                 filter_f = lambda f: f['format_id'] == format_spec
 903             matches = list(filter(filter_f, available_formats))
 904             if matches:
 905                 return matches[-1]
 906         return None
 907
 908     def _calc_headers(self, info_dict):
 909         res = std_headers.copy()
 910
 911         add_headers = info_dict.get('http_headers')
 912         if add_headers:
 913             res.update(add_headers)
 914
 915         cookies = self._calc_cookies(info_dict)
 916         if cookies:
 917             res['Cookie'] = cookies
 918
 919         return res
 920
 921     def _calc_cookies(self, info_dict):
 922         class _PseudoRequest(object):
 923             def __init__(self, url):
 924                 self.url = url
 925                 self.headers = {}
 926                 self.unverifiable = False
 927
 928             def add_unredirected_header(self, k, v):
 929                 self.headers[k] = v
 930
 931             def get_full_url(self):
 932                 return self.url
 933
 934             def is_unverifiable(self):
 935                 return self.unverifiable
 936
 937             def has_header(self, h):
 938                 return h in self.headers
 939
 940         pr = _PseudoRequest(info_dict['url'])
 941         self.cookiejar.add_cookie_header(pr)
 942         return pr.headers.get('Cookie')
 943
 944     def process_video_result(self, info_dict, download=True):
 945         assert info_dict.get('_type', 'video') == 'video'
 946
 947         if 'id' not in info_dict:
 948             raise ExtractorError('Missing "id" field in extractor result')
 949         if 'title' not in info_dict:
 950             raise ExtractorError('Missing "title" field in extractor result')
 951
 952         if 'playlist' not in info_dict:
 953             # It isn't part of a playlist
 954             info_dict['playlist'] = None
 955             info_dict['playlist_index'] = None
 956
 957         thumbnails = info_dict.get('thumbnails')
 958         if thumbnails is None:
 959             thumbnail = info_dict.get('thumbnail')
 960             if thumbnail:
 961                 thumbnails = [{'url': thumbnail}]
 962         if thumbnails:
 963             thumbnails.sort(key=lambda t: (
 964                 t.get('preference'), t.get('width'), t.get('height'),
 965                 t.get('id'), t.get('url')))
 966             for t in thumbnails:
 967                 if 'width' in t and 'height' in t:
 968                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
 969
 970         if thumbnails and 'thumbnail' not in info_dict:
 971             info_dict['thumbnail'] = thumbnails[-1]['url']
 972
 973         if 'display_id' not in info_dict and 'id' in info_dict:
 974             info_dict['display_id'] = info_dict['id']
 975
 976         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
 977             # Working around negative timestamps in Windows
 978             # (see http://bugs.python.org/issue1646728)
 979             if info_dict['timestamp'] < 0 and os.name == 'nt':
 980                 info_dict['timestamp'] = 0
 981             upload_date = datetime.datetime.utcfromtimestamp(
 982                 info_dict['timestamp'])
 983             info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
 984
 985         # This extractors handle format selection themselves
 986         if info_dict['extractor'] in ['Youku']:
 987             if download:
 988                 self.process_info(info_dict)
 989             return info_dict
 990
 991         # We now pick which formats have to be downloaded
 992         if info_dict.get('formats') is None:
 993             # There's only one format available
 994             formats = [info_dict]
 995         else:
 996             formats = info_dict['formats']
 997
 998         if not formats:
 999             raise ExtractorError('No video formats found!')
1000
1001         # We check that all the formats have the format and format_id fields
1002         for i, format in enumerate(formats):
1003             if 'url' not in format:
1004                 raise ExtractorError('Missing "url" key in result (index %d)' % i)
1005
1006             if format.get('format_id') is None:
1007                 format['format_id'] = compat_str(i)
1008             if format.get('format') is None:
1009                 format['format'] = '{id} - {res}{note}'.format(
1010                     id=format['format_id'],
1011                     res=self.format_resolution(format),
1012                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1013                 )
1014             # Automatically determine file extension if missing
1015             if 'ext' not in format:
1016                 format['ext'] = determine_ext(format['url']).lower()
1017             # Add HTTP headers, so that external programs can use them from the
1018             # json output
1019             full_format_info = info_dict.copy()
1020             full_format_info.update(format)
1021             format['http_headers'] = self._calc_headers(full_format_info)
1022
1023         format_limit = self.params.get('format_limit', None)
1024         if format_limit:
1025             formats = list(takewhile_inclusive(
1026                 lambda f: f['format_id'] != format_limit, formats
1027             ))
1028
1029         # TODO Central sorting goes here
1030
1031         if formats[0] is not info_dict:
1032             # only set the 'formats' fields if the original info_dict list them
1033             # otherwise we end up with a circular reference, the first (and unique)
1034             # element in the 'formats' field in info_dict is info_dict itself,
1035             # wich can't be exported to json
1036             info_dict['formats'] = formats
1037         if self.params.get('listformats'):
1038             self.list_formats(info_dict)
1039             return
1040         if self.params.get('list_thumbnails'):
1041             self.list_thumbnails(info_dict)
1042             return
1043
1044         req_format = self.params.get('format')
1045         if req_format is None:
1046             req_format = 'best'
1047         formats_to_download = []
1048         # The -1 is for supporting YoutubeIE
1049         if req_format in ('-1', 'all'):
1050             formats_to_download = formats
1051         else:
1052             for rfstr in req_format.split(','):
1053                 # We can accept formats requested in the format: 34/5/best, we pick
1054                 # the first that is available, starting from left
1055                 req_formats = rfstr.split('/')
1056                 for rf in req_formats:
1057                     if re.match(r'.+?\+.+?', rf) is not None:
1058                         # Two formats have been requested like '137+139'
1059                         format_1, format_2 = rf.split('+')
1060                         formats_info = (self.select_format(format_1, formats),
1061                                         self.select_format(format_2, formats))
1062                         if all(formats_info):
1063                             # The first format must contain the video and the
1064                             # second the audio
1065                             if formats_info[0].get('vcodec') == 'none':
1066                                 self.report_error('The first format must '
1067                                                   'contain the video, try using '
1068                                                   '"-f %s+%s"' % (format_2, format_1))
1069                                 return
1070                             output_ext = (
1071                                 formats_info[0]['ext']
1072                                 if self.params.get('merge_output_format') is None
1073                                 else self.params['merge_output_format'])
1074                             selected_format = {
1075                                 'requested_formats': formats_info,
1076                                 'format': rf,
1077                                 'ext': formats_info[0]['ext'],
1078                                 'width': formats_info[0].get('width'),
1079                                 'height': formats_info[0].get('height'),
1080                                 'resolution': formats_info[0].get('resolution'),
1081                                 'fps': formats_info[0].get('fps'),
1082                                 'vcodec': formats_info[0].get('vcodec'),
1083                                 'vbr': formats_info[0].get('vbr'),
1084                                 'stretched_ratio': formats_info[0].get('stretched_ratio'),
1085                                 'acodec': formats_info[1].get('acodec'),
1086                                 'abr': formats_info[1].get('abr'),
1087                                 'ext': output_ext,
1088                             }
1089                         else:
1090                             selected_format = None
1091                     else:
1092                         selected_format = self.select_format(rf, formats)
1093                     if selected_format is not None:
1094                         formats_to_download.append(selected_format)
1095                         break
1096         if not formats_to_download:
1097             raise ExtractorError('requested format not available',
1098                                  expected=True)
1099
1100         if download:
1101             if len(formats_to_download) > 1:
1102                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1103             for format in formats_to_download:
1104                 new_info = dict(info_dict)
1105                 new_info.update(format)
1106                 self.process_info(new_info)
1107         # We update the info dict with the best quality format (backwards compatibility)
1108         info_dict.update(formats_to_download[-1])
1109         return info_dict
1110
1111     def process_info(self, info_dict):
1112         """Process a single resolved IE result."""
1113
1114         assert info_dict.get('_type', 'video') == 'video'
1115
1116         max_downloads = self.params.get('max_downloads')
1117         if max_downloads is not None:
1118             if self._num_downloads >= int(max_downloads):
1119                 raise MaxDownloadsReached()
1120
1121         info_dict['fulltitle'] = info_dict['title']
1122         if len(info_dict['title']) > 200:
1123             info_dict['title'] = info_dict['title'][:197] + '...'
1124
1125         # Keep for backwards compatibility
1126         info_dict['stitle'] = info_dict['title']
1127
1128         if 'format' not in info_dict:
1129             info_dict['format'] = info_dict['ext']
1130
1131         reason = self._match_entry(info_dict)
1132         if reason is not None:
1133             self.to_screen('[download] ' + reason)
1134             return
1135
1136         self._num_downloads += 1
1137
1138         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1139
1140         # Forced printings
1141         if self.params.get('forcetitle', False):
1142             self.to_stdout(info_dict['fulltitle'])
1143         if self.params.get('forceid', False):
1144             self.to_stdout(info_dict['id'])
1145         if self.params.get('forceurl', False):
1146             if info_dict.get('requested_formats') is not None:
1147                 for f in info_dict['requested_formats']:
1148                     self.to_stdout(f['url'] + f.get('play_path', ''))
1149             else:
1150                 # For RTMP URLs, also include the playpath
1151                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1152         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1153             self.to_stdout(info_dict['thumbnail'])
1154         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1155             self.to_stdout(info_dict['description'])
1156         if self.params.get('forcefilename', False) and filename is not None:
1157             self.to_stdout(filename)
1158         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1159             self.to_stdout(formatSeconds(info_dict['duration']))
1160         if self.params.get('forceformat', False):
1161             self.to_stdout(info_dict['format'])
1162         if self.params.get('forcejson', False):
1163             self.to_stdout(json.dumps(info_dict))
1164
1165         # Do nothing else if in simulate mode
1166         if self.params.get('simulate', False):
1167             return
1168
1169         if filename is None:
1170             return
1171
1172         try:
1173             dn = os.path.dirname(encodeFilename(filename))
1174             if dn and not os.path.exists(dn):
1175                 os.makedirs(dn)
1176         except (OSError, IOError) as err:
1177             self.report_error('unable to create directory ' + compat_str(err))
1178             return
1179
1180         if self.params.get('writedescription', False):
1181             descfn = filename + '.description'
1182             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1183                 self.to_screen('[info] Video description is already present')
1184             elif info_dict.get('description') is None:
1185                 self.report_warning('There\'s no description to write.')
1186             else:
1187                 try:
1188                     self.to_screen('[info] Writing video description to: ' + descfn)
1189                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1190                         descfile.write(info_dict['description'])
1191                 except (OSError, IOError):
1192                     self.report_error('Cannot write description file ' + descfn)
1193                     return
1194
1195         if self.params.get('writeannotations', False):
1196             annofn = filename + '.annotations.xml'
1197             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1198                 self.to_screen('[info] Video annotations are already present')
1199             else:
1200                 try:
1201                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1202                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1203                         annofile.write(info_dict['annotations'])
1204                 except (KeyError, TypeError):
1205                     self.report_warning('There are no annotations to write.')
1206                 except (OSError, IOError):
1207                     self.report_error('Cannot write annotations file: ' + annofn)
1208                     return
1209
1210         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1211                                        self.params.get('writeautomaticsub')])
1212
1213         if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
1214             # subtitles download errors are already managed as troubles in relevant IE
1215             # that way it will silently go on when used with unsupporting IE
1216             subtitles = info_dict['subtitles']
1217             sub_format = self.params.get('subtitlesformat', 'srt')
1218             for sub_lang in subtitles.keys():
1219                 sub = subtitles[sub_lang]
1220                 if sub is None:
1221                     continue
1222                 try:
1223                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1224                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1225                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1226                     else:
1227                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1228                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
1229                             subfile.write(sub)
1230                 except (OSError, IOError):
1231                     self.report_error('Cannot write subtitles file ' + sub_filename)
1232                     return
1233
1234         if self.params.get('writeinfojson', False):
1235             infofn = os.path.splitext(filename)[0] + '.info.json'
1236             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1237                 self.to_screen('[info] Video description metadata is already present')
1238             else:
1239                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1240                 try:
1241                     write_json_file(info_dict, infofn)
1242                 except (OSError, IOError):
1243                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1244                     return
1245
1246         self._write_thumbnails(info_dict, filename)
1247
1248         if not self.params.get('skip_download', False):
1249             try:
1250                 def dl(name, info):
1251                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1252                     for ph in self._progress_hooks:
1253                         fd.add_progress_hook(ph)
1254                     if self.params.get('verbose'):
1255                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1256                     return fd.download(name, info)
1257
1258                 if info_dict.get('requested_formats') is not None:
1259                     downloaded = []
1260                     success = True
1261                     merger = FFmpegMergerPP(self, not self.params.get('keepvideo'))
1262                     if not merger._executable:
1263                         postprocessors = []
1264                         self.report_warning('You have requested multiple '
1265                                             'formats but ffmpeg or avconv are not installed.'
1266                                             ' The formats won\'t be merged')
1267                     else:
1268                         postprocessors = [merger]
1269                     for f in info_dict['requested_formats']:
1270                         new_info = dict(info_dict)
1271                         new_info.update(f)
1272                         fname = self.prepare_filename(new_info)
1273                         fname = prepend_extension(fname, 'f%s' % f['format_id'])
1274                         downloaded.append(fname)
1275                         partial_success = dl(fname, new_info)
1276                         success = success and partial_success
1277                     info_dict['__postprocessors'] = postprocessors
1278                     info_dict['__files_to_merge'] = downloaded
1279                 else:
1280                     # Just a single file
1281                     success = dl(filename, info_dict)
1282             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1283                 self.report_error('unable to download video data: %s' % str(err))
1284                 return
1285             except (OSError, IOError) as err:
1286                 raise UnavailableVideoError(err)
1287             except (ContentTooShortError, ) as err:
1288                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1289                 return
1290
1291             if success:
1292                 # Fixup content
1293                 fixup_policy = self.params.get('fixup')
1294                 if fixup_policy is None:
1295                     fixup_policy = 'detect_or_warn'
1296
1297                 stretched_ratio = info_dict.get('stretched_ratio')
1298                 if stretched_ratio is not None and stretched_ratio != 1:
1299                     if fixup_policy == 'warn':
1300                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1301                             info_dict['id'], stretched_ratio))
1302                     elif fixup_policy == 'detect_or_warn':
1303                         stretched_pp = FFmpegFixupStretchedPP(self)
1304                         if stretched_pp.available:
1305                             info_dict.setdefault('__postprocessors', [])
1306                             info_dict['__postprocessors'].append(stretched_pp)
1307                         else:
1308                             self.report_warning(
1309                                 '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
1310                                     info_dict['id'], stretched_ratio))
1311                     else:
1312                         assert fixup_policy in ('ignore', 'never')
1313
1314                 if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
1315                     if fixup_policy == 'warn':
1316                         self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
1317                             info_dict['id']))
1318                     elif fixup_policy == 'detect_or_warn':
1319                         fixup_pp = FFmpegFixupM4aPP(self)
1320                         if fixup_pp.available:
1321                             info_dict.setdefault('__postprocessors', [])
1322                             info_dict['__postprocessors'].append(fixup_pp)
1323                         else:
1324                             self.report_warning(
1325                                 '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
1326                                     info_dict['id']))
1327                     else:
1328                         assert fixup_policy in ('ignore', 'never')
1329
1330                 try:
1331                     self.post_process(filename, info_dict)
1332                 except (PostProcessingError) as err:
1333                     self.report_error('postprocessing: %s' % str(err))
1334                     return
1335                 self.record_download_archive(info_dict)
1336
1337     def download(self, url_list):
1338         """Download a given list of URLs."""
1339         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1340         if (len(url_list) > 1 and
1341                 '%' not in outtmpl
1342                 and self.params.get('max_downloads') != 1):
1343             raise SameFileError(outtmpl)
1344
1345         for url in url_list:
1346             try:
1347                 # It also downloads the videos
1348                 res = self.extract_info(url)
1349             except UnavailableVideoError:
1350                 self.report_error('unable to download video')
1351             except MaxDownloadsReached:
1352                 self.to_screen('[info] Maximum number of downloaded files reached.')
1353                 raise
1354             else:
1355                 if self.params.get('dump_single_json', False):
1356                     self.to_stdout(json.dumps(res))
1357
1358         return self._download_retcode
1359
1360     def download_with_info_file(self, info_filename):
1361         with io.open(info_filename, 'r', encoding='utf-8') as f:
1362             info = json.load(f)
1363         try:
1364             self.process_ie_result(info, download=True)
1365         except DownloadError:
1366             webpage_url = info.get('webpage_url')
1367             if webpage_url is not None:
1368                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1369                 return self.download([webpage_url])
1370             else:
1371                 raise
1372         return self._download_retcode
1373
1374     def post_process(self, filename, ie_info):
1375         """Run all the postprocessors on the given file."""
1376         info = dict(ie_info)
1377         info['filepath'] = filename
1378         pps_chain = []
1379         if ie_info.get('__postprocessors') is not None:
1380             pps_chain.extend(ie_info['__postprocessors'])
1381         pps_chain.extend(self._pps)
1382         for pp in pps_chain:
1383             keep_video = None
1384             old_filename = info['filepath']
1385             try:
1386                 keep_video_wish, info = pp.run(info)
1387                 if keep_video_wish is not None:
1388                     if keep_video_wish:
1389                         keep_video = keep_video_wish
1390                     elif keep_video is None:
1391                         # No clear decision yet, let IE decide
1392                         keep_video = keep_video_wish
1393             except PostProcessingError as e:
1394                 self.report_error(e.msg)
1395             if keep_video is False and not self.params.get('keepvideo', False):
1396                 try:
1397                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1398                     os.remove(encodeFilename(old_filename))
1399                 except (IOError, OSError):
1400                     self.report_warning('Unable to remove downloaded video file')
1401
1402     def _make_archive_id(self, info_dict):
1403         # Future-proof against any change in case
1404         # and backwards compatibility with prior versions
1405         extractor = info_dict.get('extractor_key')
1406         if extractor is None:
1407             if 'id' in info_dict:
1408                 extractor = info_dict.get('ie_key')  # key in a playlist
1409         if extractor is None:
1410             return None  # Incomplete video information
1411         return extractor.lower() + ' ' + info_dict['id']
1412
1413     def in_download_archive(self, info_dict):
1414         fn = self.params.get('download_archive')
1415         if fn is None:
1416             return False
1417
1418         vid_id = self._make_archive_id(info_dict)
1419         if vid_id is None:
1420             return False  # Incomplete video information
1421
1422         try:
1423             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1424                 for line in archive_file:
1425                     if line.strip() == vid_id:
1426                         return True
1427         except IOError as ioe:
1428             if ioe.errno != errno.ENOENT:
1429                 raise
1430         return False
1431
1432     def record_download_archive(self, info_dict):
1433         fn = self.params.get('download_archive')
1434         if fn is None:
1435             return
1436         vid_id = self._make_archive_id(info_dict)
1437         assert vid_id
1438         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1439             archive_file.write(vid_id + '\n')
1440
1441     @staticmethod
1442     def format_resolution(format, default='unknown'):
1443         if format.get('vcodec') == 'none':
1444             return 'audio only'
1445         if format.get('resolution') is not None:
1446             return format['resolution']
1447         if format.get('height') is not None:
1448             if format.get('width') is not None:
1449                 res = '%sx%s' % (format['width'], format['height'])
1450             else:
1451                 res = '%sp' % format['height']
1452         elif format.get('width') is not None:
1453             res = '?x%d' % format['width']
1454         else:
1455             res = default
1456         return res
1457
1458     def _format_note(self, fdict):
1459         res = ''
1460         if fdict.get('ext') in ['f4f', 'f4m']:
1461             res += '(unsupported) '
1462         if fdict.get('format_note') is not None:
1463             res += fdict['format_note'] + ' '
1464         if fdict.get('tbr') is not None:
1465             res += '%4dk ' % fdict['tbr']
1466         if fdict.get('container') is not None:
1467             if res:
1468                 res += ', '
1469             res += '%s container' % fdict['container']
1470         if (fdict.get('vcodec') is not None and
1471                 fdict.get('vcodec') != 'none'):
1472             if res:
1473                 res += ', '
1474             res += fdict['vcodec']
1475             if fdict.get('vbr') is not None:
1476                 res += '@'
1477         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1478             res += 'video@'
1479         if fdict.get('vbr') is not None:
1480             res += '%4dk' % fdict['vbr']
1481         if fdict.get('fps') is not None:
1482             res += ', %sfps' % fdict['fps']
1483         if fdict.get('acodec') is not None:
1484             if res:
1485                 res += ', '
1486             if fdict['acodec'] == 'none':
1487                 res += 'video only'
1488             else:
1489                 res += '%-5s' % fdict['acodec']
1490         elif fdict.get('abr') is not None:
1491             if res:
1492                 res += ', '
1493             res += 'audio'
1494         if fdict.get('abr') is not None:
1495             res += '@%3dk' % fdict['abr']
1496         if fdict.get('asr') is not None:
1497             res += ' (%5dHz)' % fdict['asr']
1498         if fdict.get('filesize') is not None:
1499             if res:
1500                 res += ', '
1501             res += format_bytes(fdict['filesize'])
1502         elif fdict.get('filesize_approx') is not None:
1503             if res:
1504                 res += ', '
1505             res += '~' + format_bytes(fdict['filesize_approx'])
1506         return res
1507
1508     def list_formats(self, info_dict):
1509         def line(format, idlen=20):
1510             return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
1511                 format['format_id'],
1512                 format['ext'],
1513                 self.format_resolution(format),
1514                 self._format_note(format),
1515             ))
1516
1517         formats = info_dict.get('formats', [info_dict])
1518         idlen = max(len('format code'),
1519                     max(len(f['format_id']) for f in formats))
1520         formats_s = [
1521             line(f, idlen) for f in formats
1522             if f.get('preference') is None or f['preference'] >= -1000]
1523         if len(formats) > 1:
1524             formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)'
1525             formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
1526
1527         header_line = line({
1528             'format_id': 'format code', 'ext': 'extension',
1529             'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
1530         self.to_screen(
1531             '[info] Available formats for %s:\n%s\n%s' %
1532             (info_dict['id'], header_line, '\n'.join(formats_s)))
1533
1534     def list_thumbnails(self, info_dict):
1535         thumbnails = info_dict.get('thumbnails')
1536         if not thumbnails:
1537             tn_url = info_dict.get('thumbnail')
1538             if tn_url:
1539                 thumbnails = [{'id': '0', 'url': tn_url}]
1540             else:
1541                 self.to_screen(
1542                     '[info] No thumbnails present for %s' % info_dict['id'])
1543                 return
1544
1545         self.to_screen(
1546             '[info] Thumbnails for %s:' % info_dict['id'])
1547         self.to_screen(render_table(
1548             ['ID', 'width', 'height', 'URL'],
1549             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1550
1551     def urlopen(self, req):
1552         """ Start an HTTP download """
1553
1554         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1555         # always respected by websites, some tend to give out URLs with non percent-encoded
1556         # non-ASCII characters (see telemb.py, ard.py [#3412])
1557         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1558         # To work around aforementioned issue we will replace request's original URL with
1559         # percent-encoded one
1560         req_is_string = isinstance(req, basestring if sys.version_info < (3, 0) else compat_str)
1561         url = req if req_is_string else req.get_full_url()
1562         url_escaped = escape_url(url)
1563
1564         # Substitute URL if any change after escaping
1565         if url != url_escaped:
1566             if req_is_string:
1567                 req = url_escaped
1568             else:
1569                 req = compat_urllib_request.Request(
1570                     url_escaped, data=req.data, headers=req.headers,
1571                     origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1572
1573         return self._opener.open(req, timeout=self._socket_timeout)
1574
1575     def print_debug_header(self):
1576         if not self.params.get('verbose'):
1577             return
1578
1579         if type('') is not compat_str:
1580             # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
1581             self.report_warning(
1582                 'Your Python is broken! Update to a newer and supported version')
1583
1584         stdout_encoding = getattr(
1585             sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
1586         encoding_str = (
1587             '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
1588                 locale.getpreferredencoding(),
1589                 sys.getfilesystemencoding(),
1590                 stdout_encoding,
1591                 self.get_encoding()))
1592         write_string(encoding_str, encoding=None)
1593
1594         self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
1595         try:
1596             sp = subprocess.Popen(
1597                 ['git', 'rev-parse', '--short', 'HEAD'],
1598                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1599                 cwd=os.path.dirname(os.path.abspath(__file__)))
1600             out, err = sp.communicate()
1601             out = out.decode().strip()
1602             if re.match('[0-9a-f]+', out):
1603                 self._write_string('[debug] Git HEAD: ' + out + '\n')
1604         except:
1605             try:
1606                 sys.exc_clear()
1607             except:
1608                 pass
1609         self._write_string('[debug] Python version %s - %s\n' % (
1610             platform.python_version(), platform_name()))
1611
1612         exe_versions = FFmpegPostProcessor.get_versions()
1613         exe_versions['rtmpdump'] = rtmpdump_version()
1614         exe_str = ', '.join(
1615             '%s %s' % (exe, v)
1616             for exe, v in sorted(exe_versions.items())
1617             if v
1618         )
1619         if not exe_str:
1620             exe_str = 'none'
1621         self._write_string('[debug] exe versions: %s\n' % exe_str)
1622
1623         proxy_map = {}
1624         for handler in self._opener.handlers:
1625             if hasattr(handler, 'proxies'):
1626                 proxy_map.update(handler.proxies)
1627         self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1628
1629         if self.params.get('call_home', False):
1630             ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
1631             self._write_string('[debug] Public IP address: %s\n' % ipaddr)
1632             latest_version = self.urlopen(
1633                 'https://yt-dl.org/latest/version').read().decode('utf-8')
1634             if version_tuple(latest_version) > version_tuple(__version__):
1635                 self.report_warning(
1636                     'You are using an outdated version (newest version: %s)! '
1637                     'See https://yt-dl.org/update if you need help updating.' %
1638                     latest_version)
1639
1640     def _setup_opener(self):
1641         timeout_val = self.params.get('socket_timeout')
1642         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
1643
1644         opts_cookiefile = self.params.get('cookiefile')
1645         opts_proxy = self.params.get('proxy')
1646
1647         if opts_cookiefile is None:
1648             self.cookiejar = compat_cookiejar.CookieJar()
1649         else:
1650             self.cookiejar = compat_cookiejar.MozillaCookieJar(
1651                 opts_cookiefile)
1652             if os.access(opts_cookiefile, os.R_OK):
1653                 self.cookiejar.load()
1654
1655         cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1656             self.cookiejar)
1657         if opts_proxy is not None:
1658             if opts_proxy == '':
1659                 proxies = {}
1660             else:
1661                 proxies = {'http': opts_proxy, 'https': opts_proxy}
1662         else:
1663             proxies = compat_urllib_request.getproxies()
1664             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1665             if 'http' in proxies and 'https' not in proxies:
1666                 proxies['https'] = proxies['http']
1667         proxy_handler = compat_urllib_request.ProxyHandler(proxies)
1668
1669         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1670         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
1671         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
1672         opener = compat_urllib_request.build_opener(
1673             https_handler, proxy_handler, cookie_processor, ydlh)
1674         # Delete the default user-agent header, which would otherwise apply in
1675         # cases where our custom HTTP handler doesn't come into play
1676         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1677         opener.addheaders = []
1678         self._opener = opener
1679
1680     def encode(self, s):
1681         if isinstance(s, bytes):
1682             return s  # Already encoded
1683
1684         try:
1685             return s.encode(self.get_encoding())
1686         except UnicodeEncodeError as err:
1687             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
1688             raise
1689
1690     def get_encoding(self):
1691         encoding = self.params.get('encoding')
1692         if encoding is None:
1693             encoding = preferredencoding()
1694         return encoding
1695
1696     def _write_thumbnails(self, info_dict, filename):
1697         if self.params.get('writethumbnail', False):
1698             thumbnails = info_dict.get('thumbnails')
1699             if thumbnails:
1700                 thumbnails = [thumbnails[-1]]
1701         elif self.params.get('write_all_thumbnails', False):
1702             thumbnails = info_dict.get('thumbnails')
1703         else:
1704             return
1705
1706         if not thumbnails:
1707             # No thumbnails present, so return immediately
1708             return
1709
1710         for t in thumbnails:
1711             thumb_ext = determine_ext(t['url'], 'jpg')
1712             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
1713             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
1714             thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
1715
1716             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
1717                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
1718                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
1719             else:
1720                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
1721                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
1722                 try:
1723                     uf = self.urlopen(t['url'])
1724                     with open(thumb_filename, 'wb') as thumbf:
1725                         shutil.copyfileobj(uf, thumbf)
1726                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
1727                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
1728                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1729                     self.report_warning('Unable to download thumbnail "%s": %s' %
1730                                         (t['url'], compat_str(err)))