Bubble up all the stack of exceptions and retry download tests on timeout errors
[youtube-dl] / youtube_dl / FileDownloader.py
index a861086c3c8a70fa072100d9b83fe8a4ceec397d..a13a5f9d79cab1a0cf7da5c5ac6a640ad26016cb 100644 (file)
@@ -81,6 +81,9 @@ class FileDownloader(object):
     writesubtitles:    Write the video subtitles to a .srt file
     subtitleslang:     Language of the subtitles to download
     test:              Download only first bytes to test the downloader.
+    keepvideo:         Keep the video file after post-processing
+    min_filesize:      Skip files smaller than this size (in bytes)
+    max_filesize:      Skip files larger than this size (in bytes)
     """
 
     params = None
@@ -94,13 +97,14 @@ class FileDownloader(object):
         """Create a FileDownloader object with the given options."""
         self._ies = []
         self._pps = []
+        self._progress_hooks = []
         self._download_retcode = 0
         self._num_downloads = 0
         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
         self.params = params
 
         if '%(stitle)s' in self.params['outtmpl']:
-            self.to_stderr(u'WARNING: %(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
+            self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
 
     @staticmethod
     def format_bytes(bytes):
@@ -204,27 +208,54 @@ class FileDownloader(object):
             # already of type unicode()
             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
         elif 'TERM' in os.environ:
-            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
+            self.to_screen('\033]0;%s\007' % message, skip_eol=True)
 
     def fixed_template(self):
         """Checks if the output template is fixed."""
         return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None)
 
-    def trouble(self, message=None):
+    def trouble(self, message=None, tb=None):
         """Determine action to take when a download problem appears.
 
         Depending on if the downloader has been configured to ignore
         download errors or not, this method may throw an exception or
         not when errors are found, after printing the message.
+
+        tb, if given, is additional traceback information.
         """
         if message is not None:
             self.to_stderr(message)
         if self.params.get('verbose'):
-            self.to_stderr(u''.join(traceback.format_list(traceback.extract_stack())))
+            if tb is None:
+                if sys.exc_info()[0]:  # if .trouble has been called from an except block
+                    tb = u''
+                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
+                        tb += u''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
+                    tb += compat_str(traceback.format_exc())
+                else:
+                    tb_data = traceback.format_list(traceback.extract_stack())
+                    tb = u''.join(tb_data)
+            self.to_stderr(tb)
         if not self.params.get('ignoreerrors', False):
-            raise DownloadError(message)
+            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
+                exc_info = sys.exc_info()[1].exc_info
+            else:
+                exc_info = sys.exc_info()
+            raise DownloadError(message, exc_info)
         self._download_retcode = 1
 
+    def report_warning(self, message):
+        '''
+        Print the message to stderr, prefixed with 'WARNING:'.
+        If stderr is a tty, the 'WARNING:' prefix is colored.
+        '''
+        if sys.stderr.isatty():
+            _msg_header=u'\033[0;33mWARNING:\033[0m'
+        else:
+            _msg_header=u'WARNING:'
+        warning_message=u'%s %s' % (_msg_header,message)
+        self.to_stderr(warning_message)
+
     def slow_down(self, start_time, byte_counter):
         """Sleep if the download speed is over the rate limit."""
         rate_limit = self.params.get('ratelimit', None)
@@ -296,7 +327,11 @@ class FileDownloader(object):
         """Report download progress."""
         if self.params.get('noprogress', False):
             return
-        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
+        if self.params.get('progress_with_newline', False):
+            self.to_screen(u'[download] %s of %s at %s ETA %s' %
+                (percent_str, data_len_str, speed_str, eta_str))
+        else:
+            self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
         self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
@@ -347,8 +382,11 @@ class FileDownloader(object):
 
             filename = self.params['outtmpl'] % template_dict
             return filename
-        except (ValueError, KeyError) as err:
-            self.trouble(u'ERROR: invalid system charset or erroneous output template')
+        except KeyError as err:
+            self.trouble(u'ERROR: Erroneous output template')
+            return None
+        except ValueError as err:
+            self.trouble(u'ERROR: Insufficient system charset ' + repr(preferredencoding()))
             return None
 
     def _match_entry(self, info_dict):
@@ -357,12 +395,10 @@ class FileDownloader(object):
         title = info_dict['title']
         matchtitle = self.params.get('matchtitle', False)
         if matchtitle:
-            matchtitle = matchtitle.decode('utf8')
             if not re.search(matchtitle, title, re.IGNORECASE):
                 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
         rejecttitle = self.params.get('rejecttitle', False)
         if rejecttitle:
-            rejecttitle = rejecttitle.decode('utf8')
             if re.search(rejecttitle, title, re.IGNORECASE):
                 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
         return None
@@ -433,11 +469,8 @@ class FileDownloader(object):
             try:
                 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
                 self.report_writesubtitles(srtfn)
-                srtfile = open(encodeFilename(srtfn), 'wb')
-                try:
-                    srtfile.write(info_dict['subtitles'].encode('utf-8'))
-                finally:
-                    srtfile.close()
+                with io.open(encodeFilename(srtfn), 'w', encoding='utf-8') as srtfile:
+                    srtfile.write(info_dict['subtitles'])
             except (OSError, IOError):
                 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
                 return
@@ -488,14 +521,28 @@ class FileDownloader(object):
 
                 # Warn if the _WORKING attribute is False
                 if not ie.working():
-                    self.trouble(u'WARNING: the program functionality for this site has been marked as broken, '
-                                 u'and will probably not work. If you want to go on, use the -i option.')
+                    self.report_warning(u'the program functionality for this site has been marked as broken, '
+                                        u'and will probably not work. If you want to go on, use the -i option.')
 
                 # Suitable InfoExtractor found
                 suitable_found = True
 
                 # Extract information from URL and process it
-                videos = ie.extract(url)
+                try:
+                    videos = ie.extract(url)
+                except ExtractorError as de: # An error we somewhat expected
+                    self.trouble(u'ERROR: ' + compat_str(de), de.format_traceback())
+                    break
+                except Exception as e:
+                    if self.params.get('ignoreerrors', False):
+                        self.trouble(u'ERROR: ' + compat_str(e), tb=compat_str(traceback.format_exc()))
+                        break
+                    else:
+                        raise
+
+                if len(videos or []) > 1 and self.fixed_template():
+                    raise SameFileError(self.params['outtmpl'])
+
                 for video in videos or []:
                     video['extractor'] = ie.IE_NAME
                     try:
@@ -513,21 +560,35 @@ class FileDownloader(object):
         return self._download_retcode
 
     def post_process(self, filename, ie_info):
-        """Run the postprocessing chain on the given file."""
+        """Run all the postprocessors on the given file."""
         info = dict(ie_info)
         info['filepath'] = filename
+        keep_video = None
         for pp in self._pps:
-            info = pp.run(info)
-            if info is None:
-                break
+            try:
+                keep_video_wish,new_info = pp.run(info)
+                if keep_video_wish is not None:
+                    if keep_video_wish:
+                        keep_video = keep_video_wish
+                    elif keep_video is None:
+                        # No clear decision yet, adopt this postprocessor's wish
+                        keep_video = keep_video_wish
+            except PostProcessingError as e:
+                self.to_stderr(u'ERROR: ' + e.msg)
+        if keep_video is False and not self.params.get('keepvideo', False):
+            try:
+                self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
+                os.remove(encodeFilename(filename))
+            except (IOError, OSError):
+                self.report_warning(u'Unable to remove downloaded video file')
 
-    def _download_with_rtmpdump(self, filename, url, player_url):
+    def _download_with_rtmpdump(self, filename, url, player_url, page_url):
         self.report_destination(filename)
         tmpfilename = self.temp_name(filename)
 
         # Check for rtmpdump first
         try:
-            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
+            subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
         except (OSError, IOError):
             self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
             return False
@@ -535,7 +596,11 @@ class FileDownloader(object):
         # Download using rtmpdump. rtmpdump returns exit code 2 when
         # the connection was interrumpted and resuming appears to be
         # possible. This is part of rtmpdump's normal usage, AFAIK.
-        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
+        basic_args = ['rtmpdump', '-q', '-r', url, '-o', tmpfilename]
+        if player_url is not None:
+            basic_args += ['-W', player_url]
+        if page_url is not None:
+            basic_args += ['--pageUrl', page_url]
         args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
         if self.params.get('verbose', False):
             try:
@@ -559,8 +624,15 @@ class FileDownloader(object):
                 retval = 0
                 break
         if retval == 0:
-            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(encodeFilename(tmpfilename)))
+            fsize = os.path.getsize(encodeFilename(tmpfilename))
+            self.to_screen(u'\r[rtmpdump] %s bytes' % fsize)
             self.try_rename(tmpfilename, filename)
+            self._hook_progress({
+                'downloaded_bytes': fsize,
+                'total_bytes': fsize,
+                'filename': filename,
+                'status': 'finished',
+            })
             return True
         else:
             self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
@@ -568,22 +640,29 @@ class FileDownloader(object):
 
     def _do_download(self, filename, info_dict):
         url = info_dict['url']
-        player_url = info_dict.get('player_url', None)
 
         # Check file already present
         if self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False):
             self.report_file_already_downloaded(filename)
+            self._hook_progress({
+                'filename': filename,
+                'status': 'finished',
+            })
             return True
 
         # Attempt to download using rtmpdump
         if url.startswith('rtmp'):
-            return self._download_with_rtmpdump(filename, url, player_url)
+            return self._download_with_rtmpdump(filename, url,
+                                                info_dict.get('player_url', None),
+                                                info_dict.get('page_url', None))
 
         tmpfilename = self.temp_name(filename)
         stream = None
 
         # Do not include the Accept-Encoding header
         headers = {'Youtubedl-no-compression': 'True'}
+        if 'user_agent' in info_dict:
+            headers['Youtubedl-user-agent'] = info_dict['user_agent']
         basic_request = compat_urllib_request.Request(url, None, headers)
         request = compat_urllib_request.Request(url, None, headers)
 
@@ -640,6 +719,10 @@ class FileDownloader(object):
                             # the one in the hard drive.
                             self.report_file_already_downloaded(filename)
                             self.try_rename(tmpfilename, filename)
+                            self._hook_progress({
+                                'filename': filename,
+                                'status': 'finished',
+                            })
                             return True
                         else:
                             # The length does not match, we start the download over
@@ -658,6 +741,15 @@ class FileDownloader(object):
         data_len = data.info().get('Content-length', None)
         if data_len is not None:
             data_len = int(data_len) + resume_len
+            min_data_len = self.params.get("min_filesize", None)
+            max_data_len =  self.params.get("max_filesize", None)
+            if min_data_len is not None and data_len < min_data_len:
+                self.to_screen(u'\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
+                return False
+            if max_data_len is not None and data_len > max_data_len:
+                self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
+                return False
+
         data_len_str = self.format_bytes(data_len)
         byte_counter = 0 + resume_len
         block_size = self.params.get('buffersize', 1024)
@@ -698,6 +790,14 @@ class FileDownloader(object):
                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 
+            self._hook_progress({
+                'downloaded_bytes': byte_counter,
+                'total_bytes': data_len,
+                'tmpfilename': tmpfilename,
+                'filename': filename,
+                'status': 'downloading',
+            })
+
             # Apply rate limit
             self.slow_down(start, byte_counter - resume_len)
 
@@ -714,4 +814,31 @@ class FileDownloader(object):
         if self.params.get('updatetime', True):
             info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
 
+        self._hook_progress({
+            'downloaded_bytes': byte_counter,
+            'total_bytes': byte_counter,
+            'filename': filename,
+            'status': 'finished',
+        })
+
         return True
+
+    def _hook_progress(self, status):
+        for ph in self._progress_hooks:
+            ph(status)
+
+    def add_progress_hook(self, ph):
+        """ ph gets called on download progress with a dictionary containing the entries
+        * filename: The final filename
+        * status: One of "downloading" and "finished"
+
+        It can also have some of the following entries:
+
+        * downloaded_bytes: Bytes on disk
+        * total_bytes: Total bytes, None if unknown
+        * tmpfilename: The filename we're currently writing to
+
+        Hooks are guaranteed to be called at least once (with status "finished")
+        if the download is successful.
+        """
+        self._progress_hooks.append(ph)