Merge remote-tracking branch 'dstftw/generic-webpage-unescape'
[youtube-dl] / youtube_dl / YoutubeDL.py
index 11f88f1280c2d82bea2fd5fbe1f5af2bd2459d1e..c5d08b0bbabb572c3711d1ae8119e7eeb7e40e71 100644 (file)
@@ -4,6 +4,7 @@
 from __future__ import absolute_import, unicode_literals
 
 import collections
+import datetime
 import errno
 import io
 import json
@@ -39,6 +40,7 @@ from .utils import (
     locked_file,
     make_HTTPS_handler,
     MaxDownloadsReached,
+    PagedList,
     PostProcessingError,
     platform_name,
     preferredencoding,
@@ -146,11 +148,16 @@ class YoutubeDL(object):
                        again.
     cookiefile:        File name where cookies should be read from and dumped to.
     nocheckcertificate:Do not verify SSL certificates
+    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
+                       At the moment, this is only supported by YouTube.
     proxy:             URL of the proxy server to use
     socket_timeout:    Time to wait for unresponsive hosts, in seconds
     bidi_workaround:   Work around buggy terminals without bidirectional text
                        support, using fribidi
     debug_printtraffic:Print out sent and received HTTP traffic
+    include_ads:       Download ads as well
+    default_search:    Prepend this string if an input url is not valid.
+                       'auto' for elaborate guessing
 
     The following parameters are not used by YoutubeDL itself, they are used by
     the FileDownloader:
@@ -327,7 +334,7 @@ class YoutubeDL(object):
 
     def __exit__(self, *args):
         self.restore_console_title()
-    
+
         if self.params.get('cookiefile') is not None:
             self.cookiejar.save()
 
@@ -366,12 +373,15 @@ class YoutubeDL(object):
         Print the message to stderr, it will be prefixed with 'WARNING:'
         If stderr is a tty file the 'WARNING:' will be colored
         '''
-        if self._err_file.isatty() and os.name != 'nt':
-            _msg_header = '\033[0;33mWARNING:\033[0m'
+        if self.params.get('logger') is not None:
+            self.params['logger'].warning(message)
         else:
-            _msg_header = 'WARNING:'
-        warning_message = '%s %s' % (_msg_header, message)
-        self.to_stderr(warning_message)
+            if self._err_file.isatty() and os.name != 'nt':
+                _msg_header = '\033[0;33mWARNING:\033[0m'
+            else:
+                _msg_header = 'WARNING:'
+            warning_message = '%s %s' % (_msg_header, message)
+            self.to_stderr(warning_message)
 
     def report_error(self, message, tb=None):
         '''
@@ -392,10 +402,6 @@ class YoutubeDL(object):
         except UnicodeEncodeError:
             self.to_screen('[download] The file has already been downloaded')
 
-    def increment_downloads(self):
-        """Increment the ordinal that assigns a number to each file."""
-        self._num_downloads += 1
-
     def prepare_filename(self, info_dict):
         """Generate the output filename."""
         try:
@@ -409,6 +415,13 @@ class YoutubeDL(object):
             template_dict['autonumber'] = autonumber_templ % self._num_downloads
             if template_dict.get('playlist_index') is not None:
                 template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
+            if template_dict.get('resolution') is None:
+                if template_dict.get('width') and template_dict.get('height'):
+                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
+                elif template_dict.get('height'):
+                    template_dict['resolution'] = '%sp' % template_dict['height']
+                elif template_dict.get('width'):
+                    template_dict['resolution'] = '?x%d' % template_dict['width']
 
             sanitize = lambda k, v: sanitize_filename(
                 compat_str(v),
@@ -513,6 +526,8 @@ class YoutubeDL(object):
             except ExtractorError as de: # An error we somewhat expected
                 self.report_error(compat_str(de), de.format_traceback())
                 break
+            except MaxDownloadsReached:
+                raise
             except Exception as e:
                 if self.params.get('ignoreerrors', False):
                     self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
@@ -520,7 +535,7 @@ class YoutubeDL(object):
                 else:
                     raise
         else:
-            self.report_error('no suitable InfoExtractor: %s' % url)
+            self.report_error('no suitable InfoExtractor for URL %s' % url)
 
     def process_ie_result(self, ie_result, download=True, extra_info={}):
         """
@@ -575,19 +590,27 @@ class YoutubeDL(object):
 
             playlist_results = []
 
-            n_all_entries = len(ie_result['entries'])
             playliststart = self.params.get('playliststart', 1) - 1
             playlistend = self.params.get('playlistend', None)
             # For backwards compatibility, interpret -1 as whole list
             if playlistend == -1:
                 playlistend = None
 
-            entries = ie_result['entries'][playliststart:playlistend]
-            n_entries = len(entries)
-
-            self.to_screen(
-                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
-                (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            if isinstance(ie_result['entries'], list):
+                n_all_entries = len(ie_result['entries'])
+                entries = ie_result['entries'][playliststart:playlistend]
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
+                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            else:
+                assert isinstance(ie_result['entries'], PagedList)
+                entries = ie_result['entries'].getslice(
+                    playliststart, playlistend)
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Downloading %d videos" %
+                    (ie_result['extractor'], playlist, n_entries))
 
             for i, entry in enumerate(entries, 1):
                 self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
@@ -634,6 +657,30 @@ class YoutubeDL(object):
             return available_formats[-1]
         elif format_spec == 'worst':
             return available_formats[0]
+        elif format_spec == 'bestaudio':
+            audio_formats = [
+                f for f in available_formats
+                if f.get('vcodec') == 'none']
+            if audio_formats:
+                return audio_formats[-1]
+        elif format_spec == 'worstaudio':
+            audio_formats = [
+                f for f in available_formats
+                if f.get('vcodec') == 'none']
+            if audio_formats:
+                return audio_formats[0]
+        elif format_spec == 'bestvideo':
+            video_formats = [
+                f for f in available_formats
+                if f.get('acodec') == 'none']
+            if video_formats:
+                return video_formats[-1]
+        elif format_spec == 'worstvideo':
+            video_formats = [
+                f for f in available_formats
+                if f.get('acodec') == 'none']
+            if video_formats:
+                return video_formats[0]
         else:
             extensions = ['mp4', 'flv', 'webm', '3gp']
             if format_spec in extensions:
@@ -653,6 +700,14 @@ class YoutubeDL(object):
             info_dict['playlist'] = None
             info_dict['playlist_index'] = None
 
+        if 'display_id' not in info_dict and 'id' in info_dict:
+            info_dict['display_id'] = info_dict['id']
+
+        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
+            upload_date = datetime.datetime.utcfromtimestamp(
+                info_dict['timestamp'])
+            info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+
         # These extractors handle format selection themselves
         if info_dict['extractor'] in ['Youku']:
             if download:
@@ -666,8 +721,11 @@ class YoutubeDL(object):
         else:
             formats = info_dict['formats']
 
+        if not formats:
+            raise ExtractorError('No video formats found!')
+
         # We check that all the formats have the format and format_id fields
-        for (i, format) in enumerate(formats):
+        for i, format in enumerate(formats):
             if format.get('format_id') is None:
                 format['format_id'] = compat_str(i)
             if format.get('format') is None:
@@ -688,17 +746,17 @@ class YoutubeDL(object):
 
         # TODO Central sorting goes here
 
-        if formats[0] is not info_dict: 
+        if formats[0] is not info_dict:
             # only set the 'formats' fields if the original info_dict list them
             # otherwise we end up with a circular reference, the first (and unique)
-            # element in the 'formats' field in info_dict is info_dict itself, 
+            # element in the 'formats' field in info_dict is info_dict itself,
             # which can't be exported to json
             info_dict['formats'] = formats
         if self.params.get('listformats', None):
             self.list_formats(info_dict)
             return
 
-        req_format = self.params.get('format', 'best')
+        req_format = self.params.get('format')
         if req_format is None:
             req_format = 'best'
         formats_to_download = []
@@ -747,8 +805,11 @@ class YoutubeDL(object):
         """Process a single resolved IE result."""
 
         assert info_dict.get('_type', 'video') == 'video'
-        #We increment the download the download count here to match the previous behaviour.
-        self.increment_downloads()
+
+        max_downloads = self.params.get('max_downloads')
+        if max_downloads is not None:
+            if self._num_downloads >= int(max_downloads):
+                raise MaxDownloadsReached()
 
         info_dict['fulltitle'] = info_dict['title']
         if len(info_dict['title']) > 200:
@@ -765,10 +826,7 @@ class YoutubeDL(object):
             self.to_screen('[download] ' + reason)
             return
 
-        max_downloads = self.params.get('max_downloads')
-        if max_downloads is not None:
-            if self._num_downloads > int(max_downloads):
-                raise MaxDownloadsReached()
+        self._num_downloads += 1
 
         filename = self.prepare_filename(info_dict)
 
@@ -886,7 +944,7 @@ class YoutubeDL(object):
                     self.to_screen('[%s] %s: Downloading thumbnail ...' %
                                    (info_dict['extractor'], info_dict['id']))
                     try:
-                        uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
+                        uf = self.urlopen(info_dict['thumbnail'])
                         with open(thumb_filename, 'wb') as thumbf:
                             shutil.copyfileobj(uf, thumbf)
                         self.to_screen('[%s] %s: Writing thumbnail to: %s' %
@@ -1072,9 +1130,15 @@ class YoutubeDL(object):
                 res += fdict['format_note'] + ' '
             if fdict.get('tbr') is not None:
                 res += '%4dk ' % fdict['tbr']
+            if fdict.get('container') is not None:
+                if res:
+                    res += ', '
+                res += '%s container' % fdict['container']
             if (fdict.get('vcodec') is not None and
                     fdict.get('vcodec') != 'none'):
-                res += '%-5s' % fdict['vcodec']
+                if res:
+                    res += ', '
+                res += fdict['vcodec']
                 if fdict.get('vbr') is not None:
                     res += '@'
             elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
@@ -1084,7 +1148,10 @@ class YoutubeDL(object):
             if fdict.get('acodec') is not None:
                 if res:
                     res += ', '
-                res += '%-5s' % fdict['acodec']
+                if fdict['acodec'] == 'none':
+                    res += 'video only'
+                else:
+                    res += '%-5s' % fdict['acodec']
             elif fdict.get('abr') is not None:
                 if res:
                     res += ', '
@@ -1123,7 +1190,7 @@ class YoutubeDL(object):
 
     def urlopen(self, req):
         """ Start an HTTP download """
-        return self._opener.open(req)
+        return self._opener.open(req, timeout=self._socket_timeout)
 
     def print_debug_header(self):
         if not self.params.get('verbose'):
@@ -1154,7 +1221,7 @@ class YoutubeDL(object):
 
     def _setup_opener(self):
         timeout_val = self.params.get('socket_timeout')
-        timeout = 600 if timeout_val is None else float(timeout_val)
+        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
 
         opts_cookiefile = self.params.get('cookiefile')
         opts_proxy = self.params.get('proxy')
@@ -1192,7 +1259,3 @@ class YoutubeDL(object):
         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
         opener.addheaders = []
         self._opener = opener
-
-        # TODO remove this global modification
-        compat_urllib_request.install_opener(opener)
-        socket.setdefaulttimeout(timeout)