[YoutubeDL] only add normal subtitles to the 'requested_subtitles' field if 'writesub...
[youtube-dl] / youtube_dl / YoutubeDL.py
index 633e3d8a141e2cd249a69da2137709f2f1858040..7319323e59ae34dbcfc5c3101d32fddb1ff46319 100755 (executable)
@@ -154,7 +154,7 @@ class YoutubeDL(object):
     allsubtitles:      Downloads all the subtitles of the video
                        (requires writesubtitles or writeautomaticsub)
     listsubtitles:     Lists all available subtitles for the video
-    subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
+    subtitlesformat:   The format code for subtitles
     subtitleslangs:    List of languages of the subtitles to download
     keepvideo:         Keep the video file after post-processing
     daterange:         A DateRange object, download only if the upload_date is in the range.
@@ -228,6 +228,12 @@ class YoutubeDL(object):
     external_downloader:  Executable of the external downloader to call.
     listformats:       Print an overview of available video formats and exit.
     list_thumbnails:   Print a table of all thumbnails and exit.
+    match_filter:      A function that gets called with the info_dict of
+                       every video.
+                       If it returns a message, the video is ignored.
+                       If it returns None, the video is downloaded.
+                       match_filter_func in utils.py is one example for this.
+    no_color:          Do not emit color codes in output.
 
 
     The following parameters are not used by YoutubeDL itself, they are used by
@@ -485,7 +491,7 @@ class YoutubeDL(object):
         else:
             if self.params.get('no_warnings'):
                 return
-            if self._err_file.isatty() and os.name != 'nt':
+            if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
                 _msg_header = '\033[0;33mWARNING:\033[0m'
             else:
                 _msg_header = 'WARNING:'
@@ -497,7 +503,7 @@ class YoutubeDL(object):
         Do the same as trouble, but prefixes the message with 'ERROR:', colored
         in red if stderr is a tty file.
         '''
-        if self._err_file.isatty() and os.name != 'nt':
+        if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
             _msg_header = '\033[0;31mERROR:\033[0m'
         else:
             _msg_header = 'ERROR:'
@@ -554,7 +560,7 @@ class YoutubeDL(object):
             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
             return None
 
-    def _match_entry(self, info_dict):
+    def _match_entry(self, info_dict, incomplete):
         """ Returns None iff the file should be downloaded """
 
         video_title = info_dict.get('title', info_dict.get('id', 'video'))
@@ -583,9 +589,17 @@ class YoutubeDL(object):
             if max_views is not None and view_count > max_views:
                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
-            return 'Skipping "%s" because it is age restricted' % title
+            return 'Skipping "%s" because it is age restricted' % video_title
         if self.in_download_archive(info_dict):
             return '%s has already been recorded in archive' % video_title
+
+        if not incomplete:
+            match_filter = self.params.get('match_filter')
+            if match_filter is not None:
+                ret = match_filter(info_dict)
+                if ret is not None:
+                    return ret
+
         return None
 
     @staticmethod
@@ -779,7 +793,7 @@ class YoutubeDL(object):
                     'extractor_key': ie_result['extractor_key'],
                 }
 
-                reason = self._match_entry(entry)
+                reason = self._match_entry(entry, incomplete=True)
                 if reason is not None:
                     self.to_screen('[download] ' + reason)
                     continue
@@ -826,26 +840,43 @@ class YoutubeDL(object):
             '!=': operator.ne,
         }
         operator_rex = re.compile(r'''(?x)\s*\[
-            (?P<key>width|height|tbr|abr|vbr|filesize|fps)
+            (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
             \]$
             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
         m = operator_rex.search(format_spec)
+        if m:
+            try:
+                comparison_value = int(m.group('value'))
+            except ValueError:
+                comparison_value = parse_filesize(m.group('value'))
+                if comparison_value is None:
+                    comparison_value = parse_filesize(m.group('value') + 'B')
+                if comparison_value is None:
+                    raise ValueError(
+                        'Invalid value %r in format specification %r' % (
+                            m.group('value'), format_spec))
+            op = OPERATORS[m.group('op')]
+
         if not m:
-            raise ValueError('Invalid format specification %r' % format_spec)
+            STR_OPERATORS = {
+                '=': operator.eq,
+                '!=': operator.ne,
+            }
+            str_operator_rex = re.compile(r'''(?x)\s*\[
+                \s*(?P<key>ext|acodec|vcodec|container|protocol)
+                \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
+                \s*(?P<value>[a-zA-Z0-9_-]+)
+                \s*\]$
+                ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
+            m = str_operator_rex.search(format_spec)
+            if m:
+                comparison_value = m.group('value')
+                op = STR_OPERATORS[m.group('op')]
 
-        try:
-            comparison_value = int(m.group('value'))
-        except ValueError:
-            comparison_value = parse_filesize(m.group('value'))
-            if comparison_value is None:
-                comparison_value = parse_filesize(m.group('value') + 'B')
-            if comparison_value is None:
-                raise ValueError(
-                    'Invalid value %r in format specification %r' % (
-                        m.group('value'), format_spec))
-        op = OPERATORS[m.group('op')]
+        if not m:
+            raise ValueError('Invalid format specification %r' % format_spec)
 
         def _filter(f):
             actual_value = f.get(m.group('key'))
@@ -988,6 +1019,15 @@ class YoutubeDL(object):
                 info_dict['timestamp'])
             info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
 
+        if self.params.get('listsubtitles', False):
+            if 'automatic_captions' in info_dict:
+                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
+            self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
+            return
+        info_dict['requested_subtitles'] = self.process_subtitles(
+            info_dict['id'], info_dict.get('subtitles'),
+            info_dict.get('automatic_captions'))
+
         # This extractors handle format selection themselves
         if info_dict['extractor'] in ['Youku']:
             if download:
@@ -1116,6 +1156,63 @@ class YoutubeDL(object):
         info_dict.update(formats_to_download[-1])
         return info_dict
 
+    def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
+        """Select the requested subtitles and their format"""
+        available_subs = {}
+        if normal_subtitles and self.params.get('writesubtitles'):
+            available_subs.update(normal_subtitles)
+        if automatic_captions and self.params.get('writeautomaticsub'):
+            for lang, cap_info in automatic_captions.items():
+                if lang not in available_subs:
+                    available_subs[lang] = cap_info
+
+        if (not self.params.get('writesubtitles') and not
+                self.params.get('writeautomaticsub') or not
+                available_subs):
+            return None
+
+        if self.params.get('allsubtitles', False):
+            requested_langs = available_subs.keys()
+        else:
+            if self.params.get('subtitleslangs', False):
+                requested_langs = self.params.get('subtitleslangs')
+            elif 'en' in available_subs:
+                requested_langs = ['en']
+            else:
+                requested_langs = [list(available_subs.keys())[0]]
+
+        formats_query = self.params.get('subtitlesformat', 'best')
+        formats_preference = formats_query.split('/') if formats_query else []
+        subs = {}
+        for lang in requested_langs:
+            formats = available_subs.get(lang)
+            if formats is None:
+                self.report_warning('%s subtitles not available for %s' % (lang, video_id))
+                continue
+            if isinstance(formats, compat_str):
+                # TODO: convert all IE with subtitles support to the new format
+                # and remove this
+                subs[lang] = {
+                    'ext': formats_preference[0],
+                    'data': formats,
+                }
+                continue
+            for ext in formats_preference:
+                if ext == 'best':
+                    f = formats[-1]
+                    break
+                matches = list(filter(lambda f: f['ext'] == ext, formats))
+                if matches:
+                    f = matches[-1]
+                    break
+            else:
+                f = formats[-1]
+                self.report_warning(
+                    'No subtitle format found matching "%s" for language %s, '
+                    'using %s' % (formats_query, lang, f['ext']))
+            subs[lang] = f
+        return subs
+
     def process_info(self, info_dict):
         """Process a single resolved IE result."""
 
@@ -1136,7 +1233,7 @@ class YoutubeDL(object):
         if 'format' not in info_dict:
             info_dict['format'] = info_dict['ext']
 
-        reason = self._match_entry(info_dict)
+        reason = self._match_entry(info_dict, incomplete=False)
         if reason is not None:
             self.to_screen('[download] ' + reason)
             return
@@ -1218,15 +1315,22 @@ class YoutubeDL(object):
         subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                        self.params.get('writeautomaticsub')])
 
-        if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
+        if subtitles_are_requested and info_dict.get('requested_subtitles'):
             # subtitles download errors are already managed as troubles in relevant IE
             # that way it will silently go on when used with unsupporting IE
-            subtitles = info_dict['subtitles']
-            sub_format = self.params.get('subtitlesformat', 'srt')
-            for sub_lang in subtitles.keys():
-                sub = subtitles[sub_lang]
-                if sub is None:
-                    continue
+            subtitles = info_dict['requested_subtitles']
+            for sub_lang, sub_info in subtitles.items():
+                sub_format = sub_info['ext']
+                if sub_info.get('data') is not None:
+                    sub_data = sub_info['data']
+                else:
+                    try:
+                        uf = self.urlopen(sub_info['url'])
+                        sub_data = uf.read().decode('utf-8')
+                    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+                        self.report_warning('Unable to download subtitle for "%s": %s' %
+                                            (sub_lang, compat_str(err)))
+                        continue
                 try:
                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
@@ -1234,7 +1338,7 @@ class YoutubeDL(object):
                     else:
                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
-                            subfile.write(sub)
+                            subfile.write(sub_data)
                 except (OSError, IOError):
                     self.report_error('Cannot write subtitles file ' + sub_filename)
                     return
@@ -1529,7 +1633,6 @@ class YoutubeDL(object):
             line(f, idlen) for f in formats
             if f.get('preference') is None or f['preference'] >= -1000]
         if len(formats) > 1:
-            formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)'
             formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
 
         header_line = line({
@@ -1556,6 +1659,17 @@ class YoutubeDL(object):
             ['ID', 'width', 'height', 'URL'],
             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
 
+    def list_subtitles(self, video_id, subtitles, name='subtitles'):
+        if not subtitles:
+            self.to_screen('%s has no %s' % (video_id, name))
+            return
+        self.to_screen(
+            'Available %s for %s:' % (name, video_id))
+        self.to_screen(render_table(
+            ['Language', 'formats'],
+            [[lang, ', '.join(f['ext'] for f in reversed(formats))]
+                for lang, formats in subtitles.items()]))
+
     def urlopen(self, req):
         """ Start an HTTP download """