[YoutubeDL] Skip malformed formats for better extraction robustness

author Sergey M․ <dstftw@gmail.com>

Fri, 23 Jun 2017 14:18:33 +0000 (21:18 +0700)

committer Sergey M․ <dstftw@gmail.com>

Fri, 23 Jun 2017 14:18:33 +0000 (21:18 +0700)
author Sergey M․ <dstftw@gmail.com>
Fri, 23 Jun 2017 14:18:33 +0000 (21:18 +0700)
committer Sergey M․ <dstftw@gmail.com>
Fri, 23 Jun 2017 14:18:33 +0000 (21:18 +0700)
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index c05103bb6082d4548db7d640955810de5a61d0d9..b3a6d4d3b080e2899d1dbb9868340bac39c368a7 100755 (executable)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -1448,17 +1448,25 @@ class YoutubeDL(object):
          if not formats:
              raise ExtractorError('No video formats found!')
  
+        def is_wellformed(f):
+            url = f.get('url')
+            valid_url = url and isinstance(url, compat_str)
+            if not valid_url:
+                self.report_warning(
+                    '"url" field is missing or empty - skipping format, '
+                    'there is an error in extractor')
+            return valid_url
+
+        # Filter out malformed formats for better extraction robustness
+        formats = list(filter(is_wellformed, formats))
+
          formats_dict = {}
  
          # We check that all the formats have the format and format_id fields
          for i, format in enumerate(formats):
-            if 'url' not in format:
-                raise ExtractorError('Missing "url" key in result (index %d)' % i)
-
              sanitize_string_field(format, 'format_id')
              sanitize_numeric_fields(format)
              format['url'] = sanitize_url(format['url'])
-
              if format.get('format_id') is None:
                  format['format_id'] = compat_str(i)
              else:
author	Sergey M․ <dstftw@gmail.com>
	Fri, 23 Jun 2017 14:18:33 +0000 (21:18 +0700)
committer	Sergey M․ <dstftw@gmail.com>
	Fri, 23 Jun 2017 14:18:33 +0000 (21:18 +0700)