Simplify formats accumulation for f4m/m3u8/smil formats

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 5e263f8b5a2cf46fbb26e928f5df85c87c42dfde..65520744799013fbaa756d171cb10bff8a3e693e 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -18,8 +18,6 @@ from ..compat import (
      compat_http_client,
      compat_urllib_error,
      compat_urllib_parse,
-    compat_urllib_parse_urlparse,
-    compat_urllib_request,
      compat_urlparse,
      compat_str,
      compat_etree_fromstring,
@@ -31,17 +29,20 @@ from ..utils import (
      clean_html,
      compiled_regex_type,
      determine_ext,
+    error_to_compat_str,
      ExtractorError,
      fix_xml_ampersands,
      float_or_none,
      int_or_none,
      RegexNotFoundError,
      sanitize_filename,
+    sanitized_Request,
      unescapeHTML,
      unified_strdate,
      url_basename,
      xpath_text,
      xpath_with_ns,
+    determine_protocol,
  )
  
  
@@ -167,7 +168,7 @@ class InfoExtractor(object):
                      "ext" will be calculated from URL if missing
      automatic_captions: Like 'subtitles', used by the YoutubeIE for
                      automatically generated captions
-    duration:       Length of the video in seconds, as an integer.
+    duration:       Length of the video in seconds, as an integer or float.
      view_count:     How many users have watched the video on the platform.
      like_count:     Number of positive ratings of the video
      dislike_count:  Number of negative ratings of the video
@@ -332,7 +333,8 @@ class InfoExtractor(object):
                  return False
              if errnote is None:
                  errnote = 'Unable to download webpage'
-            errmsg = '%s: %s' % (errnote, compat_str(err))
+
+            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
              if fatal:
                  raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
              else:
@@ -622,7 +624,7 @@ class InfoExtractor(object):
                  else:
                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
              except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
+                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
  
          return (username, password)
  
@@ -776,14 +778,12 @@ class InfoExtractor(object):
  
              preference = f.get('preference')
              if preference is None:
-                proto = f.get('protocol')
-                if proto is None:
-                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
-
-                preference = 0 if proto in ['http', 'https'] else -0.1
+                preference = 0
                  if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                      preference -= 0.5
  
+            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
+
              if f.get('vcodec') == 'none':  # audio only
                  if self._downloader.params.get('prefer_free_formats'):
                      ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
@@ -814,6 +814,7 @@ class InfoExtractor(object):
                  f.get('vbr') if f.get('vbr') is not None else -1,
                  f.get('height') if f.get('height') is not None else -1,
                  f.get('width') if f.get('width') is not None else -1,
+                proto_preference,
                  ext_preference,
                  f.get('abr') if f.get('abr') is not None else -1,
                  audio_ext_preference,
@@ -883,7 +884,7 @@ class InfoExtractor(object):
              fatal=fatal)
  
          if manifest is False:
-            return manifest
+            return []
  
          formats = []
          manifest_version = '1.0'
@@ -891,6 +892,11 @@ class InfoExtractor(object):
          if not media_nodes:
              manifest_version = '2.0'
              media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+        base_url = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+            'base URL', default=None)
+        if base_url:
+            base_url = base_url.strip()
          for i, media_el in enumerate(media_nodes):
              if manifest_version == '2.0':
                  media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
@@ -898,16 +904,14 @@ class InfoExtractor(object):
                      continue
                  manifest_url = (
                      media_url if media_url.startswith('http://') or media_url.startswith('https://')
-                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                  # If media_url is itself a f4m manifest do the recursive extraction
                  # since bitrates in parent manifest (this one) and media_url manifest
                  # may differ leading to inability to resolve the format by requested
                  # bitrate in f4m downloader
                  if determine_ext(manifest_url) == 'f4m':
-                    f4m_formats = self._extract_f4m_formats(
-                        manifest_url, video_id, preference, f4m_id, fatal=fatal)
-                    if f4m_formats:
-                        formats.extend(f4m_formats)
+                    formats.extend(self._extract_f4m_formats(
+                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
                      continue
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              formats.append({
@@ -949,7 +953,7 @@ class InfoExtractor(object):
              errnote=errnote or 'Failed to download m3u8 information',
              fatal=fatal)
          if res is False:
-            return res
+            return []
          m3u8_doc, urlh = res
          m3u8_url = urlh.geturl()
          last_info = None
@@ -1141,10 +1145,8 @@ class InfoExtractor(object):
              src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
  
              if proto == 'm3u8' or src_ext == 'm3u8':
-                m3u8_formats = self._extract_m3u8_formats(
-                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
-                if m3u8_formats:
-                    formats.extend(m3u8_formats)
+                formats.extend(self._extract_m3u8_formats(
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False))
                  continue
  
              if src_ext == 'f4m':
@@ -1156,9 +1158,7 @@ class InfoExtractor(object):
                      }
                  f4m_url += '&' if '?' in f4m_url else '?'
                  f4m_url += compat_urllib_parse.urlencode(f4m_params)
-                f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
-                if f4m_formats:
-                    formats.extend(f4m_formats)
+                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                  continue
  
              if src_url.startswith('http') and self._is_valid_url(src, video_id):
@@ -1280,7 +1280,7 @@ class InfoExtractor(object):
  
      def _get_cookies(self, url):
          """ Return a compat_cookies.SimpleCookie with the cookies for the url """
-        req = compat_urllib_request.Request(url)
+        req = sanitized_Request(url)
          self._downloader.cookiejar.add_cookie_header(req)
          return compat_cookies.SimpleCookie(req.get_header('Cookie'))