Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

author Sergey M․ <dstftw@gmail.com>

Sat, 17 Mar 2018 19:46:50 +0000 (02:46 +0700)

committer Sergey M․ <dstftw@gmail.com>

Sat, 17 Mar 2018 19:52:17 +0000 (02:52 +0700)
author Sergey M․ <dstftw@gmail.com>
Sat, 17 Mar 2018 19:46:50 +0000 (02:46 +0700)
committer Sergey M․ <dstftw@gmail.com>
Sat, 17 Mar 2018 19:52:17 +0000 (02:52 +0700)
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py

index a695ce64b3fbc9a8c9eca4562f8fc94ba2c742b5..4833396a521bf1d7a072db8ad425bed333235248 100644 (file)
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -698,40 +698,47 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
          _TEST_CASES = [
              (
                  'foo_xspf',
-                'https://example.org/src/',
+                'https://example.org/src/foo_xspf.xspf',
                  [{
+                    'id': 'foo_xspf',
+                    'title': 'Pandemonium',
                      'description': 'Visit http://bigbrother404.bandcamp.com',
                      'duration': 202.416,
-                    'formats': [{'url': 'https://example.org/src/cd1/track%201.mp3'}],
+                    'formats': [{
+                        'manifest_url': 'https://example.org/src/foo_xspf.xspf',
+                        'url': 'https://example.org/src/cd1/track%201.mp3',
+                    }],
+                }, {
                      'id': 'foo_xspf',
-                    'title': 'Pandemonium'
-                },
-                {
+                    'title': 'Final Cartridge (Nichico Twelve Remix)',
                      'description': 'Visit http://bigbrother404.bandcamp.com',
                      'duration': 255.857,
-                    'formats': [{'url': 'https://example.org/%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3'}],
+                    'formats': [{
+                        'manifest_url': 'https://example.org/src/foo_xspf.xspf',
+                        'url': 'https://example.org/%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3',
+                    }],
+                }, {
                      'id': 'foo_xspf',
-                    'title': 'Final Cartridge (Nichico Twelve Remix)'
-                },
-                {
+                    'title': 'Rebuilding Nightingale',
                      'description': 'Visit http://bigbrother404.bandcamp.com',
                      'duration': 287.915,
-                    'formats': [
-                        {'url': 'https://example.org/src/track3.mp3'},
-                        {'url': 'https://example.com/track3.mp3'}
-                    ],
-                    'id': 'foo_xspf',
-                    'title': 'Rebuilding Nightingale'
+                    'formats': [{
+                        'manifest_url': 'https://example.org/src/foo_xspf.xspf',
+                        'url': 'https://example.org/src/track3.mp3',
+                    }, {
+                        'manifest_url': 'https://example.org/src/foo_xspf.xspf',
+                        'url': 'https://example.com/track3.mp3',
+                    }]
                  }]
              ),
          ]
  
-        for xspf_file, xspf_base_url, expected_entries in _TEST_CASES:
+        for xspf_file, xspf_url, expected_entries in _TEST_CASES:
              with io.open('./test/testdata/xspf/%s.xspf' % xspf_file,
                           mode='r', encoding='utf-8') as f:
                  entries = self.ie._parse_xspf(
                      compat_etree_fromstring(f.read().encode('utf-8')),
-                        xspf_file, xspf_base_url)
+                    xspf_file, xspf_url=xspf_url, xspf_base_url=xspf_url)
                  expect_value(self, entries, expected_entries, None)
                  for i in range(len(entries)):
                      expect_dict(self, entries[i], expected_entries[i])
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index a507785097a3e0e336f8dd5d912fe0c90fa4b613..2e2a02948d1ad66753cb23729ce7fcf74d983556 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1706,22 +1706,24 @@ class InfoExtractor(object):
              })
          return subtitles
  
-    def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
          xspf = self._download_xml(
-            playlist_url, playlist_id, 'Downloading xpsf playlist',
+            xspf_url, playlist_id, 'Downloading xpsf playlist',
              'Unable to download xspf manifest', fatal=fatal)
          if xspf is False:
              return []
-        return self._parse_xspf(xspf, playlist_id, base_url(playlist_url))
+        return self._parse_xspf(
+            xspf, playlist_id, xspf_url=xspf_url,
+            xspf_base_url=base_url(xspf_url))
  
-    def _parse_xspf(self, playlist, playlist_id, playlist_base_url=''):
+    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
          NS_MAP = {
              'xspf': 'http://xspf.org/ns/0/',
              's1': 'http://static.streamone.nl/player/ns/0',
          }
  
          entries = []
-        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
              title = xpath_text(
                  track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
              description = xpath_text(
@@ -1731,12 +1733,18 @@ class InfoExtractor(object):
              duration = float_or_none(
                  xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
  
-            formats = [{
-                'url': urljoin(playlist_base_url, location.text),
-                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
-                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
-                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
-            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+            formats = []
+            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
+                format_url = urljoin(xspf_base_url, location.text)
+                if not format_url:
+                    continue
+                formats.append({
+                    'url': format_url,
+                    'manifest_url': xspf_url,
+                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+                })
              self._sort_formats(formats)
  
              entries.append({
@@ -1750,18 +1758,18 @@ class InfoExtractor(object):
          return entries
  
      def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
-        res = self._download_webpage_handle(
+        res = self._download_xml_handle(
              mpd_url, video_id,
              note=note or 'Downloading MPD manifest',
              errnote=errnote or 'Failed to download MPD manifest',
              fatal=fatal)
          if res is False:
              return []
-        mpd, urlh = res
+        mpd_doc, urlh = res
          mpd_base_url = base_url(urlh.geturl())
  
          return self._parse_mpd_formats(
-            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
+            mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
              formats_dict=formats_dict, mpd_url=mpd_url)
  
      def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
@@ -2035,17 +2043,16 @@ class InfoExtractor(object):
          return formats
  
      def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
-        res = self._download_webpage_handle(
+        res = self._download_xml_handle(
              ism_url, video_id,
              note=note or 'Downloading ISM manifest',
              errnote=errnote or 'Failed to download ISM manifest',
              fatal=fatal)
          if res is False:
              return []
-        ism, urlh = res
+        ism_doc, urlh = res
  
-        return self._parse_ism_formats(
-            compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
+        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
  
      def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
          """
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 023ccbc9bf108e5cf51c0afc7579905899956ae3..1cc491b19b35fccd0bef2886a3de20fd111c5f01 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -2233,7 +2233,9 @@ class GenericIE(InfoExtractor):
                  return smil
              elif doc.tag == '{http://xspf.org/ns/0/}playlist':
                  return self.playlist_result(
-                    self._parse_xspf(doc, video_id, compat_str(full_response.geturl())),
+                    self._parse_xspf(
+                        doc, video_id, xspf_url=url,
+                        xspf_base_url=compat_str(full_response.geturl())),
                      video_id)
              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
                  info_dict['formats'] = self._parse_mpd_formats(
author	Sergey M․ <dstftw@gmail.com>
	Sat, 17 Mar 2018 19:46:50 +0000 (02:46 +0700)
committer	Sergey M․ <dstftw@gmail.com>
	Sat, 17 Mar 2018 19:52:17 +0000 (02:52 +0700)
test/test_InfoExtractor.py		patch | blob | history
youtube_dl/extractor/common.py		patch | blob | history
youtube_dl/extractor/generic.py		patch | blob | history