Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 5b6a09c0b7f732ef5e9361775e42719af0deb8b7..2e2a02948d1ad66753cb23729ce7fcf74d983556 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -174,6 +174,8 @@ class InfoExtractor(object):
                                   width : height ratio as float.
                      * no_resume  The server does not support resuming the
                                   (HTTP or RTMP) download. Boolean.
+                    * downloader_options  A dictionary of downloader options as
+                                 described in FileDownloader
  
      url:            Final video URL.
      ext:            Video filename extension.
@@ -642,19 +644,31 @@ class InfoExtractor(object):
              content, _ = res
              return content
  
+    def _download_xml_handle(
+            self, url_or_request, video_id, note='Downloading XML',
+            errnote='Unable to download XML', transform_source=None,
+            fatal=True, encoding=None, data=None, headers={}, query={}):
+        """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
+        res = self._download_webpage_handle(
+            url_or_request, video_id, note, errnote, fatal=fatal,
+            encoding=encoding, data=data, headers=headers, query=query)
+        if res is False:
+            return res
+        xml_string, urlh = res
+        return self._parse_xml(
+            xml_string, video_id, transform_source=transform_source,
+            fatal=fatal), urlh
+
      def _download_xml(self, url_or_request, video_id,
                        note='Downloading XML', errnote='Unable to download XML',
                        transform_source=None, fatal=True, encoding=None,
                        data=None, headers={}, query={}):
          """Return the xml as an xml.etree.ElementTree.Element"""
-        xml_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal,
-            encoding=encoding, data=data, headers=headers, query=query)
-        if xml_string is False:
-            return xml_string
-        return self._parse_xml(
-            xml_string, video_id, transform_source=transform_source,
-            fatal=fatal)
+        res = self._download_xml_handle(
+            url_or_request, video_id, note=note, errnote=errnote,
+            transform_source=transform_source, fatal=fatal, encoding=encoding,
+            data=data, headers=headers, query=query)
+        return res if res is False else res[0]
  
      def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
          if transform_source:
@@ -1027,7 +1041,7 @@ class InfoExtractor(object):
                      part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                      if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                          info['series'] = unescapeHTML(part_of_series.get('name'))
-                elif item_type == 'Article':
+                elif item_type in ('Article', 'NewsArticle'):
                      info.update({
                          'timestamp': parse_iso8601(e.get('datePublished')),
                          'title': unescapeHTML(e.get('headline')),
@@ -1692,22 +1706,24 @@ class InfoExtractor(object):
              })
          return subtitles
  
-    def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
          xspf = self._download_xml(
-            playlist_url, playlist_id, 'Downloading xpsf playlist',
+            xspf_url, playlist_id, 'Downloading xpsf playlist',
              'Unable to download xspf manifest', fatal=fatal)
          if xspf is False:
              return []
-        return self._parse_xspf(xspf, playlist_id)
+        return self._parse_xspf(
+            xspf, playlist_id, xspf_url=xspf_url,
+            xspf_base_url=base_url(xspf_url))
  
-    def _parse_xspf(self, playlist, playlist_id):
+    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
          NS_MAP = {
              'xspf': 'http://xspf.org/ns/0/',
              's1': 'http://static.streamone.nl/player/ns/0',
          }
  
          entries = []
-        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
              title = xpath_text(
                  track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
              description = xpath_text(
@@ -1717,12 +1733,18 @@ class InfoExtractor(object):
              duration = float_or_none(
                  xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
  
-            formats = [{
-                'url': location.text,
-                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
-                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
-                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
-            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+            formats = []
+            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
+                format_url = urljoin(xspf_base_url, location.text)
+                if not format_url:
+                    continue
+                formats.append({
+                    'url': format_url,
+                    'manifest_url': xspf_url,
+                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+                })
              self._sort_formats(formats)
  
              entries.append({
@@ -1736,18 +1758,18 @@ class InfoExtractor(object):
          return entries
  
      def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
-        res = self._download_webpage_handle(
+        res = self._download_xml_handle(
              mpd_url, video_id,
              note=note or 'Downloading MPD manifest',
              errnote=errnote or 'Failed to download MPD manifest',
              fatal=fatal)
          if res is False:
              return []
-        mpd, urlh = res
+        mpd_doc, urlh = res
          mpd_base_url = base_url(urlh.geturl())
  
          return self._parse_mpd_formats(
-            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
+            mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
              formats_dict=formats_dict, mpd_url=mpd_url)
  
      def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
@@ -2021,17 +2043,16 @@ class InfoExtractor(object):
          return formats
  
      def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
-        res = self._download_webpage_handle(
+        res = self._download_xml_handle(
              ism_url, video_id,
              note=note or 'Downloading ISM manifest',
              errnote=errnote or 'Failed to download ISM manifest',
              fatal=fatal)
          if res is False:
              return []
-        ism, urlh = res
+        ism_doc, urlh = res
  
-        return self._parse_ism_formats(
-            compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
+        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
  
      def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
          """
@@ -2248,9 +2269,10 @@ class InfoExtractor(object):
      def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
          query = compat_urlparse.urlparse(url).query
          url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
-        url_base = self._search_regex(
-            r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
-        http_base_url = '%s:%s' % ('http', url_base)
+        mobj = re.search(
+            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
+        url_base = mobj.group('url')
+        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
          formats = []
  
          def manifest_url(manifest):
@@ -2350,7 +2372,10 @@ class InfoExtractor(object):
                  for track in tracks:
                      if not isinstance(track, dict):
                          continue
-                    if track.get('kind') != 'captions':
+                    track_kind = track.get('kind')
+                    if not track_kind or not isinstance(track_kind, compat_str):
+                        continue
+                    if track_kind.lower() not in ('captions', 'subtitles'):
                          continue
                      track_url = urljoin(base_url, track.get('file'))
                      if not track_url:
@@ -2404,7 +2429,7 @@ class InfoExtractor(object):
                  formats.extend(self._extract_m3u8_formats(
                      source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                      m3u8_id=m3u8_id, fatal=False))
-            elif ext == 'mpd':
+            elif source_type == 'dash' or ext == 'mpd':
                  formats.extend(self._extract_mpd_formats(
                      source_url, video_id, mpd_id=mpd_id, fatal=False))
              elif ext == 'smil':