[extractor/common] Return MPD manifest as format's url meta field (#20242)

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index af621b74b5eb7c0f2a0fe89be3ee8ecf42a1859b..1fa8048b833489bce7a42cba3374b806e5b0f9e9 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1058,7 +1058,7 @@ class InfoExtractor(object):
      @staticmethod
      def _og_regexes(prop):
          content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
-        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                         % {'prop': re.escape(prop)})
          template = r'<meta[^>]+?%s[^>]+?%s'
          return [
@@ -1249,7 +1249,10 @@ class InfoExtractor(object):
                          info['title'] = episode_name
                      part_of_season = e.get('partOfSeason')
                      if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
-                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
+                        info.update({
+                            'season': unescapeHTML(part_of_season.get('name')),
+                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
+                        })
                      part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                      if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                          info['series'] = unescapeHTML(part_of_series.get('name'))
@@ -1668,7 +1671,7 @@ class InfoExtractor(object):
              rendition = stream_group[0]
              return rendition.get('NAME') or stream_group_id
  
-        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF inorder to have the
+        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
          # chance to detect video only formats when EXT-X-STREAM-INF tags
          # precede EXT-X-MEDIA tags in HLS manifest such as [3].
          for line in m3u8_doc.splitlines():
@@ -2117,7 +2120,7 @@ class InfoExtractor(object):
                          bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                          f = {
                              'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
-                            'url': base_url,
+                            'url': mpd_url,
                              'manifest_url': mpd_url,
                              'ext': mimetype2ext(mime_type),
                              'width': int_or_none(representation_attrib.get('width')),
@@ -2630,7 +2633,7 @@ class InfoExtractor(object):
                  'id': this_video_id,
                  'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                  'description': video_data.get('description'),
-                'thumbnail': self._proto_relative_url(video_data.get('image')),
+                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                  'timestamp': int_or_none(video_data.get('pubdate')),
                  'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                  'subtitles': subtitles,
@@ -2657,12 +2660,9 @@ class InfoExtractor(object):
          for source in jwplayer_sources_data:
              if not isinstance(source, dict):
                  continue
-            source_url = self._proto_relative_url(source.get('file'))
-            if not source_url:
-                continue
-            if base_url:
-                source_url = compat_urlparse.urljoin(base_url, source_url)
-            if source_url in urls:
+            source_url = urljoin(
+                base_url, self._proto_relative_url(source.get('file')))
+            if not source_url or source_url in urls:
                  continue
              urls.append(source_url)
              source_type = source.get('type') or ''