[extractor/common] Mention podcast in series fields section

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index e637b33d51689756b569b752c55e63fe4503de26..6aff71c48e224458b0a6d2bc4226af86d6ebc128 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -21,6 +21,7 @@ from ..compat import (
      compat_os_name,
      compat_str,
      compat_urllib_error,
+    compat_urllib_parse_unquote,
      compat_urllib_parse_urlencode,
      compat_urllib_request,
      compat_urlparse,
@@ -86,9 +87,10 @@ class InfoExtractor(object):
                      from worst to best quality.
  
                      Potential fields:
-                    * url        Mandatory. The URL of the video file or URL of
-                                 the manifest file in case of fragmented media
-                                 (DASH, hls, hds).
+                    * url        Mandatory. The URL of the video file
+                    * manifest_url
+                                 The URL of the manifest file in case of
+                                 fragmented media (DASH, hls, hds)
                      * ext        Will be calculated from URL if missing
                      * format     A human-readable description of the format
                                   ("mp4 container with h264/opus").
@@ -233,7 +235,7 @@ class InfoExtractor(object):
      chapter_id:     Id of the chapter the video belongs to, as a unicode string.
  
      The following fields should only be used when the video is an episode of some
-    series or programme:
+    series, programme or podcast:
  
      series:         Title of the series or programme the video episode belongs to.
      season:         Title of the season the video episode belongs to.
@@ -1149,6 +1151,7 @@ class InfoExtractor(object):
              formats.append({
                  'format_id': format_id,
                  'url': manifest_url,
+                'manifest_url': manifest_url,
                  'ext': 'flv' if bootstrap_info is not None else None,
                  'tbr': tbr,
                  'width': width,
@@ -1254,9 +1257,11 @@ class InfoExtractor(object):
                  # format_id intact.
                  if not live:
                      format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
+                manifest_url = format_url(line.strip())
                  f = {
                      'format_id': '-'.join(format_id),
-                    'url': format_url(line.strip()),
+                    'url': manifest_url,
+                    'manifest_url': manifest_url,
                      'tbr': tbr,
                      'ext': ext,
                      'fps': float_or_none(last_info.get('FRAME-RATE')),
@@ -1528,9 +1533,10 @@ class InfoExtractor(object):
          mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
  
          return self._parse_mpd_formats(
-            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
+            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
+            formats_dict=formats_dict, mpd_url=mpd_url)
  
-    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
+    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
          """
          Parse formats from MPD manifest.
          References:
@@ -1654,6 +1660,7 @@ class InfoExtractor(object):
                          f = {
                              'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                              'url': base_url,
+                            'manifest_url': mpd_url,
                              'ext': mimetype2ext(mime_type),
                              'width': int_or_none(representation_attrib.get('width')),
                              'height': int_or_none(representation_attrib.get('height')),
@@ -1682,14 +1689,6 @@ class InfoExtractor(object):
                                  if 'total_number' not in representation_ms_info and 'segment_duration':
                                      segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                      representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
-                                representation_ms_info['segment_urls'] = [
-                                    media_template % {
-                                        'Number': segment_number,
-                                        'Bandwidth': representation_attrib.get('bandwidth'),
-                                    }
-                                    for segment_number in range(
-                                        representation_ms_info['start_number'],
-                                        representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                                  representation_ms_info['fragments'] = [{
                                      'url': media_template % {
                                          'Number': segment_number,
@@ -1703,7 +1702,6 @@ class InfoExtractor(object):
                                  # $Number*$ or $Time$ in media template with S list available
                                  # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                  # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
-                                representation_ms_info['segment_urls'] = []
                                  representation_ms_info['fragments'] = []
                                  segment_time = 0
                                  segment_d = None
@@ -1715,7 +1713,6 @@ class InfoExtractor(object):
                                          'Bandwidth': representation_attrib.get('bandwidth'),
                                          'Number': segment_number,
                                      }
-                                    representation_ms_info['segment_urls'].append(segment_url)
                                      representation_ms_info['fragments'].append({
                                          'url': segment_url,
                                          'duration': float_or_none(segment_d, representation_ms_info['timescale']),
@@ -1745,17 +1742,15 @@ class InfoExtractor(object):
                                          'duration': float_or_none(s['d'], representation_ms_info['timescale']),
                                      })
                              representation_ms_info['fragments'] = fragments
-                        if 'segment_urls' in representation_ms_info:
+                        # NB: MPD manifest may contain direct URLs to unfragmented media.
+                        # No fragments key is present in this case.
+                        if 'fragments' in representation_ms_info:
                              f.update({
-                                'segment_urls': representation_ms_info['segment_urls'],
                                  'fragments': [],
                                  'protocol': 'http_dash_segments',
                              })
                              if 'initialization_url' in representation_ms_info:
                                  initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
-                                f.update({
-                                    'initialization_url': initialization_url,
-                                })
                                  if not f.get('url'):
                                      f['url'] = initialization_url
                                  f['fragments'].append({'url': initialization_url})
@@ -1807,7 +1802,11 @@ class InfoExtractor(object):
              return is_plain_url, formats
  
          entries = []
-        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
+        media_tags = [(media_tag, media_type, '')
+                      for media_tag, media_type
+                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
+        media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
+        for media_tag, media_type, media_content in media_tags:
              media_info = {
                  'formats': [],
                  'subtitles': {},
@@ -1834,7 +1833,7 @@ class InfoExtractor(object):
                  for track_tag in re.findall(r'<track[^>]+>', media_content):
                      track_attributes = extract_attributes(track_tag)
                      kind = track_attributes.get('kind')
-                    if not kind or kind == 'subtitles':
+                    if not kind or kind in ('subtitles', 'captions'):
                          src = track_attributes.get('src')
                          if not src:
                              continue
@@ -1842,16 +1841,21 @@ class InfoExtractor(object):
                          media_info['subtitles'].setdefault(lang, []).append({
                              'url': absolute_url(src),
                          })
-            if media_info['formats']:
+            if media_info['formats'] or media_info['subtitles']:
                  entries.append(media_info)
          return entries
  
      def _extract_akamai_formats(self, manifest_url, video_id):
          formats = []
+        hdcore_sign = 'hdcore=3.7.0'
          f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
-        formats.extend(self._extract_f4m_formats(
-            update_url_query(f4m_url, {'hdcore': '3.7.0'}),
-            video_id, f4m_id='hds', fatal=False))
+        if 'hdcore=' not in f4m_url:
+            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
+        f4m_formats = self._extract_f4m_formats(
+            f4m_url, video_id, f4m_id='hds', fatal=False)
+        for entry in f4m_formats:
+            entry.update({'extra_param_to_segment_url': hdcore_sign})
+        formats.extend(f4m_formats)
          m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
          formats.extend(self._extract_m3u8_formats(
              m3u8_url, video_id, 'mp4', 'm3u8_native',
@@ -2021,6 +2025,12 @@ class InfoExtractor(object):
              headers['Ytdl-request-proxy'] = geo_verification_proxy
          return headers
  
+    def _generic_id(self, url):
+        return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
+
+    def _generic_title(self, url):
+        return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
+
  
  class SearchInfoExtractor(InfoExtractor):
      """