[extractor/common] detect f4m audio-only formats

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index da192728f182dbe38c754337c83d5766112f123d..0cbb97aae9727a0ca0cf2e149d54b5bf66371769 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -235,7 +235,7 @@ class InfoExtractor(object):
      chapter_id:     Id of the chapter the video belongs to, as a unicode string.
  
      The following fields should only be used when the video is an episode of some
-    series or programme:
+    series, programme or podcast:
  
      series:         Title of the series or programme the video episode belongs to.
      season:         Title of the season the video episode belongs to.
@@ -1100,6 +1100,13 @@ class InfoExtractor(object):
              manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
              'bootstrap info', default=None)
  
+        vcodec = None
+        mime_type = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
+            'base URL', default=None)
+        if mime_type and mime_type.startswith('audio/'):
+            vcodec = 'none'
+
          for i, media_el in enumerate(media_nodes):
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              width = int_or_none(media_el.attrib.get('width'))
@@ -1140,6 +1147,7 @@ class InfoExtractor(object):
                              'width': f.get('width') or width,
                              'height': f.get('height') or height,
                              'format_id': f.get('format_id') if not tbr else format_id,
+                            'vcodec': vcodec,
                          })
                      formats.extend(f4m_formats)
                      continue
@@ -1156,6 +1164,7 @@ class InfoExtractor(object):
                  'tbr': tbr,
                  'width': width,
                  'height': height,
+                'vcodec': vcodec,
                  'preference': preference,
              })
          return formats
@@ -1802,7 +1811,11 @@ class InfoExtractor(object):
              return is_plain_url, formats
  
          entries = []
-        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
+        media_tags = [(media_tag, media_type, '')
+                      for media_tag, media_type
+                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
+        media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
+        for media_tag, media_type, media_content in media_tags:
              media_info = {
                  'formats': [],
                  'subtitles': {},