[vodpl] Make more robust and add another test (closes #12122)

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 2c8ec1417c21cb9397e34efe8dec7e0e5bca9e62..9681453ca397fa495d21b5a691701255fd18df33 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1208,6 +1208,9 @@ class InfoExtractor(object):
          m3u8_doc, urlh = res
          m3u8_url = urlh.geturl()
  
+        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
+            return []
+
          formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
  
          format_url = lambda u: (
@@ -1315,8 +1318,8 @@ class InfoExtractor(object):
                          'abr': abr,
                      })
                  f.update(parse_codecs(last_info.get('CODECS')))
-                if audio_in_video_stream.get(last_info.get('AUDIO')) is False:
-                    # TODO: update acodec for for audio only formats with the same GROUP-ID
+                if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
+                    # TODO: update acodec for audio only formats with the same GROUP-ID
                      f['acodec'] = 'none'
                  formats.append(f)
                  last_info = {}
@@ -1959,7 +1962,12 @@ class InfoExtractor(object):
          media_tags = [(media_tag, media_type, '')
                        for media_tag, media_type
                        in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
-        media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
+        media_tags.extend(re.findall(
+            # We only allow video|audio followed by a whitespace or '>'.
+            # Allowing more characters may end up in significant slow down (see
+            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
+            # http://www.porntrex.com/maps/videositemap.xml).
+            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
          for media_tag, media_type, media_content in media_tags:
              media_info = {
                  'formats': [],