[vodpl] Make more robust and add another test (closes #12122)

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index a3048fb595ea62f1648cc98cdfcb77eda3bef6fb..9681453ca397fa495d21b5a691701255fd18df33 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -121,9 +121,19 @@ class InfoExtractor(object):
                                   download, lower-case.
                                   "http", "https", "rtsp", "rtmp", "rtmpe",
                                   "m3u8", "m3u8_native" or "http_dash_segments".
-                    * fragments  A list of fragments of the fragmented media,
-                                 with the following entries:
-                                 * "url" (mandatory) - fragment's URL
+                    * fragment_base_url
+                                 Base URL for fragments. Each fragment's path
+                                 value (if present) will be relative to
+                                 this URL.
+                    * fragments  A list of fragments of a fragmented media.
+                                 Each fragment entry must contain either a url
+                                 or a path. If a url is present, it should be
+                                 considered by a client. Otherwise both path and
+                                 fragment_base_url must be present. Here is
+                                 the list of all potential fields:
+                                 * "url" - fragment's URL
+                                 * "path" - fragment's path relative to
+                                            fragment_base_url
                                   * "duration" (optional, int or float)
                                   * "filesize" (optional, int)
                      * preference Order number of this format. If this field is
@@ -1015,13 +1025,13 @@ class InfoExtractor(object):
                  unique_formats.append(f)
          formats[:] = unique_formats
  
-    def _is_valid_url(self, url, video_id, item='video'):
+    def _is_valid_url(self, url, video_id, item='video', headers={}):
          url = self._proto_relative_url(url, scheme='http:')
          # For now assume non HTTP(S) URLs always valid
          if not (url.startswith('http://') or url.startswith('https://')):
              return True
          try:
-            self._request_webpage(url, video_id, 'Checking %s URL' % item)
+            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
              return True
          except ExtractorError as e:
              if isinstance(e.cause, compat_urllib_error.URLError):
@@ -1198,6 +1208,9 @@ class InfoExtractor(object):
          m3u8_doc, urlh = res
          m3u8_url = urlh.geturl()
  
+        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
+            return []
+
          formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
  
          format_url = lambda u: (
@@ -1305,8 +1318,8 @@ class InfoExtractor(object):
                          'abr': abr,
                      })
                  f.update(parse_codecs(last_info.get('CODECS')))
-                if audio_in_video_stream.get(last_info.get('AUDIO')) is False:
-                    # TODO: update acodec for for audio only formats with the same GROUP-ID
+                if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
+                    # TODO: update acodec for audio only formats with the same GROUP-ID
                      f['acodec'] = 'none'
                  formats.append(f)
                  last_info = {}
@@ -1627,12 +1640,12 @@ class InfoExtractor(object):
                  segment_template = element.find(_add_ns('SegmentTemplate'))
                  if segment_template is not None:
                      extract_common(segment_template)
-                    media_template = segment_template.get('media')
-                    if media_template:
-                        ms_info['media_template'] = media_template
+                    media = segment_template.get('media')
+                    if media:
+                        ms_info['media'] = media
                      initialization = segment_template.get('initialization')
                      if initialization:
-                        ms_info['initialization_url'] = initialization
+                        ms_info['initialization'] = initialization
                      else:
                          extract_Initialization(segment_template)
              return ms_info
@@ -1676,6 +1689,7 @@ class InfoExtractor(object):
                          lang = representation_attrib.get('lang')
                          url_el = representation.find(_add_ns('BaseURL'))
                          filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
+                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                          f = {
                              'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                              'url': base_url,
@@ -1683,7 +1697,7 @@ class InfoExtractor(object):
                              'ext': mimetype2ext(mime_type),
                              'width': int_or_none(representation_attrib.get('width')),
                              'height': int_or_none(representation_attrib.get('height')),
-                            'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
+                            'tbr': int_or_none(bandwidth, 1000),
                              'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                              'fps': int_or_none(representation_attrib.get('frameRate')),
                              'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
@@ -1692,13 +1706,32 @@ class InfoExtractor(object):
                          }
                          f.update(parse_codecs(representation_attrib.get('codecs')))
                          representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
-                        if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
  
-                            media_template = representation_ms_info['media_template']
-                            media_template = media_template.replace('$RepresentationID$', representation_id)
-                            media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
-                            media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
-                            media_template.replace('$$', '$')
+                        def prepare_template(template_name, identifiers):  # DASH $...$ template -> %-format string
+                            t = representation_ms_info[template_name]
+                            t = t.replace('$RepresentationID$', representation_id)
+                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)  # $Name$ -> %(Name)d
+                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)  # $Name%fmt$ -> %(Name)fmt
+                            t = t.replace('$$', '$')  # str.replace returns a new string; keep the unescaped result
+                            return t
+
+                        # @initialization is a regular template like the @media one
+                        # so it should be handled just the same way (see
+                        # https://github.com/rg3/youtube-dl/issues/11605)
+                        if 'initialization' in representation_ms_info:
+                            initialization_template = prepare_template(
+                                'initialization',
+                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
+                                # $Time$ shall not be included for @initialization thus
+                                # only $Bandwidth$ remains
+                                ('Bandwidth', ))
+                            representation_ms_info['initialization_url'] = initialization_template % {
+                                'Bandwidth': bandwidth,
+                            }
+
+                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
+
+                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
  
                              # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                              # can't be used at the same time
@@ -1710,7 +1743,7 @@ class InfoExtractor(object):
                                  representation_ms_info['fragments'] = [{
                                      'url': media_template % {
                                          'Number': segment_number,
-                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
+                                        'Bandwidth': bandwidth,
                                      },
                                      'duration': segment_duration,
                                  } for segment_number in range(
@@ -1728,7 +1761,7 @@ class InfoExtractor(object):
                                  def add_segment_url():
                                      segment_url = media_template % {
                                          'Time': segment_time,
-                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
+                                        'Bandwidth': bandwidth,
                                          'Number': segment_number,
                                      }
                                      representation_ms_info['fragments'].append({
@@ -1770,7 +1803,7 @@ class InfoExtractor(object):
                                  'protocol': 'http_dash_segments',
                              })
                              if 'initialization_url' in representation_ms_info:
-                                initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
+                                initialization_url = representation_ms_info['initialization_url']
                                  if not f.get('url'):
                                      f['url'] = initialization_url
                                  f['fragments'].append({'url': initialization_url})
@@ -1929,7 +1962,12 @@ class InfoExtractor(object):
          media_tags = [(media_tag, media_type, '')
                        for media_tag, media_type
                        in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
-        media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
+        media_tags.extend(re.findall(
+            # We only allow video|audio followed by a whitespace or '>'.
+            # Allowing more characters may cause a significant slowdown (see
+            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
+            # http://www.porntrex.com/maps/videositemap.xml).
+            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
          for media_tag, media_type, media_content in media_tags:
              media_info = {
                  'formats': [],