Refactor fragments interface and dash segments downloader

author Sergey M․ <dstftw@gmail.com>

Sat, 17 Sep 2016 13:35:22 +0000 (20:35 +0700)

committer Sergey M․ <dstftw@gmail.com>

Sat, 17 Sep 2016 13:35:22 +0000 (20:35 +0700)
author Sergey M․ <dstftw@gmail.com>
Sat, 17 Sep 2016 13:35:22 +0000 (20:35 +0700)
committer Sergey M․ <dstftw@gmail.com>
Sat, 17 Sep 2016 13:35:22 +0000 (20:35 +0700)
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py

index 41fc9cfc2b6b301bc09261c4b79668ca2395d544..8437dde30ca2afe031afb1ff2882ed12ac4b49b5 100644 (file)
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@@ -1,7 +1,6 @@
  from __future__ import unicode_literals
  
  import os
-import re
  
  from .fragment import FragmentFD
  from ..compat import compat_urllib_error
@@ -19,34 +18,32 @@ class DashSegmentsFD(FragmentFD):
      FD_NAME = 'dashsegments'
  
      def real_download(self, filename, info_dict):
-        base_url = info_dict['url']
-        segment_urls = [info_dict['segment_urls'][0]] if self.params.get('test', False) else info_dict['segment_urls']
-        initialization_url = info_dict.get('initialization_url')
+        segments = info_dict['fragments'][:1] if self.params.get(
+            'test', False) else info_dict['fragments']
  
          ctx = {
              'filename': filename,
-            'total_frags': len(segment_urls) + (1 if initialization_url else 0),
+            'total_frags': len(segments),
          }
  
          self._prepare_and_start_frag_download(ctx)
  
-        def combine_url(base_url, target_url):
-            if re.match(r'^https?://', target_url):
-                return target_url
-            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
-
          segments_filenames = []
  
          fragment_retries = self.params.get('fragment_retries', 0)
          skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
  
-        def process_segment(segment, tmp_filename, fatal):
-            target_url, segment_name = segment
+        def process_segment(segment, tmp_filename, num):
+            segment_url = segment['url']
+            segment_name = 'Frag%d' % num
              target_filename = '%s-%s' % (tmp_filename, segment_name)
+            # In DASH, the first segment contains necessary headers to
+            # generate a valid MP4 file, so always abort for the first segment
+            fatal = num == 0 or not skip_unavailable_fragments
              count = 0
              while count <= fragment_retries:
                  try:
-                    success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)})
+                    success = ctx['dl'].download(target_filename, {'url': segment_url})
                      if not success:
                          return False
                      down, target_sanitized = sanitize_open(target_filename, 'rb')
@@ -72,16 +69,8 @@ class DashSegmentsFD(FragmentFD):
                  return False
              return True
  
-        segments_to_download = [(initialization_url, 'Init')] if initialization_url else []
-        segments_to_download.extend([
-            (segment_url, 'Seg%d' % i)
-            for i, segment_url in enumerate(segment_urls)])
-
-        for i, segment in enumerate(segments_to_download):
-            # In DASH, the first segment contains necessary headers to
-            # generate a valid MP4 file, so always abort for the first segment
-            fatal = i == 0 or not skip_unavailable_fragments
-            if not process_segment(segment, ctx['tmpfilename'], fatal):
+        for i, segment in enumerate(segments):
+            if not process_segment(segment, ctx['tmpfilename'], i):
                  return False
  
          self._finish_frag_download(ctx)
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index e637b33d51689756b569b752c55e63fe4503de26..f35311e7ac189a3b8467cc3a3f7348ce3c2b1a3b 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -86,9 +86,10 @@ class InfoExtractor(object):
                      from worst to best quality.
  
                      Potential fields:
-                    * url        Mandatory. The URL of the video file or URL of
-                                 the manifest file in case of fragmented media
-                                 (DASH, hls, hds).
+                    * url        Mandatory. The URL of the video file
+                    * manifest_url
+                                 The URL of the manifest file in case of
+                                 fragmented media (DASH, hls, hds)
                      * ext        Will be calculated from URL if missing
                      * format     A human-readable description of the format
                                   ("mp4 container with h264/opus").
@@ -1528,9 +1529,10 @@ class InfoExtractor(object):
          mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
  
          return self._parse_mpd_formats(
-            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
+            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
+            formats_dict=formats_dict, mpd_url=mpd_url)
  
-    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
+    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
          """
          Parse formats from MPD manifest.
          References:
@@ -1654,6 +1656,7 @@ class InfoExtractor(object):
                          f = {
                              'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                              'url': base_url,
+                            'manifest_url': mpd_url,
                              'ext': mimetype2ext(mime_type),
                              'width': int_or_none(representation_attrib.get('width')),
                              'height': int_or_none(representation_attrib.get('height')),
@@ -1682,14 +1685,6 @@ class InfoExtractor(object):
                                  if 'total_number' not in representation_ms_info and 'segment_duration':
                                      segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                      representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
-                                representation_ms_info['segment_urls'] = [
-                                    media_template % {
-                                        'Number': segment_number,
-                                        'Bandwidth': representation_attrib.get('bandwidth'),
-                                    }
-                                    for segment_number in range(
-                                        representation_ms_info['start_number'],
-                                        representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                                  representation_ms_info['fragments'] = [{
                                      'url': media_template % {
                                          'Number': segment_number,
@@ -1703,7 +1698,6 @@ class InfoExtractor(object):
                                  # $Number*$ or $Time$ in media template with S list available
                                  # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                  # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
-                                representation_ms_info['segment_urls'] = []
                                  representation_ms_info['fragments'] = []
                                  segment_time = 0
                                  segment_d = None
@@ -1715,7 +1709,6 @@ class InfoExtractor(object):
                                          'Bandwidth': representation_attrib.get('bandwidth'),
                                          'Number': segment_number,
                                      }
-                                    representation_ms_info['segment_urls'].append(segment_url)
                                      representation_ms_info['fragments'].append({
                                          'url': segment_url,
                                          'duration': float_or_none(segment_d, representation_ms_info['timescale']),
@@ -1745,17 +1738,15 @@ class InfoExtractor(object):
                                          'duration': float_or_none(s['d'], representation_ms_info['timescale']),
                                      })
                              representation_ms_info['fragments'] = fragments
-                        if 'segment_urls' in representation_ms_info:
+                        # NB: MPD manifest may contain direct URLs to unfragmented media.
+                        # No fragments key is present in this case.
+                        if 'fragments' in representation_ms_info:
                              f.update({
-                                'segment_urls': representation_ms_info['segment_urls'],
                                  'fragments': [],
                                  'protocol': 'http_dash_segments',
                              })
                              if 'initialization_url' in representation_ms_info:
                                  initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
-                                f.update({
-                                    'initialization_url': initialization_url,
-                                })
                                  if not f.get('url'):
                                      f['url'] = initialization_url
                                  f['fragments'].append({'url': initialization_url})
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 92a6e51461e63fe14d3c68296f91d9c0bc89ef8c..c1792c5348f3e4aea120bc873eb7670b15fccf11 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1657,7 +1657,9 @@ class GenericIE(InfoExtractor):
                  return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
                  info_dict['formats'] = self._parse_mpd_formats(
-                    doc, video_id, mpd_base_url=url.rpartition('/')[0])
+                    doc, video_id,
+                    mpd_base_url=full_response.geturl().rpartition('/')[0],
+                    mpd_url=url)
                  self._sort_formats(info_dict['formats'])
                  return info_dict
              elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
author	Sergey M․ <dstftw@gmail.com>
	Sat, 17 Sep 2016 13:35:22 +0000 (20:35 +0700)
committer	Sergey M․ <dstftw@gmail.com>
	Sat, 17 Sep 2016 13:35:22 +0000 (20:35 +0700)
youtube_dl/downloader/dash.py		patch | blob | history
youtube_dl/extractor/common.py		patch | blob | history
youtube_dl/extractor/generic.py		patch | blob | history