[youtube] Improve format filesize extraction (#16453)

author Sergey M․ <dstftw@gmail.com>
Mon, 14 May 2018 16:27:56 +0000 (23:27 +0700)
committer Sergey M․ <dstftw@gmail.com>
Mon, 14 May 2018 16:27:56 +0000 (23:27 +0700)
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 897398d2086aee820966134f29b04c2fcca17596..7f4298c088c7514534e66f6a8d1921fe57196f03 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1815,6 +1815,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
  
          chapters = self._extract_chapters(description_original, video_duration)
  
+        def _extract_filesize(media_url):
+            return int_or_none(self._search_regex(
+                r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
+
          if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
              self.report_rtmp_download()
              formats = [{
@@ -1919,8 +1923,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
                  width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
  
+                filesize = int_or_none(url_data.get(
+                    'clen', [None])[0]) or _extract_filesize(url)
+
                  more_fields = {
-                    'filesize': int_or_none(url_data.get('clen', [None])[0]),
+                    'filesize': filesize,
                      'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
                      'width': width,
                      'height': height,
@@ -1994,6 +2001,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      for df in self._extract_mpd_formats(
                              mpd_url, video_id, fatal=dash_mpd_fatal,
                              formats_dict=self._formats):
+                        if not df.get('filesize'):
+                            df['filesize'] = _extract_filesize(df['url'])
                          # Do not overwrite DASH format found in some previous DASH manifest
                          if df['format_id'] not in dash_formats:
                              dash_formats[df['format_id']] = df
author	Sergey M․ <dstftw@gmail.com>
	Mon, 14 May 2018 16:27:56 +0000 (23:27 +0700)
committer	Sergey M․ <dstftw@gmail.com>
	Mon, 14 May 2018 16:27:56 +0000 (23:27 +0700)