Merge pull request #6097 from dstftw/union-itags-from-multiple-dashmpd
author Sergey M. <dstftw@gmail.com>
Mon, 29 Jun 2015 15:58:34 +0000 (20:58 +0500)
committer Sergey M. <dstftw@gmail.com>
Mon, 29 Jun 2015 15:58:34 +0000 (20:58 +0500)
[youtube] Extract formats from multiple DASH manifests (Closes #6093)

1  2 
youtube_dl/extractor/youtube.py
youtube_dl/options.py

index 8b43e274b73b1393305566a1e6a0a36f0d71551c,20e1781f80ba23f8851121ecdc4ec1721f9e3269..036793fc0f71a38a529abd4955c32a2b9c774a7f
@@@ -29,11 -29,9 +29,11 @@@ from ..utils import 
      get_element_by_id,
      int_or_none,
      orderedSet,
 +    str_to_int,
      unescapeHTML,
      unified_strdate,
      uppercase_escape,
 +    ISO3166Utils,
  )
  
  
@@@ -520,6 -518,20 +520,20 @@@ class YoutubeIE(YoutubeBaseInfoExtracto
                  'skip_download': 'requires avconv',
              }
          },
+         # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
+         {
+             'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
+             'info_dict': {
+                 'id': 'FIl7x6_3R5Y',
+                 'ext': 'mp4',
+                 'title': 'md5:7b81415841e02ecd4313668cde88737a',
+                 'description': 'md5:116377fd2963b81ec4ce64b542173306',
+                 'upload_date': '20150625',
+                 'uploader_id': 'dorappi2000',
+                 'uploader': 'dorappi2000',
+                 'formats': 'mincount:33',
+             },
+         }
      ]
  
      def __init__(self, *args, **kwargs):
          else:
              player_url = None
  
+         dash_mpds = []
+         def add_dash_mpd(video_info):
+             dash_mpd = video_info.get('dashmpd')
+             if dash_mpd and dash_mpd[0] not in dash_mpds:
+                 dash_mpds.append(dash_mpd[0])
          # Get video info
          embed_webpage = None
          if re.search(r'player-age-gate-content">', video_webpage) is not None:
                  note='Refetching age-gated info webpage',
                  errnote='unable to download video info webpage')
              video_info = compat_parse_qs(video_info_webpage)
+             add_dash_mpd(video_info)
          else:
              age_gate = False
-             try:
-                 # Try looking directly into the video webpage
-                 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
-                 if not mobj:
-                     raise ValueError('Could not find ytplayer.config')  # caught below
+             video_info = None
+             # Try looking directly into the video webpage
+             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
+             if mobj:
                  json_code = uppercase_escape(mobj.group(1))
                  ytplayer_config = json.loads(json_code)
                  args = ytplayer_config['args']
-                 # Convert to the same format returned by compat_parse_qs
-                 video_info = dict((k, [v]) for k, v in args.items())
-                 if not args.get('url_encoded_fmt_stream_map'):
-                     raise ValueError('No stream_map present')  # caught below
-             except ValueError:
-                 # We fallback to the get_video_info pages (used by the embed page)
+                 if args.get('url_encoded_fmt_stream_map'):
+                     # Convert to the same format returned by compat_parse_qs
+                     video_info = dict((k, [v]) for k, v in args.items())
+                     add_dash_mpd(video_info)
+             if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+                 # We also try looking in get_video_info since it may contain different dashmpd
+                 # URL that points to a DASH manifest with possibly different itag set (some itags
+                 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
+                 # manifest pointed by get_video_info's dashmpd).
+                 # The general idea is to take a union of itags of both DASH manifests (for example
+                 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
                  self.report_video_info_webpage_download(video_id)
-                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+                 for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                      video_info_url = (
                          '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                          % (proto, video_id, el_type))
                          video_info_url,
                          video_id, note=False,
                          errnote='unable to download video info webpage')
-                     video_info = compat_parse_qs(video_info_webpage)
-                     if 'token' in video_info:
+                     get_video_info = compat_parse_qs(video_info_webpage)
+                     add_dash_mpd(get_video_info)
+                     if not video_info:
+                         video_info = get_video_info
+                     if 'token' in get_video_info:
                          break
          if 'token' not in video_info:
              if 'reason' in video_info:
 +                if 'The uploader has not made this video available in your country.' in video_info['reason']:
 +                    regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
 +                    if regions_allowed is not None:
 +                        raise ExtractorError('YouTube said: This video is available in %s only' % (
 +                            ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
 +                            expected=True)
                  raise ExtractorError(
                      'YouTube said: %s' % video_info['reason'][0],
                      expected=True, video_id=video_id)
                  video_description = ''
  
          def _extract_count(count_name):
 -            count = self._search_regex(
 -                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
 -                video_webpage, count_name, default=None)
 -            if count is not None:
 -                return int(count.replace(',', ''))
 -            return None
 +            return str_to_int(self._search_regex(
 +                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
 +                % re.escape(count_name),
 +                video_webpage, count_name, default=None))
 +
          like_count = _extract_count('like')
          dislike_count = _extract_count('dislike')
  
  
          # Look for the DASH manifest
          if self._downloader.params.get('youtube_include_dash_manifest', True):
-             dash_mpd = video_info.get('dashmpd')
-             if dash_mpd:
-                 dash_manifest_url = dash_mpd[0]
+             for dash_manifest_url in dash_mpds:
+                 dash_formats = {}
                  try:
-                     dash_formats = self._parse_dash_manifest(
-                         video_id, dash_manifest_url, player_url, age_gate)
+                     for df in self._parse_dash_manifest(
+                             video_id, dash_manifest_url, player_url, age_gate):
+                         # Do not overwrite DASH format found in some previous DASH manifest
+                         if df['format_id'] not in dash_formats:
+                             dash_formats[df['format_id']] = df
                  except (ExtractorError, KeyError) as e:
                      self.report_warning(
                          'Skipping DASH manifest: %r' % e, video_id)
-                 else:
+                 if dash_formats:
                      # Remove the formats we found through non-DASH, they
                      # contain less info and it can be wrong, because we use
                      # fixed values (for example the resolution). See
                      # https://github.com/rg3/youtube-dl/issues/5774 for an
                      # example.
-                     dash_keys = set(df['format_id'] for df in dash_formats)
-                     formats = [f for f in formats if f['format_id'] not in dash_keys]
-                     formats.extend(dash_formats)
+                     formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
+                     formats.extend(dash_formats.values())
  
          # Check for malformed aspect ratio
          stretched_m = re.search(
diff --combined youtube_dl/options.py
index e7d0676425678072e2828a36f903f4b6147c798e,e3dfb7af93f6c7f74cd4257944df9adf6c230e67..4762e1e3c7bf1234dfa468f6f0371c781f1e5281
@@@ -346,13 -346,12 +346,13 @@@ def parseOpts(overrideArguments=None)
      video_format.add_option(
          '--youtube-skip-dash-manifest',
          action='store_false', dest='youtube_include_dash_manifest',
-         help='Do not download the DASH manifest on YouTube videos')
+         help='Do not download the DASH manifests and related data on YouTube videos')
      video_format.add_option(
          '--merge-output-format',
          action='store', dest='merge_output_format', metavar='FORMAT', default=None,
          help=(
 -            'If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.'
 +            'If a merge is required (e.g. bestvideo+bestaudio), '
 +            'output to given container format. One of mkv, mp4, ogg, webm, flv. '
              'Ignored if no merge is required'))
  
      subtitles = optparse.OptionGroup(parser, 'Subtitle Options')