Merge pull request #6097 from dstftw/union-itags-from-multiple-dashmpd

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 46841617a44c85d9e7f4b2541f5c6d36829e388b..036793fc0f71a38a529abd4955c32a2b9c774a7f 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -29,9 +29,11 @@ from ..utils import (
      get_element_by_id,
      int_or_none,
      orderedSet,
+    str_to_int,
      unescapeHTML,
      unified_strdate,
      uppercase_escape,
+    ISO3166Utils,
  )
  
  
@@ -518,6 +520,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'skip_download': 'requires avconv',
              }
          },
+        # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
+        {
+            'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
+            'info_dict': {
+                'id': 'FIl7x6_3R5Y',
+                'ext': 'mp4',
+                'title': 'md5:7b81415841e02ecd4313668cde88737a',
+                'description': 'md5:116377fd2963b81ec4ce64b542173306',
+                'upload_date': '20150625',
+                'uploader_id': 'dorappi2000',
+                'uploader': 'dorappi2000',
+                'formats': 'mincount:33',
+            },
+        }
      ]
  
      def __init__(self, *args, **kwargs):
@@ -883,6 +899,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              add_dash_mpd(video_info)
          else:
              age_gate = False
+            video_info = None
              # Try looking directly into the video webpage
              mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
              if mobj:
@@ -893,29 +910,36 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      # Convert to the same format returned by compat_parse_qs
                      video_info = dict((k, [v]) for k, v in args.items())
                      add_dash_mpd(video_info)
-            # We also try looking in get_video_info since it may contain different dashmpd
-            # URL that points to a DASH manifest with possibly different itag set (some itags
-            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
-            # manifest pointed by get_video_info's dashmpd).
-            # The general idea is to take a union of itags of both DASH manifests (for example
-            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
-            self.report_video_info_webpage_download(video_id)
-            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
-                video_info_url = (
-                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
-                    % (proto, video_id, el_type))
-                video_info_webpage = self._download_webpage(
-                    video_info_url,
-                    video_id, note=False,
-                    errnote='unable to download video info webpage')
-                get_video_info = compat_parse_qs(video_info_webpage)
-                add_dash_mpd(get_video_info)
-                if not video_info:
-                    video_info = get_video_info
-                if 'token' in get_video_info:
-                    break
+            if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+                # We also try looking in get_video_info since it may contain different dashmpd
+                # URL that points to a DASH manifest with possibly different itag set (some itags
+                # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
+                # manifest pointed by get_video_info's dashmpd).
+                # The general idea is to take a union of itags of both DASH manifests (for example
+                # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
+                self.report_video_info_webpage_download(video_id)
+                for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
+                    video_info_url = (
+                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+                        % (proto, video_id, el_type))
+                    video_info_webpage = self._download_webpage(
+                        video_info_url,
+                        video_id, note=False,
+                        errnote='unable to download video info webpage')
+                    get_video_info = compat_parse_qs(video_info_webpage)
+                    add_dash_mpd(get_video_info)
+                    if not video_info:
+                        video_info = get_video_info
+                    if 'token' in get_video_info:
+                        break
          if 'token' not in video_info:
              if 'reason' in video_info:
+                if 'The uploader has not made this video available in your country.' in video_info['reason']:
+                    regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
+                    if regions_allowed is not None:
+                        raise ExtractorError('YouTube said: This video is available in %s only' % (
+                            ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
+                            expected=True)
                  raise ExtractorError(
                      'YouTube said: %s' % video_info['reason'][0],
                      expected=True, video_id=video_id)
@@ -1011,12 +1035,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  video_description = ''
  
          def _extract_count(count_name):
-            count = self._search_regex(
-                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
-                video_webpage, count_name, default=None)
-            if count is not None:
-                return int(count.replace(',', ''))
-            return None
+            return str_to_int(self._search_regex(
+                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
+                % re.escape(count_name),
+                video_webpage, count_name, default=None))
+
          like_count = _extract_count('like')
          dislike_count = _extract_count('dislike')