[youtube] Update signature function patterns (closes #21469, closes #21476)

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 8619f38381e42f46367f8044ba3552d1b1a0f7dd..83b6ac1346580a65221b6c1db90751e7e4141915 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -500,6 +500,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
  
          # RTMP (unnamed)
          '_rtmp': {'protocol': 'rtmp'},
+
+        # av01 video only formats sometimes served with "unknown" codecs
+        '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
      }
      _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
  
@@ -1306,11 +1312,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
  
      def _parse_sig_js(self, jscode):
          funcname = self._search_regex(
-            (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             # Obsolete patterns
+             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
               r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
-             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(',
-             r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
-             r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
+             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
              jscode, 'Initial JS player signature function name', group='sig')
  
          jsi = JSInterpreter(jscode)
@@ -1575,8 +1587,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          return video_id
  
      def _extract_annotations(self, video_id):
-        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
-        return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
+        return self._download_webpage(
+            'https://www.youtube.com/annotations_invideo', video_id,
+            note='Downloading annotations',
+            errnote='Unable to download video annotations', fatal=False,
+            query={
+                'features': 1,
+                'legacy': 1,
+                'video_id': video_id,
+            })
  
      @staticmethod
      def _extract_chapters(description, duration):
@@ -1789,9 +1808,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              raise ExtractorError(
                  'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
  
-        if video_info.get('license_info'):
-            raise ExtractorError('This video is DRM protected.', expected=True)
-
          video_details = try_get(
              player_response, lambda x: x['videoDetails'], dict) or {}
  
@@ -1927,7 +1943,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              formats = []
              for url_data_str in encoded_url_map.split(','):
                  url_data = compat_parse_qs(url_data_str)
-                if 'itag' not in url_data or 'url' not in url_data:
+                if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'):
                      continue
                  stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
                  # Unsupported FORMAT_STREAM_TYPE_OTF
@@ -2052,8 +2068,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  url_or_none(try_get(
                      player_response,
                      lambda x: x['streamingData']['hlsManifestUrl'],
-                    compat_str)) or
-                url_or_none(try_get(
+                    compat_str))
+                or url_or_none(try_get(
                      video_info, lambda x: x['hlsvp'][0], compat_str)))
              if manifest_url:
                  formats = []
@@ -2102,10 +2118,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              self._downloader.report_warning('unable to extract uploader nickname')
  
          channel_id = (
-            str_or_none(video_details.get('channelId')) or
-            self._html_search_meta(
-                'channelId', video_webpage, 'channel id', default=None) or
-            self._search_regex(
+            str_or_none(video_details.get('channelId'))
+            or self._html_search_meta(
+                'channelId', video_webpage, 'channel id', default=None)
+            or self._search_regex(
                  r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                  video_webpage, 'channel id', default=None, group='id'))
          channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
@@ -2227,6 +2243,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
                  'view count', default=None))
  
+        average_rating = (
+            float_or_none(video_details.get('averageRating'))
+            or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
+
          # subtitles
          video_subtitles = self.extract_subtitles(video_id, video_webpage)
          automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
@@ -2323,6 +2343,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                          '"token" parameter not in video info for unknown reason',
                          video_id=video_id)
  
+        if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
          self._sort_formats(formats)
  
          self.mark_watched(video_id, video_info, player_response)
@@ -2353,7 +2376,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              'view_count': view_count,
              'like_count': like_count,
              'dislike_count': dislike_count,
-            'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
+            'average_rating': average_rating,
              'formats': formats,
              'is_live': is_live,
              'start_time': start_time,
@@ -2564,9 +2587,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
  
          search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
          title_span = (
-            search_title('playlist-title') or
-            search_title('title long-title') or
-            search_title('title'))
+            search_title('playlist-title')
+            or search_title('title long-title')
+            or search_title('title'))
          title = clean_html(title_span)
  
          return self.playlist_result(url_results, playlist_id, title)