[youtube] Improve error detection (#16445)

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 963c73a2d9cafb3c26e8e5b8bd3c9b52d76b2f44..44740cb75c0867d4d4710ffebf23bff044dcb660 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -27,6 +27,7 @@ from ..compat import (
      compat_str,
  )
  from ..utils import (
+    bool_or_none,
      clean_html,
      dict_get,
      error_to_compat_str,
@@ -116,6 +117,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                  'f.req': json.dumps(f_req),
                  'flowName': 'GlifWebSignIn',
                  'flowEntry': 'ServiceLogin',
+                # TODO: reverse actual botguard identifier generation algo
+                'bgRequest': '["identifier",""]',
              })
              return self._download_json(
                  url, None, note=note, errnote=errnote,
@@ -368,10 +371,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                              (?:www\.)?hooktube\.com/|
                              (?:www\.)?yourepeat\.com/|
                              tube\.majestyc\.net/|
+                            # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
                              (?:(?:www|dev)\.)?invidio\.us/|
-                            (?:www\.)?invidiou\.sh/|
-                            (?:www\.)?invidious\.snopyta\.org/|
+                            (?:(?:www|no)\.)?invidiou\.sh/|
+                            (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
                              (?:www\.)?invidious\.kabi\.tk/|
+                            (?:www\.)?invidious\.enkirton\.net/|
+                            (?:www\.)?invidious\.13ad\.de/|
+                            (?:www\.)?invidious\.mastodon\.host/|
+                            (?:www\.)?tube\.poal\.co/|
                              (?:www\.)?vid\.wxzm\.sx/|
                              youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
@@ -500,6 +508,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
  
          # RTMP (unnamed)
          '_rtmp': {'protocol': 'rtmp'},
+
+        # av01 video only formats sometimes served with "unknown" codecs
+        '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
      }
      _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
  
@@ -1306,11 +1320,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
  
      def _parse_sig_js(self, jscode):
          funcname = self._search_regex(
-            (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+             # Obsolete patterns
+             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
               r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
-             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(',
-             r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
-             r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
+             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
              jscode, 'Initial JS player signature function name', group='sig')
  
          jsi = JSInterpreter(jscode)
@@ -1575,8 +1596,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          return video_id
  
      def _extract_annotations(self, video_id):
-        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
-        return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
+        return self._download_webpage(
+            'https://www.youtube.com/annotations_invideo', video_id,
+            note='Downloading annotations',
+            errnote='Unable to download video annotations', fatal=False,
+            query={
+                'features': 1,
+                'legacy': 1,
+                'video_id': video_id,
+            })
  
      @staticmethod
      def _extract_chapters(description, duration):
@@ -1672,6 +1700,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          def extract_token(v_info):
              return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token'))
  
+        def extract_player_response(player_response, video_id):
+            pl_response = str_or_none(player_response)
+            if not pl_response:
+                return
+            pl_response = self._parse_json(pl_response, video_id, fatal=False)
+            if isinstance(pl_response, dict):
+                add_dash_mpd_pr(pl_response)
+                return pl_response
+
          player_response = {}
  
          # Get video info
@@ -1694,7 +1731,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  note='Refetching age-gated info webpage',
                  errnote='unable to download video info webpage')
              video_info = compat_parse_qs(video_info_webpage)
+            pl_response = video_info.get('player_response', [None])[0]
+            player_response = extract_player_response(pl_response, video_id)
              add_dash_mpd(video_info)
+            view_count = extract_view_count(video_info)
          else:
              age_gate = False
              video_info = None
@@ -1717,11 +1757,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      is_live = True
                  sts = ytplayer_config.get('sts')
                  if not player_response:
-                    pl_response = str_or_none(args.get('player_response'))
-                    if pl_response:
-                        pl_response = self._parse_json(pl_response, video_id, fatal=False)
-                        if isinstance(pl_response, dict):
-                            player_response = pl_response
+                    player_response = extract_player_response(args.get('player_response'), video_id)
              if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
                  add_dash_mpd_pr(player_response)
                  # We also try looking in get_video_info since it may contain different dashmpd
@@ -1753,9 +1789,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      get_video_info = compat_parse_qs(video_info_webpage)
                      if not player_response:
                          pl_response = get_video_info.get('player_response', [None])[0]
-                        if isinstance(pl_response, dict):
-                            player_response = pl_response
-                            add_dash_mpd_pr(player_response)
+                        player_response = extract_player_response(pl_response, video_id)
                      add_dash_mpd(get_video_info)
                      if view_count is None:
                          view_count = extract_view_count(get_video_info)
@@ -1779,7 +1813,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
  
          def extract_unavailable_message():
              return self._html_search_regex(
-                r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
+                (r'(?s)<div[^>]+id=["\']unavailable-submessage["\'][^>]+>(.+?)</div',
+                 r'(?s)<h1[^>]+id=["\']unavailable-message["\'][^>]*>(.+?)</h1>'),
                  video_webpage, 'unavailable message', default=None)
  
          if not video_info:
@@ -1792,16 +1827,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          video_details = try_get(
              player_response, lambda x: x['videoDetails'], dict) or {}
  
-        # title
-        if 'title' in video_info:
-            video_title = video_info['title'][0]
-        elif 'title' in player_response:
-            video_title = video_details['title']
-        else:
+        video_title = video_info.get('title', [None])[0] or video_details.get('title')
+        if not video_title:
              self._downloader.report_warning('Unable to extract video title')
              video_title = '_'
  
-        # description
          description_original = video_description = get_element_by_id("eow-description", video_webpage)
          if video_description:
  
@@ -1826,11 +1856,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              ''', replace_url, video_description)
              video_description = clean_html(video_description)
          else:
-            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
-            if fd_mobj:
-                video_description = unescapeHTML(fd_mobj.group(1))
-            else:
-                video_description = ''
+            video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription')
  
          if not smuggled_data.get('force_singlefeed', False):
              if not self._downloader.params.get('noplaylist'):
@@ -1868,6 +1894,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          if view_count is None and video_details:
              view_count = int_or_none(video_details.get('viewCount'))
  
+        if is_live is None:
+            is_live = bool_or_none(video_details.get('isLive'))
+
          # Check for "rental" videos
          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
              raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
@@ -2070,9 +2099,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
                      formats.append(a_format)
              else:
-                error_message = clean_html(video_info.get('reason', [None])[0])
+                error_message = extract_unavailable_message()
                  if not error_message:
-                    error_message = extract_unavailable_message()
+                    error_message = clean_html(try_get(
+                        player_response, lambda x: x['playabilityStatus']['reason'],
+                        compat_str))
+                if not error_message:
+                    error_message = clean_html(
+                        try_get(video_info, lambda x: x['reason'][0], compat_str))
                  if error_message:
                      raise ExtractorError(error_message, expected=True)
                  raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
@@ -2401,7 +2435,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
                          (%(playlist_id)s)
                       )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
-    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
+    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
      IE_NAME = 'youtube:playlist'
      _TESTS = [{
          'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
@@ -2424,6 +2458,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
          'info_dict': {
              'title': '29C3: Not my department',
              'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+            'uploader': 'Christiaan008',
+            'uploader_id': 'ChRiStIaAn008',
          },
          'playlist_count': 95,
      }, {
@@ -2432,6 +2468,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
          'info_dict': {
              'title': '[OLD]Team Fortress 2 (Class-based LP)',
              'id': 'PLBB231211A4F62143',
+            'uploader': 'Wickydoo',
+            'uploader_id': 'Wickydoo',
          },
          'playlist_mincount': 26,
      }, {
@@ -2440,6 +2478,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
          'info_dict': {
              'title': 'Uploads from Cauchemar',
              'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
+            'uploader': 'Cauchemar',
+            'uploader_id': 'Cauchemar89',
          },
          'playlist_mincount': 799,
      }, {
@@ -2457,13 +2497,17 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
          'info_dict': {
              'title': 'JODA15',
              'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+            'uploader': 'milan',
+            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
          }
      }, {
          'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
          'playlist_mincount': 485,
          'info_dict': {
-            'title': '2017 華語最新單曲 (2/24更新)',
+            'title': '2018 Chinese New Singles (11/6 updated)',
              'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
+            'uploader': 'LBK',
+            'uploader_id': 'sdragonfang',
          }
      }, {
          'note': 'Embedded SWF player',
@@ -2472,13 +2516,16 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
          'info_dict': {
              'title': 'JODA7',
              'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
-        }
+        },
+        'skip': 'This playlist does not exist',
      }, {
          'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
          'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
          'info_dict': {
              'title': 'Uploads from Interstellar Movie',
              'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
+            'uploader': 'Interstellar Movie',
+            'uploader_id': 'InterstellarMovie1',
          },
          'playlist_mincount': 21,
      }, {
@@ -2503,6 +2550,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
          'params': {
              'skip_download': True,
          },
+        'skip': 'This video is not available.',
          'add_ie': [YoutubeIE.ie_key()],
      }, {
          'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
@@ -2514,7 +2562,6 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
              'uploader_id': 'backuspagemuseum',
              'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
              'upload_date': '20161008',
-            'license': 'Standard YouTube License',
              'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
              'categories': ['Nonprofits & Activism'],
              'tags': list,
@@ -2525,6 +2572,16 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
              'noplaylist': True,
              'skip_download': True,
          },
+    }, {
+        # https://github.com/ytdl-org/youtube-dl/issues/21844
+        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
+        'info_dict': {
+            'title': 'Data Analysis with Dr Mike Pound',
+            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
+            'uploader_id': 'Computerphile',
+            'uploader': 'Computerphile',
+        },
+        'playlist_mincount': 11,
      }, {
          'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
          'only_matching': True,
@@ -2691,6 +2748,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
          'info_dict': {
              'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
              'title': 'Uploads from lex will',
+            'uploader': 'lex will',
+            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
          }
      }, {
          'note': 'Age restricted channel',
@@ -2700,6 +2759,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
          'info_dict': {
              'id': 'UUs0ifCMCm1icqRbqhUINa0w',
              'title': 'Uploads from Deus Ex',
+            'uploader': 'Deus Ex',
+            'uploader_id': 'DeusExOfficial',
          },
      }, {
          'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
@@ -2784,6 +2845,8 @@ class YoutubeUserIE(YoutubeChannelIE):
          'info_dict': {
              'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
              'title': 'Uploads from The Linux Foundation',
+            'uploader': 'The Linux Foundation',
+            'uploader_id': 'TheLinuxFoundation',
          }
      }, {
          # Only available via https://www.youtube.com/c/12minuteathlete/videos
@@ -2793,6 +2856,8 @@ class YoutubeUserIE(YoutubeChannelIE):
          'info_dict': {
              'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
              'title': 'Uploads from 12 Minute Athlete',
+            'uploader': '12 Minute Athlete',
+            'uploader_id': 'the12minuteathlete',
          }
      }, {
          'url': 'ytuser:phihag',
@@ -2886,7 +2951,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
          'playlist_mincount': 4,
          'info_dict': {
              'id': 'ThirstForScience',
-            'title': 'Thirst for Science',
+            'title': 'ThirstForScience',
          },
      }, {
          # with "Load more" button
@@ -2903,6 +2968,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
              'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
              'title': 'Chem Player',
          },
+        'skip': 'Blocked',
      }]