[youtube] Improve chapters extraction (closes #13247)

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index c2e06c3a6a34bf33377f0af599c92b96ad223d3b..bf4f4e139b1973eef02b5ea4867b269895a0832b 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -37,7 +37,7 @@ from ..utils import (
      parse_codecs,
      parse_duration,
      remove_quotes,
-    # remove_start,
+    remove_start,
      smuggle_url,
      str_to_int,
      try_get,
@@ -55,13 +55,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
      _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
  
      _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
-    _LOOKUP_REQ_TEMPLATE = '["{0}",null,[],null,"US",null,null,2,false,true,[null,null,[2,1,null,1,"https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn",null,[],4],1,[null,null,[]],null,null,null,true],"{0}"]'
-
-    _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
-    _PASSWORD_CHALLENGE_REQ_TEMPLATE = '["{0}",null,1,null,[1,null,null,null,["{1}",null,true]],[null,null,[2,1,null,1,"https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn",null,[],4],1,[null,null,[]],null,null,null,true]]'
-
-    _TFA_URL = 'https://accounts.google.com/_/signin/challenge'
-    _TFA_REQ_TEMPLATE = '["{0}",null,2,null,[9,null,null,null,null,null,null,null,[null,"{1}",false,2]]]'
+    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
+    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
  
      _NETRC_MACHINE = 'youtube'
      # If True it will raise an error if no login info is provided
@@ -112,7 +107,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                  'checkedDomains': 'youtube',
                  'hl': 'en',
                  'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
-                'f.req': f_req,
+                'f.req': json.dumps(f_req),
                  'flowName': 'GlifWebSignIn',
                  'flowEntry': 'ServiceLogin',
              })
@@ -125,53 +120,127 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                      'Google-Accounts-XSRF': 1,
                  })
  
+        def warn(message):
+            self._downloader.report_warning(message)
+
+        lookup_req = [
+            username,
+            None, [], None, 'US', None, None, 2, False, True,
+            [
+                None, None,
+                [2, 1, None, 1,
+                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
+                 None, [], 4],
+                1, [None, None, []], None, None, None, True
+            ],
+            username,
+        ]
+
          lookup_results = req(
-            self._LOOKUP_URL, self._LOOKUP_REQ_TEMPLATE.format(username),
+            self._LOOKUP_URL, lookup_req,
              'Looking up account info', 'Unable to look up account info')
  
          if lookup_results is False:
              return False
  
-        user_hash = lookup_results[0][2]
+        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
+        if not user_hash:
+            warn('Unable to extract user hash')
+            return False
+
+        challenge_req = [
+            user_hash,
+            None, 1, None, [1, None, None, None, [password, None, True]],
+            [
+                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
+                1, [None, None, []], None, None, None, True
+            ]]
  
-        password_challenge_results = req(
-            self._PASSWORD_CHALLENGE_URL,
-            self._PASSWORD_CHALLENGE_REQ_TEMPLATE.format(user_hash, password),
-            'Logging in', 'Unable to log in')[0]
+        challenge_results = req(
+            self._CHALLENGE_URL, challenge_req,
+            'Logging in', 'Unable to log in')
  
-        if password_challenge_results is False:
+        if challenge_results is False:
              return
  
-        msg = password_challenge_results[5]
-        if msg is not None and isinstance(msg, list):
-            raise ExtractorError('Unable to login: %s' % msg[5], expected=True)
-
-        password_challenge_results = password_challenge_results[-1]
-
-        # tfa = password_challenge_results[0]
-        # if isinstance(tfa, list) and tfa[0][2] == 'TWO_STEP_VERIFICATION':
-        #     tfa_code = self._get_tfa_info('2-step verification code')
-        #
-        #     if not tfa_code:
-        #         self._downloader.report_warning(
-        #             'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
-        #             '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
-        #         return False
-        #
-        #     tfa_code = remove_start(tfa_code, 'G-')
-        #     print('tfa', tfa_code)
-        #     tfa_results = req(
-        #         self._TFA_URL,
-        #         self._TFA_REQ_TEMPLATE.format(user_hash, tfa_code),
-        #         'Submitting TFA code', 'Unable to submit TFA code')
-        #
-        #     TODO
+        # A non-empty list at [0][5] of the challenge response is treated as
+        # a failed login; its element 5 carries the error code/message.
+        login_res = try_get(challenge_results, lambda x: x[0][5], list)
+        if login_res:
+            login_msg = try_get(login_res, lambda x: x[5], compat_str)
+            # Translate the known error code first and format afterwards:
+            # '%' binds tighter than a conditional expression, so doing both
+            # inline dropped the 'Unable to login' prefix for any other message.
+            if login_msg == 'INCORRECT_ANSWER_ENTERED':
+                login_msg = 'Invalid password'
+            warn('Unable to login: %s' % login_msg)
+            return False
+
+        res = try_get(challenge_results, lambda x: x[0][-1], list)
+        if not res:
+            warn('Unable to extract result entry')
+            return False
+
+        # res[0][0], when it is a list, describes an additional challenge that
+        # has to be passed before the CheckCookie URL becomes available.
+        tfa = try_get(res, lambda x: x[0][0], list)
+        if tfa:
+            tfa_str = try_get(tfa, lambda x: x[2], compat_str)
+            if tfa_str == 'TWO_STEP_VERIFICATION':
+                # SEND_SUCCESS - TFA code has been successfully sent to phone
+                # QUOTA_EXCEEDED - reached the limit of TFA codes
+                status = try_get(tfa, lambda x: x[5], compat_str)
+                if status == 'QUOTA_EXCEEDED':
+                    warn('Exceeded the limit of TFA codes, try later')
+                    return False
+
+                # TL is substituted into _TFA_URL for the TFA submission.
+                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
+                if not tl:
+                    warn('Unable to extract TL')
+                    return False
+
+                tfa_code = self._get_tfa_info('2-step verification code')
+
+                if not tfa_code:
+                    warn(
+                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
+                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+                    return False
+
+                # Drop the 'G-' prefix if the user typed the code with it.
+                tfa_code = remove_start(tfa_code, 'G-')
+
+                tfa_req = [
+                    user_hash, None, 2, None,
+                    [
+                        9, None, None, None, None, None, None, None,
+                        [None, tfa_code, True, 2]
+                    ]]
+
+                tfa_results = req(
+                    self._TFA_URL.format(tl), tfa_req,
+                    'Submitting TFA code', 'Unable to submit TFA code')
+
+                if tfa_results is False:
+                    return False
+
+                # Same failure layout as the password challenge response.
+                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
+                if tfa_res:
+                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
+                    # Translate the known code before formatting so that the
+                    # prefix is kept for every message ('%' binds tighter
+                    # than a conditional expression).
+                    if tfa_msg == 'INCORRECT_ANSWER_ENTERED':
+                        tfa_msg = 'Invalid TFA code'
+                    warn('Unable to finish TFA: %s' % tfa_msg)
+                    return False
+
+                check_cookie_url = try_get(
+                    tfa_results, lambda x: x[0][-1][2], compat_str)
+            else:
+                # Any other challenge type cannot be answered here. Bail out
+                # explicitly; otherwise check_cookie_url would be unbound
+                # when it is tested right after this block.
+                warn('Unsupported login challenge: %s' % tfa_str)
+                return False
+        else:
+            check_cookie_url = try_get(res, lambda x: x[2], compat_str)
+
+        if not check_cookie_url:
+            warn('Unable to extract CheckCookie URL')
+            return False
  
          check_cookie_results = self._download_webpage(
-            password_challenge_results[2], None, 'Checking cookie')
+            check_cookie_url, None, 'Checking cookie', fatal=False)
+
+        if check_cookie_results is False:
+            return False
  
-        if '>Sign out<' not in check_cookie_results:
-            self._downloader.report_warning('Unable to log in')
+        if 'https://myaccount.google.com/' not in check_cookie_results:
+            warn('Unable to log in')
              return False
  
          return True
@@ -1284,10 +1353,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              start_time = parse_duration(time_point)
              if start_time is None:
                  continue
+            if start_time > duration:
+                break
              end_time = (duration if next_num == len(chapter_lines)
                          else parse_duration(chapter_lines[next_num][1]))
              if end_time is None:
                  continue
+            if end_time > duration:
+                end_time = duration
+            if start_time > end_time:
+                break
              chapter_title = re.sub(
                  r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
              chapter_title = re.sub(r'\s+', ' ', chapter_title)
@@ -1366,6 +1441,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          else:
              age_gate = False
              video_info = None
+            sts = None
              # Try looking directly into the video webpage
              ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
              if ytplayer_config:
@@ -1382,6 +1458,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                          args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
                  if args.get('livestream') == '1' or args.get('live_playback') == 1:
                      is_live = True
+                sts = ytplayer_config.get('sts')
              if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
                  # We also try looking in get_video_info since it may contain different dashmpd
                  # URL that points to a DASH manifest with possibly different itag set (some itags
@@ -1390,17 +1467,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  # The general idea is to take a union of itags of both DASH manifests (for example
                  # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
                  self.report_video_info_webpage_download(video_id)
-                for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
-                    video_info_url = (
-                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
-                        % (proto, video_id, el_type))
+                for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
+                    query = {
+                        'video_id': video_id,
+                        'ps': 'default',
+                        'eurl': '',
+                        'gl': 'US',
+                        'hl': 'en',
+                    }
+                    if el:
+                        query['el'] = el
+                    if sts:
+                        query['sts'] = sts
                      video_info_webpage = self._download_webpage(
-                        video_info_url,
+                        '%s://www.youtube.com/get_video_info' % proto,
                          video_id, note=False,
-                        errnote='unable to download video info webpage')
+                        errnote='unable to download video info webpage',
+                        fatal=False, query=query)
+                    if not video_info_webpage:
+                        continue
                      get_video_info = compat_parse_qs(video_info_webpage)
-                    if get_video_info.get('use_cipher_signature') != ['True']:
-                        add_dash_mpd(get_video_info)
+                    add_dash_mpd(get_video_info)
                      if not video_info:
                          video_info = get_video_info
                      if 'token' in get_video_info:
@@ -1634,12 +1721,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  format_id = url_data['itag'][0]
                  url = url_data['url'][0]
  
-                if 'sig' in url_data:
-                    url += '&signature=' + url_data['sig'][0]
-                elif 's' in url_data:
-                    encrypted_sig = url_data['s'][0]
+                if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
                      ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
-
                      jsplayer_url_json = self._search_regex(
                          ASSETS_RE,
                          embed_webpage if age_gate else video_webpage,
@@ -1660,6 +1743,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                              video_webpage, 'age gate player URL')
                          player_url = json.loads(player_url_json)
  
+                if 'sig' in url_data:
+                    url += '&signature=' + url_data['sig'][0]
+                elif 's' in url_data:
+                    encrypted_sig = url_data['s'][0]
+
                      if self._downloader.params.get('verbose'):
                          if player_url is None:
                              player_version = 'unknown'