[youtube] Fix extraction with --youtube-skip-dash-manifest enabled (closes #14037)

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index ae832cac5b27bd48577e94ee25f15b9dc9870dcb..5a6b735a09b87b839f01af91db40514170066604 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -673,6 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              },
          },
          # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
+        # YouTube Red ad is not captured for creator
          {
              'url': '__2ABJjxzNo',
              'info_dict': {
@@ -1002,6 +1003,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'Skipping DASH manifest',
              ],
          },
+        {
+            # The following content has been identified by the YouTube community
+            # as inappropriate or offensive to some audiences.
+            'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
+            'info_dict': {
+                'id': '6SJNVb0GnPI',
+                'ext': 'mp4',
+                'title': 'Race Differences in Intelligence',
+                'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
+                'duration': 965,
+                'upload_date': '20140124',
+                'uploader': 'New Century Foundation',
+                'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
+                'license': 'Standard YouTube License',
+                'view_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
          {
              # itag 212
              'url': '1t24XAntNCY',
@@ -1269,37 +1291,57 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      sub_lang_list[sub_lang] = sub_formats
                  return sub_lang_list
  
+            def make_captions(sub_url, sub_langs):
+                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
+                caption_qs = compat_parse_qs(parsed_sub_url.query)
+                captions = {}
+                for sub_lang in sub_langs:
+                    sub_formats = []
+                    for ext in self._SUBTITLE_FORMATS:
+                        caption_qs.update({
+                            'tlang': [sub_lang],
+                            'fmt': [ext],
+                        })
+                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
+                            query=compat_urllib_parse_urlencode(caption_qs, True)))
+                        sub_formats.append({
+                            'url': sub_url,
+                            'ext': ext,
+                        })
+                    captions[sub_lang] = sub_formats
+                return captions
+
+            # New captions format as of 22.06.2017
+            player_response = args.get('player_response')
+            if player_response and isinstance(player_response, compat_str):
+                player_response = self._parse_json(
+                    player_response, video_id, fatal=False)
+                if player_response:
+                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+                    base_url = renderer['captionTracks'][0]['baseUrl']
+                    sub_lang_list = []
+                    for lang in renderer['translationLanguages']:
+                        lang_code = lang.get('languageCode')
+                        if lang_code:
+                            sub_lang_list.append(lang_code)
+                    return make_captions(base_url, sub_lang_list)
+
              # Some videos don't provide ttsurl but rather caption_tracks and
              # caption_translation_languages (e.g. 20LmZk1hakA)
+            # Not used anymore as of 22.06.2017
              caption_tracks = args['caption_tracks']
              caption_translation_languages = args['caption_translation_languages']
              caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
-            parsed_caption_url = compat_urllib_parse_urlparse(caption_url)
-            caption_qs = compat_parse_qs(parsed_caption_url.query)
-
-            sub_lang_list = {}
+            sub_lang_list = []
              for lang in caption_translation_languages.split(','):
                  lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                  sub_lang = lang_qs.get('lc', [None])[0]
-                if not sub_lang:
-                    continue
-                sub_formats = []
-                for ext in self._SUBTITLE_FORMATS:
-                    caption_qs.update({
-                        'tlang': [sub_lang],
-                        'fmt': [ext],
-                    })
-                    sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace(
-                        query=compat_urllib_parse_urlencode(caption_qs, True)))
-                    sub_formats.append({
-                        'url': sub_url,
-                        'ext': ext,
-                    })
-                sub_lang_list[sub_lang] = sub_formats
-            return sub_lang_list
+                if sub_lang:
+                    sub_lang_list.append(sub_lang)
+            return make_captions(caption_url, sub_lang_list)
          # An extractor error can be raise by the download process if there are
          # no automatic captions but there are subtitles
-        except (KeyError, ExtractorError):
+        except (KeyError, IndexError, ExtractorError):
              self._downloader.report_warning(err_msg)
              return {}
  
@@ -1353,10 +1395,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              start_time = parse_duration(time_point)
              if start_time is None:
                  continue
+            if start_time > duration:
+                break
              end_time = (duration if next_num == len(chapter_lines)
                          else parse_duration(chapter_lines[next_num][1]))
              if end_time is None:
                  continue
+            if end_time > duration:
+                end_time = duration
+            if start_time > end_time:
+                break
              chapter_title = re.sub(
                  r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
              chapter_title = re.sub(r'\s+', ' ', chapter_title)
@@ -1410,9 +1458,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              if dash_mpd and dash_mpd[0] not in dash_mpds:
                  dash_mpds.append(dash_mpd[0])
  
+        is_live = None
+        view_count = None
+
+        def extract_view_count(v_info):
+            return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
+
          # Get video info
          embed_webpage = None
-        is_live = None
          if re.search(r'player-age-gate-content">', video_webpage) is not None:
              age_gate = True
              # We simulate the access to the video from www.youtube.com/v/{video_id}
@@ -1435,7 +1488,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          else:
              age_gate = False
              video_info = None
-            sts = ''
+            sts = None
              # Try looking directly into the video webpage
              ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
              if ytplayer_config:
@@ -1452,7 +1505,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                          args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
                  if args.get('livestream') == '1' or args.get('live_playback') == 1:
                      is_live = True
-                sts = ytplayer_config.get('sts', '')
+                sts = ytplayer_config.get('sts')
              if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
                  # We also try looking in get_video_info since it may contain different dashmpd
                  # URL that points to a DASH manifest with possibly different itag set (some itags
@@ -1461,17 +1514,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  # The general idea is to take a union of itags of both DASH manifests (for example
                  # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
                  self.report_video_info_webpage_download(video_id)
-                for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
-                    video_info_url = (
-                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en&sts=%s'
-                        % (proto, video_id, el_type, sts))
+                for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
+                    query = {
+                        'video_id': video_id,
+                        'ps': 'default',
+                        'eurl': '',
+                        'gl': 'US',
+                        'hl': 'en',
+                    }
+                    if el:
+                        query['el'] = el
+                    if sts:
+                        query['sts'] = sts
                      video_info_webpage = self._download_webpage(
-                        video_info_url,
+                        '%s://www.youtube.com/get_video_info' % proto,
                          video_id, note=False,
-                        errnote='unable to download video info webpage')
+                        errnote='unable to download video info webpage',
+                        fatal=False, query=query)
+                    if not video_info_webpage:
+                        continue
                      get_video_info = compat_parse_qs(video_info_webpage)
-                    if get_video_info.get('use_cipher_signature') != ['True']:
-                        add_dash_mpd(get_video_info)
+                    add_dash_mpd(get_video_info)
+                    if view_count is None:
+                        view_count = extract_view_count(get_video_info)
                      if not video_info:
                          video_info = get_video_info
                      if 'token' in get_video_info:
@@ -1555,10 +1620,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  return self.playlist_result(entries, video_id, video_title, video_description)
              self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
  
-        if 'view_count' in video_info:
-            view_count = int(video_info['view_count'][0])
-        else:
-            view_count = None
+        if view_count is None:
+            view_count = extract_view_count(video_info)
  
          # Check for "rental" videos
          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
@@ -1613,7 +1676,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              video_webpage, 'license', default=None)
  
          m_music = re.search(
-            r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li',
+            r'''(?x)
+                <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
+                <ul[^>]*>\s*
+                <li>(?P<title>.+?)
+                by (?P<creator>.+?)
+                (?:
+                    \(.+?\)|
+                    <a[^>]*
+                        (?:
+                            \bhref=["\']/red[^>]*>|             # drop possible
+                            >\s*Listen ad-free with YouTube Red # YouTube Red ad
+                        )
+                    .*?
+                )?</li
+            ''',
              video_webpage)
          if m_music:
              video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
@@ -1705,12 +1782,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  format_id = url_data['itag'][0]
                  url = url_data['url'][0]
  
-                if 'sig' in url_data:
-                    url += '&signature=' + url_data['sig'][0]
-                elif 's' in url_data:
-                    encrypted_sig = url_data['s'][0]
+                if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
                      ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
-
                      jsplayer_url_json = self._search_regex(
                          ASSETS_RE,
                          embed_webpage if age_gate else video_webpage,
@@ -1731,6 +1804,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                              video_webpage, 'age gate player URL')
                          player_url = json.loads(player_url_json)
  
+                if 'sig' in url_data:
+                    url += '&signature=' + url_data['sig'][0]
+                elif 's' in url_data:
+                    encrypted_sig = url_data['s'][0]
+
                      if self._downloader.params.get('verbose'):
                          if player_url is None:
                              player_version = 'unknown'