[youtube] Extend html5 player regex (closes #17516)

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 2fe074cb4339bf9fd560af1a9eeab295ac3d4a30..906774875d2271e547f9dc3f4238037feeb5aab8 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -41,12 +41,14 @@ from ..utils import (
      remove_quotes,
      remove_start,
      smuggle_url,
+    str_or_none,
      str_to_int,
      try_get,
      unescapeHTML,
      unified_strdate,
      unsmuggle_url,
      uppercase_escape,
+    url_or_none,
      urlencode_postdata,
  )
  
@@ -259,7 +261,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
          return True
  
      def _download_webpage_handle(self, *args, **kwargs):
-        kwargs.setdefault('query', {})['disable_polymer'] = 'true'
+        query = kwargs.get('query', {}).copy()
+        query['disable_polymer'] = 'true'
+        kwargs['query'] = query
          return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
              *args, **compat_kwargs(kwargs))
  
@@ -347,6 +351,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                              (?:www\.)?hooktube\.com/|
                              (?:www\.)?yourepeat\.com/|
                              tube\.majestyc\.net/|
+                            (?:www\.)?invidio\.us/|
                              youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                           (?:                                                  # the various things that can precede the ID:
@@ -498,6 +503,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'categories': ['Science & Technology'],
                  'tags': ['youtube-dl'],
                  'duration': 10,
+                'view_count': int,
                  'like_count': int,
                  'dislike_count': int,
                  'start_time': 1,
@@ -580,6 +586,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'categories': ['Science & Technology'],
                  'tags': ['youtube-dl'],
                  'duration': 10,
+                'view_count': int,
                  'like_count': int,
                  'dislike_count': int,
              },
@@ -1066,6 +1073,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
              'only_matching': True,
          },
+        {
+            'url': 'https://invidio.us/watch?v=BaW_jenozKc',
+            'only_matching': True,
+        },
      ]
  
      def __init__(self, *args, **kwargs):
@@ -1182,7 +1193,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
               r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
               r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
-             r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\('),
+             r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
              jscode, 'Initial JS player signature function name', group='sig')
  
          jsi = JSInterpreter(jscode)
@@ -1375,8 +1387,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              self._downloader.report_warning(err_msg)
              return {}
  
-    def _mark_watched(self, video_id, video_info):
-        playback_url = video_info.get('videostats_playback_base_url', [None])[0]
+    def _mark_watched(self, video_id, video_info, player_response):
+        playback_url = url_or_none(try_get(
+            player_response,
+            lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
+            video_info, lambda x: x['videostats_playback_base_url'][0]))
          if not playback_url:
              return
          parsed_playback_url = compat_urlparse.urlparse(playback_url)
@@ -1531,6 +1546,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          def extract_view_count(v_info):
              return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
  
+        player_response = {}
+
          # Get video info
          embed_webpage = None
          if re.search(r'player-age-gate-content">', video_webpage) is not None:
@@ -1573,6 +1590,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  if args.get('livestream') == '1' or args.get('live_playback') == 1:
                      is_live = True
                  sts = ytplayer_config.get('sts')
+                if not player_response:
+                    pl_response = str_or_none(args.get('player_response'))
+                    if pl_response:
+                        pl_response = self._parse_json(pl_response, video_id, fatal=False)
+                        if isinstance(pl_response, dict):
+                            player_response = pl_response
              if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
                  # We also try looking in get_video_info since it may contain different dashmpd
                  # URL that points to a DASH manifest with possibly different itag set (some itags
@@ -1601,6 +1624,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      if not video_info_webpage:
                          continue
                      get_video_info = compat_parse_qs(video_info_webpage)
+                    pl_response = str_or_none(get_video_info.get('player_response', [None])[0])
+                    if pl_response and not player_response:
+                        pl_response = self._parse_json(pl_response, video_id, fatal=False)  # query-string value is JSON text, not a dict
+                        player_response = pl_response if isinstance(pl_response, dict) else {}
                      add_dash_mpd(get_video_info)
                      if view_count is None:
                          view_count = extract_view_count(get_video_info)
@@ -1646,9 +1673,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      '"token" parameter not in video info for unknown reason',
                      video_id=video_id)
  
+        video_details = try_get(
+            player_response, lambda x: x['videoDetails'], dict) or {}
+
          # title
          if 'title' in video_info:
              video_title = video_info['title'][0]
+        elif 'title' in video_details:  # title lives under player_response['videoDetails']
+            video_title = video_details['title']
          else:
              self._downloader.report_warning('Unable to extract video title')
              video_title = '_'
@@ -1684,33 +1716,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              else:
                  video_description = ''
  
-        if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
+        if not smuggled_data.get('force_singlefeed', False):
              if not self._downloader.params.get('noplaylist'):
-                entries = []
-                feed_ids = []
-                multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
-                for feed in multifeed_metadata_list.split(','):
-                    # Unquote should take place before split on comma (,) since textual
-                    # fields may contain comma as well (see
-                    # https://github.com/rg3/youtube-dl/issues/8536)
-                    feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
-                    entries.append({
-                        '_type': 'url_transparent',
-                        'ie_key': 'Youtube',
-                        'url': smuggle_url(
-                            '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
-                            {'force_singlefeed': True}),
-                        'title': '%s (%s)' % (video_title, feed_data['title'][0]),
-                    })
-                    feed_ids.append(feed_data['id'][0])
-                self.to_screen(
-                    'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
-                    % (', '.join(feed_ids), video_id))
-                return self.playlist_result(entries, video_id, video_title, video_description)
-            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+                multifeed_metadata_list = try_get(
+                    player_response,
+                    lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
+                    compat_str) or try_get(
+                    video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
+                if multifeed_metadata_list:
+                    entries = []
+                    feed_ids = []
+                    for feed in multifeed_metadata_list.split(','):
+                        # Split on comma (,) must take place before unquoting since textual
+                        # fields may contain comma as well (see
+                        # https://github.com/rg3/youtube-dl/issues/8536)
+                        feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
+                        entries.append({
+                            '_type': 'url_transparent',
+                            'ie_key': 'Youtube',
+                            'url': smuggle_url(
+                                '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
+                                {'force_singlefeed': True}),
+                            'title': '%s (%s)' % (video_title, feed_data['title'][0]),
+                        })
+                        feed_ids.append(feed_data['id'][0])
+                    self.to_screen(
+                        'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
+                        % (', '.join(feed_ids), video_id))
+                    return self.playlist_result(entries, video_id, video_title, video_description)
+            else:
+                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
  
          if view_count is None:
              view_count = extract_view_count(video_info)
+        if view_count is None and video_details:
+            view_count = int_or_none(video_details.get('viewCount'))
  
          # Check for "rental" videos
          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
@@ -1794,7 +1834,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                              else:
                                  player_version = self._search_regex(
                                      [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
-                                     r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
+                                     r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
                                      player_url,
                                      'html5 player', fatal=False)
                                  player_desc = 'html5 player %s' % player_version
@@ -1891,7 +1931,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
  
          # uploader
-        video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
+        video_uploader = try_get(
+            video_info, lambda x: x['author'][0],
+            compat_str) or str_or_none(video_details.get('author'))
          if video_uploader:
              video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
          else:
@@ -2004,12 +2046,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          like_count = _extract_count('like')
          dislike_count = _extract_count('dislike')
  
+        if view_count is None:
+            view_count = str_to_int(self._search_regex(
+                r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
+                'view count', default=None))
+
          # subtitles
          video_subtitles = self.extract_subtitles(video_id, video_webpage)
          automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
  
          video_duration = try_get(
              video_info, lambda x: int_or_none(x['length_seconds'][0]))
+        if not video_duration:
+            video_duration = int_or_none(video_details.get('lengthSeconds'))
          if not video_duration:
              video_duration = parse_duration(self._html_search_meta(
                  'duration', video_webpage, 'video duration'))
@@ -2077,7 +2126,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
  
          self._sort_formats(formats)
  
-        self.mark_watched(video_id, video_info)
+        self.mark_watched(video_id, video_info, player_response)
  
          return {
              'id': video_id,
@@ -2124,7 +2173,11 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
                          (?:https?://)?
                          (?:\w+\.)?
                          (?:
-                            youtube\.com/
+                            (?:
+                                youtube\.com|
+                                invidio\.us
+                            )
+                            /
                              (?:
                                 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                                 \? (?:.*?[&;])*? (?:p|a|list)=
@@ -2237,6 +2290,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
              'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
              'categories': ['People & Blogs'],
              'tags': list,
+            'view_count': int,
              'like_count': int,
              'dislike_count': int,
          },
@@ -2275,6 +2329,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
          # music album playlist
          'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
          'only_matching': True,
+    }, {
+        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
+        'only_matching': True,
      }]
  
      def _real_initialize(self):
@@ -2417,7 +2474,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
  
  class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
      IE_DESC = 'YouTube.com channels'
-    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
+    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
      _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
      _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
      IE_NAME = 'youtube:channel'
@@ -2438,6 +2495,9 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
              'id': 'UUs0ifCMCm1icqRbqhUINa0w',
              'title': 'Uploads from Deus Ex',
          },
+    }, {
+        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
+        'only_matching': True,
      }]
  
      @classmethod