[youtube:feed] Check each 'load more' portion for unique video ids

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index dc9e15e98d655e9e53c294035f467569cd307283..1f9940cf5c1e4c8698a8a0bed9a874d5a79f18b0 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -28,7 +28,6 @@ from ..utils import (
      get_element_by_attribute,
      get_element_by_id,
      int_or_none,
      get_element_by_attribute,
      get_element_by_id,
      int_or_none,
-    OnDemandPagedList,
      orderedSet,
      unescapeHTML,
      unified_strdate,
      orderedSet,
      unescapeHTML,
      unified_strdate,
@@ -50,6 +49,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
              # YouTube sets the expire time to about two months
              expire_time=time.time() + 2 * 30 * 24 * 3600)
  
              # YouTube sets the expire time to about two months
              expire_time=time.time() + 2 * 30 * 24 * 3600)
  
+    def _ids_to_results(self, ids):
+        return [
+            self.url_result(vid_id, 'Youtube', video_id=vid_id)
+            for vid_id in ids]
+
      def _login(self):
          """
          Attempt to log in to YouTube.
      def _login(self):
          """
          Attempt to log in to YouTube.
@@ -1262,11 +1266,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
      def _real_initialize(self):
          self._login()
  
      def _real_initialize(self):
          self._login()
  
-    def _ids_to_results(self, ids):
-        return [
-            self.url_result(vid_id, 'Youtube', video_id=vid_id)
-            for vid_id in ids]
-
      def _extract_mix(self, playlist_id):
          # The mixes are generated from a single video
          # the id of the playlist is just 'RD' + video_id
      def _extract_mix(self, playlist_id):
          # The mixes are generated from a single video
          # the id of the playlist is just 'RD' + video_id
@@ -1292,12 +1291,22 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
          page = self._download_webpage(url, playlist_id)
          more_widget_html = content_html = page
  
          page = self._download_webpage(url, playlist_id)
          more_widget_html = content_html = page
  
-        # Check if the playlist exists or is private
-        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
-            raise ExtractorError(
-                'The playlist doesn\'t exist or is private, use --username or '
-                '--netrc to access it.',
-                expected=True)
+        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
+            match = match.strip()
+            # Check if the playlist exists or is private
+            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
+                raise ExtractorError(
+                    'The playlist doesn\'t exist or is private, use --username or '
+                    '--netrc to access it.',
+                    expected=True)
+            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
+                raise ExtractorError(
+                    'Invalid parameters. Maybe URL is incorrect.',
+                    expected=True)
+            elif re.match(r'[^<]*Choose your language[^<]*', match):
+                continue
+            else:
+                self.report_warning('Youtube gives an alert message: ' + match)
  
          # Extract the video ids from the playlist pages
          ids = []
  
          # Extract the video ids from the playlist pages
          ids = []
@@ -1358,6 +1367,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
  class YoutubeChannelIE(InfoExtractor):
      IE_DESC = 'YouTube.com channels'
      _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
  class YoutubeChannelIE(InfoExtractor):
      IE_DESC = 'YouTube.com channels'
      _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
+    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
      IE_NAME = 'youtube:channel'
      _TESTS = [{
          'note': 'paginated channel',
      IE_NAME = 'youtube:channel'
      _TESTS = [{
          'note': 'paginated channel',
@@ -1368,7 +1378,8 @@ class YoutubeChannelIE(InfoExtractor):
          }
      }]
  
          }
      }]
  
-    def extract_videos_from_page(self, page):
+    @staticmethod
+    def extract_videos_from_page(page):
          ids_in_page = []
          titles_in_page = []
          for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
          ids_in_page = []
          titles_in_page = []
          for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
@@ -1386,8 +1397,8 @@ class YoutubeChannelIE(InfoExtractor):
      def _real_extract(self, url):
          channel_id = self._match_id(url)
  
      def _real_extract(self, url):
          channel_id = self._match_id(url)
  
-        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
-        channel_page = self._download_webpage(url, channel_id)
+        url = self._TEMPLATE_URL % channel_id
+        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
          autogenerated = re.search(r'''(?x)
                  class="[^"]*?(?:
                      channel-header-autogenerated-label|
          autogenerated = re.search(r'''(?x)
                  class="[^"]*?(?:
                      channel-header-autogenerated-label|
@@ -1397,20 +1408,18 @@ class YoutubeChannelIE(InfoExtractor):
          if autogenerated:
              # The videos are contained in a single page
              # the ajax pages can't be used, they are empty
          if autogenerated:
              # The videos are contained in a single page
              # the ajax pages can't be used, they are empty
-            videos = self.extract_videos_from_page(channel_page)
              entries = [
                  self.url_result(
                      video_id, 'Youtube', video_id=video_id,
                      video_title=video_title)
              entries = [
                  self.url_result(
                      video_id, 'Youtube', video_id=video_id,
                      video_title=video_title)
-                for video_id, video_title in videos]
+                for video_id, video_title in self.extract_videos_from_page(channel_page)]
              return self.playlist_result(entries, channel_id)
  
          def _entries():
              more_widget_html = content_html = channel_page
              for pagenum in itertools.count(1):
  
              return self.playlist_result(entries, channel_id)
  
          def _entries():
              more_widget_html = content_html = channel_page
              for pagenum in itertools.count(1):
  
-                ids_in_page = self.extract_videos_from_page(content_html)
-                for video_id, video_title in ids_in_page:
+                for video_id, video_title in self.extract_videos_from_page(content_html):
                      yield self.url_result(
                          video_id, 'Youtube', video_id=video_id,
                          video_title=video_title)
                      yield self.url_result(
                          video_id, 'Youtube', video_id=video_id,
                          video_title=video_title)
@@ -1431,12 +1440,10 @@ class YoutubeChannelIE(InfoExtractor):
          return self.playlist_result(_entries(), channel_id)
  
  
          return self.playlist_result(_entries(), channel_id)
  
  
-class YoutubeUserIE(InfoExtractor):
+class YoutubeUserIE(YoutubeChannelIE):
      IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
      _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
      IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
      _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
-    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
-    _GDATA_PAGE_SIZE = 50
-    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
+    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
      IE_NAME = 'youtube:user'
  
      _TESTS = [{
      IE_NAME = 'youtube:user'
  
      _TESTS = [{
@@ -1460,95 +1467,57 @@ class YoutubeUserIE(InfoExtractor):
          else:
              return super(YoutubeUserIE, cls).suitable(url)
  
          else:
              return super(YoutubeUserIE, cls).suitable(url)
  
-    def _real_extract(self, url):
-        username = self._match_id(url)
-
-        # Download video ids using YouTube Data API. Result size per
-        # query is limited (currently to 50 videos) so we need to query
-        # page by page until there are no video ids - it means we got
-        # all of them.
-
-        def download_page(pagenum):
-            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
-
-            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
-            page = self._download_webpage(
-                gdata_url, username,
-                'Downloading video ids from %d to %d' % (
-                    start_index, start_index + self._GDATA_PAGE_SIZE))
  
  
-            try:
-                response = json.loads(page)
-            except ValueError as err:
-                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
-            if 'entry' not in response['feed']:
-                return
-
-            # Extract video identifiers
-            entries = response['feed']['entry']
-            for entry in entries:
-                title = entry['title']['$t']
-                video_id = entry['id']['$t'].split('/')[-1]
-                yield {
-                    '_type': 'url',
-                    'url': video_id,
-                    'ie_key': 'Youtube',
-                    'id': video_id,
-                    'title': title,
-                }
-        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)
-
-        return self.playlist_result(url_results, playlist_title=username)
-
-
-class YoutubeSearchIE(SearchInfoExtractor):
+class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
      IE_DESC = 'YouTube.com searches'
      IE_DESC = 'YouTube.com searches'
-    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
-    _MAX_RESULTS = 1000
+    # there doesn't appear to be a real limit, for example if you search for
+    # 'python' you get more than 8.000.000 results
+    _MAX_RESULTS = float('inf')
      IE_NAME = 'youtube:search'
      _SEARCH_KEY = 'ytsearch'
      IE_NAME = 'youtube:search'
      _SEARCH_KEY = 'ytsearch'
+    _EXTRA_QUERY_ARGS = {}
+    _TESTS = []
  
      def _get_n_results(self, query, n):
          """Get a specified number of results for a query"""
  
  
      def _get_n_results(self, query, n):
          """Get a specified number of results for a query"""
  
-        video_ids = []
-        pagenum = 0
+        videos = []
          limit = n
          limit = n
-        PAGE_SIZE = 50
  
  
-        while (PAGE_SIZE * pagenum) < limit:
-            result_url = self._API_URL % (
-                compat_urllib_parse.quote_plus(query.encode('utf-8')),
-                (PAGE_SIZE * pagenum) + 1)
-            data_json = self._download_webpage(
+        for pagenum in itertools.count(1):
+            url_query = {
+                'search_query': query,
+                'page': pagenum,
+                'spf': 'navigate',
+            }
+            url_query.update(self._EXTRA_QUERY_ARGS)
+            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
+            data = self._download_json(
                  result_url, video_id='query "%s"' % query,
                  result_url, video_id='query "%s"' % query,
-                note='Downloading page %s' % (pagenum + 1),
+                note='Downloading page %s' % pagenum,
                  errnote='Unable to download API page')
                  errnote='Unable to download API page')
-            data = json.loads(data_json)
-            api_response = data['data']
+            html_content = data[1]['body']['content']
  
  
-            if 'items' not in api_response:
+            if 'class="search-message' in html_content:
                  raise ExtractorError(
                      '[youtube] No video results', expected=True)
  
                  raise ExtractorError(
                      '[youtube] No video results', expected=True)
  
-            new_ids = list(video['id'] for video in api_response['items'])
-            video_ids += new_ids
-
-            limit = min(n, api_response['totalItems'])
-            pagenum += 1
+            new_videos = self._ids_to_results(orderedSet(re.findall(
+                r'href="/watch\?v=(.{11})', html_content)))
+            videos += new_videos
+            if not new_videos or len(videos) > limit:
+                break
  
  
-        if len(video_ids) > n:
-            video_ids = video_ids[:n]
-        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
-                  for video_id in video_ids]
+        if len(videos) > n:
+            videos = videos[:n]
          return self.playlist_result(videos, query)
  
  
  class YoutubeSearchDateIE(YoutubeSearchIE):
      IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
          return self.playlist_result(videos, query)
  
  
  class YoutubeSearchDateIE(YoutubeSearchIE):
      IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
-    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
      _SEARCH_KEY = 'ytsearchdate'
      IE_DESC = 'YouTube.com searches, newest videos first'
      _SEARCH_KEY = 'ytsearchdate'
      IE_DESC = 'YouTube.com searches, newest videos first'
+    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
  
  
  class YoutubeSearchURLIE(InfoExtractor):
  
  
  class YoutubeSearchURLIE(InfoExtractor):
@@ -1632,20 +1601,10 @@ class YoutubeShowIE(InfoExtractor):
  
  class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
      """
  
  class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
      """
-    Base class for extractors that fetch info from
-    http://www.youtube.com/feed_ajax
+    Base class for feed extractors
      Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
      """
      _LOGIN_REQUIRED = True
      Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
      """
      _LOGIN_REQUIRED = True
-    # use action_load_personal_feed instead of action_load_system_feed
-    _PERSONAL_FEED = False
-
-    @property
-    def _FEED_TEMPLATE(self):
-        action = 'action_load_system_feed'
-        if self._PERSONAL_FEED:
-            action = 'action_load_personal_feed'
-        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
  
      @property
      def IE_NAME(self):
  
      @property
      def IE_NAME(self):
@@ -1655,36 +1614,38 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
          self._login()
  
      def _real_extract(self, url):
          self._login()
  
      def _real_extract(self, url):
-        feed_entries = []
-        paging = 0
-        for i in itertools.count(1):
-            info = self._download_json(
-                self._FEED_TEMPLATE % paging,
-                '%s feed' % self._FEED_NAME,
-                'Downloading page %s' % i,
-                transform_source=uppercase_escape)
-            feed_html = info.get('feed_html') or info.get('content_html')
-            load_more_widget_html = info.get('load_more_widget_html') or feed_html
-            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
-            ids = orderedSet(m.group(1) for m in m_ids)
-            feed_entries.extend(
-                self.url_result(video_id, 'Youtube', video_id=video_id)
-                for video_id in ids)
-            mobj = re.search(
-                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
-                load_more_widget_html)
-            if mobj is None:
+        page = self._download_webpage(
+            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
+
+        # The extraction process is the same as for playlists, but the regex
+        # for the video ids doesn't contain an index
+        ids = []
+        more_widget_html = content_html = page
+        for page_num in itertools.count(1):
+            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
+
+            # 'recommended' feed has infinite 'load more' and each new portion spins
+            # the same videos in (sometimes) slightly different order, so we'll check
+            # for unicity and break when portion has no new videos
+            new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+            if not new_ids:
                  break
                  break
-            paging = mobj.group('paging')
-        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
  
  
+            ids.extend(new_ids)
  
  
-class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
-    IE_NAME = 'youtube:recommended'
-    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
-    _FEED_NAME = 'recommended'
-    _PLAYLIST_TITLE = 'Youtube Recommended videos'
+            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            if not mobj:
+                break
+
+            more = self._download_json(
+                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+                'Downloading page #%s' % page_num,
+                transform_source=uppercase_escape)
+            content_html = more['content_html']
+            more_widget_html = more['load_more_widget_html']
+
+        return self.playlist_result(
+            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
  
  
  class YoutubeWatchLaterIE(YoutubePlaylistIE):
  
  
  class YoutubeWatchLaterIE(YoutubePlaylistIE):
@@ -1698,15 +1659,6 @@ class YoutubeWatchLaterIE(YoutubePlaylistIE):
          return self._extract_playlist('WL')
  
  
          return self._extract_playlist('WL')
  
  
-class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
-    IE_NAME = 'youtube:history'
-    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
-    _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
-    _FEED_NAME = 'history'
-    _PERSONAL_FEED = True
-    _PLAYLIST_TITLE = 'Youtube Watch History'
-
-
  class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
      IE_NAME = 'youtube:favorites'
      IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
  class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
      IE_NAME = 'youtube:favorites'
      IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
@@ -1719,42 +1671,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
          return self.url_result(playlist_id, 'YoutubePlaylist')
  
  
          return self.url_result(playlist_id, 'YoutubePlaylist')
  
  
-class YoutubeSubscriptionsIE(YoutubePlaylistIE):
-    IE_NAME = 'youtube:subscriptions'
-    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
-    _TESTS = []
-
-    def _real_extract(self, url):
-        title = 'Youtube Subscriptions'
-        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
-
-        # The extraction process is the same as for playlists, but the regex
-        # for the video ids doesn't contain an index
-        ids = []
-        more_widget_html = content_html = page
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+    _FEED_NAME = 'recommended'
+    _PLAYLIST_TITLE = 'Youtube Recommended videos'
  
  
-        for page_num in itertools.count(1):
-            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-            new_ids = orderedSet(matches)
-            ids.extend(new_ids)
  
  
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    _FEED_NAME = 'subscriptions'
+    _PLAYLIST_TITLE = 'Youtube Subscriptions'
  
  
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), title,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
-            content_html = more['content_html']
-            more_widget_html = more['load_more_widget_html']
  
  
-        return {
-            '_type': 'playlist',
-            'title': title,
-            'entries': self._ids_to_results(ids),
-        }
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
+    _FEED_NAME = 'history'
+    _PLAYLIST_TITLE = 'Youtube History'
  
  
  class YoutubeTruncatedURLIE(InfoExtractor):
  
  
  class YoutubeTruncatedURLIE(InfoExtractor):