[youtube:ytsearch] Use the same system as the search webpage (fixes #5483)
[youtube-dl] / youtube_dl / extractor / youtube.py
index 4ec39c589c0c89c6dda6ba067fdc1c49616f062a..f1a5c00775209a9b41d3cb72d8dc2ec1e19d7cb3 100644 (file)
@@ -28,7 +28,6 @@ from ..utils import (
     get_element_by_attribute,
     get_element_by_id,
     int_or_none,
-    OnDemandPagedList,
     orderedSet,
     unescapeHTML,
     unified_strdate,
@@ -1358,6 +1357,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
 class YoutubeChannelIE(InfoExtractor):
     IE_DESC = 'YouTube.com channels'
     _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
+    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
     IE_NAME = 'youtube:channel'
     _TESTS = [{
         'note': 'paginated channel',
@@ -1368,7 +1368,8 @@ class YoutubeChannelIE(InfoExtractor):
         }
     }]
 
-    def extract_videos_from_page(self, page):
+    @staticmethod
+    def extract_videos_from_page(page):
         ids_in_page = []
         titles_in_page = []
         for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
@@ -1386,8 +1387,8 @@ class YoutubeChannelIE(InfoExtractor):
     def _real_extract(self, url):
         channel_id = self._match_id(url)
 
-        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
-        channel_page = self._download_webpage(url, channel_id)
+        url = self._TEMPLATE_URL % channel_id
+        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
         autogenerated = re.search(r'''(?x)
                 class="[^"]*?(?:
                     channel-header-autogenerated-label|
@@ -1429,12 +1430,10 @@ class YoutubeChannelIE(InfoExtractor):
         return self.playlist_result(_entries(), channel_id)
 
 
-class YoutubeUserIE(InfoExtractor):
+class YoutubeUserIE(YoutubeChannelIE):
     IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
-    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
-    _GDATA_PAGE_SIZE = 50
-    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
+    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
     IE_NAME = 'youtube:user'
 
     _TESTS = [{
@@ -1458,95 +1457,56 @@ class YoutubeUserIE(InfoExtractor):
         else:
             return super(YoutubeUserIE, cls).suitable(url)
 
-    def _real_extract(self, url):
-        username = self._match_id(url)
-
-        # Download video ids using YouTube Data API. Result size per
-        # query is limited (currently to 50 videos) so we need to query
-        # page by page until there are no video ids - it means we got
-        # all of them.
-
-        def download_page(pagenum):
-            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
-
-            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
-            page = self._download_webpage(
-                gdata_url, username,
-                'Downloading video ids from %d to %d' % (
-                    start_index, start_index + self._GDATA_PAGE_SIZE))
-
-            try:
-                response = json.loads(page)
-            except ValueError as err:
-                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
-            if 'entry' not in response['feed']:
-                return
-
-            # Extract video identifiers
-            entries = response['feed']['entry']
-            for entry in entries:
-                title = entry['title']['$t']
-                video_id = entry['id']['$t'].split('/')[-1]
-                yield {
-                    '_type': 'url',
-                    'url': video_id,
-                    'ie_key': 'Youtube',
-                    'id': video_id,
-                    'title': title,
-                }
-        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)
-
-        return self.playlist_result(url_results, playlist_title=username)
-
 
-class YoutubeSearchIE(SearchInfoExtractor):
+class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
     IE_DESC = 'YouTube.com searches'
-    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
-    _MAX_RESULTS = 1000
+    # there doesn't appear to be a real limit, for example if you search for
+    # 'python' you get more than 8.000.000 results
+    _MAX_RESULTS = float('inf')
     IE_NAME = 'youtube:search'
     _SEARCH_KEY = 'ytsearch'
+    _EXTRA_QUERY_ARGS = {}
 
     def _get_n_results(self, query, n):
         """Get a specified number of results for a query"""
 
-        video_ids = []
-        pagenum = 0
+        videos = []
         limit = n
-        PAGE_SIZE = 50
 
-        while (PAGE_SIZE * pagenum) < limit:
-            result_url = self._API_URL % (
-                compat_urllib_parse.quote_plus(query.encode('utf-8')),
-                max((PAGE_SIZE * pagenum) + 1), 2)
-            data_json = self._download_webpage(
+        for pagenum in itertools.count(1):
+            url_query = {
+                'search_query': query,
+                'page': pagenum,
+                'spf': 'navigate',
+            }
+            url_query.update(self._EXTRA_QUERY_ARGS)
+            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
+            data = self._download_json(
                 result_url, video_id='query "%s"' % query,
-                note='Downloading page %s' % (pagenum + 1),
+                note='Downloading page %s' % pagenum,
                 errnote='Unable to download API page')
-            data = json.loads(data_json)
-            api_response = data['data']
+            html_content = data[1]['body']['content']
 
-            if 'items' not in api_response:
+            if 'class="search-message' in html_content:
                 raise ExtractorError(
                     '[youtube] No video results', expected=True)
 
-            new_ids = list(video['id'] for video in api_response['items'])
-            video_ids += new_ids
-
-            limit = min(n, api_response['totalItems'])
-            pagenum += 1
+            new_videos = self._ids_to_results(orderedSet(re.findall(
+                r'href="/watch\?v=(.{11})', html_content)))
+            videos += new_videos
+            if not new_videos or len(videos) > limit:
+                break
 
-        if len(video_ids) > n:
-            video_ids = video_ids[:n]
-        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
-                  for video_id in video_ids]
+        if len(videos) > n:
+            videos = videos[:n]
         return self.playlist_result(videos, query)
 
 
 class YoutubeSearchDateIE(YoutubeSearchIE):
     IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
-    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
     _SEARCH_KEY = 'ytsearchdate'
     IE_DESC = 'YouTube.com searches, newest videos first'
+    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
 
 
 class YoutubeSearchURLIE(InfoExtractor):