X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=0869c9fd468286e9c4d7125a76b41f2a51b7d29a;hb=39b62db1160f5a4770348f1d01daeb0ce049c28c;hp=1469b932fe1686a1dfa5a1a61afbb5896cdb317d;hpb=60bf45c80d377a38b00b9ec1426c4cc1d9003742;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1469b932f..0869c9fd4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -28,7 +28,6 @@ from ..utils import ( get_element_by_attribute, get_element_by_id, int_or_none, - OnDemandPagedList, orderedSet, unescapeHTML, unified_strdate, @@ -1292,12 +1291,22 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): page = self._download_webpage(url, playlist_id) more_widget_html = content_html = page - # Check if the playlist exists or is private - if re.search(r'
[^<]*?(The|This) playlist (does not exist|is private)[^<]*?
', page) is not None: - raise ExtractorError( - 'The playlist doesn\'t exist or is private, use --username or ' - '--netrc to access it.', - expected=True) + for match in re.findall(r'
([^<]+)
', page): + match = match.strip() + # Check if the playlist exists or is private + if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match): + raise ExtractorError( + 'The playlist doesn\'t exist or is private, use --username or ' + '--netrc to access it.', + expected=True) + elif re.match(r'[^<]*Invalid parameters[^<]*', match): + raise ExtractorError( + 'Invalid parameters. Maybe URL is incorrect.', + expected=True) + elif re.match(r'[^<]*Choose your language[^<]*', match): + continue + else: + self.report_warning('Youtube gives an alert message: ' + match) # Extract the video ids from the playlist pages ids = [] @@ -1369,7 +1378,8 @@ class YoutubeChannelIE(InfoExtractor): } }] - def extract_videos_from_page(self, page): + @staticmethod + def extract_videos_from_page(page): ids_in_page = [] titles_in_page = [] for mobj in re.finditer(r'(?:title="(?P[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page): @@ -1458,54 +1468,56 @@ class YoutubeUserIE(YoutubeChannelIE): return super(YoutubeUserIE, cls).suitable(url) -class YoutubeSearchIE(SearchInfoExtractor): +class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): IE_DESC = 'YouTube.com searches' - _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' - _MAX_RESULTS = 1000 + # there doesn't appear to be a real limit, for example if you search for + # 'python' you get more than 8.000.000 results + _MAX_RESULTS = float('inf') IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' + _EXTRA_QUERY_ARGS = {} + _TESTS = [] def _get_n_results(self, query, n): """Get a specified number of results for a query""" - video_ids = [] - pagenum = 0 + videos = [] limit = n - PAGE_SIZE = 50 - while (PAGE_SIZE * pagenum) < limit: - result_url = self._API_URL % ( - compat_urllib_parse.quote_plus(query.encode('utf-8')), - max((PAGE_SIZE * pagenum) + 1), 2) - data_json = self._download_webpage( + for pagenum in itertools.count(1): + url_query = { + 'search_query': query, + 'page': pagenum, + 'spf': 'navigate', + } + url_query.update(self._EXTRA_QUERY_ARGS) + result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query) + data = self._download_json( result_url, video_id='query "%s"' % query, - note='Downloading page %s' % (pagenum + 1), + note='Downloading page %s' % pagenum, errnote='Unable to download API page') - data = json.loads(data_json) - api_response = data['data'] + html_content = data[1]['body']['content'] - if 'items' not in api_response: + if 'class="search-message' in html_content: raise ExtractorError( '[youtube] No video results', expected=True) - new_ids = list(video['id'] for video in api_response['items']) - video_ids += new_ids - - limit = min(n, api_response['totalItems']) - pagenum += 1 + new_videos = self._ids_to_results(orderedSet(re.findall( + r'href="/watch\?v=(.{11})', html_content))) + videos += new_videos + if not new_videos or len(videos) > limit: + break - if len(video_ids) > n: - video_ids = video_ids[:n] - videos = [self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in video_ids] + if len(videos) > n: + videos = videos[:n] return self.playlist_result(videos, query) class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' - _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' _SEARCH_KEY = 'ytsearchdate' IE_DESC = 'YouTube.com searches, newest videos first' + _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} class YoutubeSearchURLIE(InfoExtractor):