X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=5d1297e0d27260dd1e0f389d5add17061fad0644;hb=6800d3372f35e08dcc4d34d06601815bf0cb0a3d;hp=b2ae0841832c8e002b95fd5e929d2c7d3ebf110e;hpb=cc38fa6cfbdab2ca77ecb1155d64574ab0004bb5;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b2ae08418..5d1297e0d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -49,6 +49,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # YouTube sets the expire time to about two months expire_time=time.time() + 2 * 30 * 24 * 3600) + def _ids_to_results(self, ids): + return [ + self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] + def _login(self): """ Attempt to log in to YouTube. @@ -797,6 +802,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # TODO implement WebVTT downloading pass elif mime_type.startswith('audio/') or mime_type.startswith('video/'): + segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList') format_id = r.attrib['id'] video_url = url_el.text filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) @@ -810,6 +816,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'filesize': filesize, 'fps': int_or_none(r.attrib.get('frameRate')), } + if segment_list: + f.update({ + 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], + 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')] + }) try: existing_format = next( fo for fo in formats @@ -1121,12 +1132,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_warning( 'Skipping DASH manifest: %r' % e, video_id) else: - # Hide the formats we found through non-DASH + # Remove the formats we found through non-DASH, they + # contain less info and it can be wrong, because we use + # fixed values (for example the 
resolution). See + # https://github.com/rg3/youtube-dl/issues/5774 for an + # example. dash_keys = set(df['format_id'] for df in dash_formats) - for f in formats: - if f['format_id'] in dash_keys: - f['format_id'] = 'nondash-%s' % f['format_id'] - f['preference'] = f.get('preference', 0) - 10000 + formats = [f for f in formats if f['format_id'] not in dash_keys] formats.extend(dash_formats) # Check for malformed aspect ratio @@ -1261,11 +1273,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id @@ -1291,12 +1298,22 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): page = self._download_webpage(url, playlist_id) more_widget_html = content_html = page - # Check if the playlist exists or is private - if re.search(r'
<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>
', page) is not None: - raise ExtractorError( - 'The playlist doesn\'t exist or is private, use --username or ' - '--netrc to access it.', - expected=True) + for match in re.findall(r'
<div class="yt-alert-message">([^<]+)</div>
', page): + match = match.strip() + # Check if the playlist exists or is private + if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match): + raise ExtractorError( + 'The playlist doesn\'t exist or is private, use --username or ' + '--netrc to access it.', + expected=True) + elif re.match(r'[^<]*Invalid parameters[^<]*', match): + raise ExtractorError( + 'Invalid parameters. Maybe URL is incorrect.', + expected=True) + elif re.match(r'[^<]*Choose your language[^<]*', match): + continue + else: + self.report_warning('Youtube gives an alert message: ' + match) # Extract the video ids from the playlist pages ids = [] @@ -1388,6 +1405,22 @@ class YoutubeChannelIE(InfoExtractor): channel_id = self._match_id(url) url = self._TEMPLATE_URL % channel_id + + # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) + # Workaround by extracting as a playlist if managed to obtain channel playlist URL + # otherwise fallback on channel by page extraction + channel_page = self._download_webpage( + url + '?view=57', channel_id, + 'Downloading channel page', fatal=False) + channel_playlist_id = self._search_regex( + [r'', + r'data-channel-external-id="([^"]+)"'], + channel_page, 'channel id', default=None) + if channel_playlist_id and channel_playlist_id.startswith('UC'): + playlist_id = 'UU' + channel_playlist_id[2:] + return self.url_result( + compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') + channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') autogenerated = re.search(r'''(?x) class="[^"]*?(?: @@ -1458,54 +1491,56 @@ class YoutubeUserIE(YoutubeChannelIE): return super(YoutubeUserIE, cls).suitable(url) -class YoutubeSearchIE(SearchInfoExtractor): +class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): IE_DESC = 'YouTube.com searches' - _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' - 
_MAX_RESULTS = 1000 + # there doesn't appear to be a real limit, for example if you search for + # 'python' you get more than 8.000.000 results + _MAX_RESULTS = float('inf') IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' + _EXTRA_QUERY_ARGS = {} + _TESTS = [] def _get_n_results(self, query, n): """Get a specified number of results for a query""" - video_ids = [] - pagenum = 0 + videos = [] limit = n - PAGE_SIZE = 50 - while (PAGE_SIZE * pagenum) < limit: - result_url = self._API_URL % ( - compat_urllib_parse.quote_plus(query.encode('utf-8')), - max((PAGE_SIZE * pagenum) + 1), 2) - data_json = self._download_webpage( + for pagenum in itertools.count(1): + url_query = { + 'search_query': query, + 'page': pagenum, + 'spf': 'navigate', + } + url_query.update(self._EXTRA_QUERY_ARGS) + result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query) + data = self._download_json( result_url, video_id='query "%s"' % query, - note='Downloading page %s' % (pagenum + 1), + note='Downloading page %s' % pagenum, errnote='Unable to download API page') - data = json.loads(data_json) - api_response = data['data'] + html_content = data[1]['body']['content'] - if 'items' not in api_response: + if 'class="search-message' in html_content: raise ExtractorError( '[youtube] No video results', expected=True) - new_ids = list(video['id'] for video in api_response['items']) - video_ids += new_ids - - limit = min(n, api_response['totalItems']) - pagenum += 1 + new_videos = self._ids_to_results(orderedSet(re.findall( + r'href="/watch\?v=(.{11})', html_content))) + videos += new_videos + if not new_videos or len(videos) > limit: + break - if len(video_ids) > n: - video_ids = video_ids[:n] - videos = [self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in video_ids] + if len(videos) > n: + videos = videos[:n] return self.playlist_result(videos, query) class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' - 
_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' _SEARCH_KEY = 'ytsearchdate' IE_DESC = 'YouTube.com searches, newest videos first' + _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} class YoutubeSearchURLIE(InfoExtractor): @@ -1589,20 +1624,10 @@ class YoutubeShowIE(InfoExtractor): class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ - Base class for extractors that fetch info from - http://www.youtube.com/feed_ajax + Base class for feed extractors Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True - # use action_load_personal_feed instead of action_load_system_feed - _PERSONAL_FEED = False - - @property - def _FEED_TEMPLATE(self): - action = 'action_load_system_feed' - if self._PERSONAL_FEED: - action = 'action_load_personal_feed' - return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME) @property def IE_NAME(self): @@ -1612,36 +1637,38 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): self._login() def _real_extract(self, url): - feed_entries = [] - paging = 0 - for i in itertools.count(1): - info = self._download_json( - self._FEED_TEMPLATE % paging, - '%s feed' % self._FEED_NAME, - 'Downloading page %s' % i, - transform_source=uppercase_escape) - feed_html = info.get('feed_html') or info.get('content_html') - load_more_widget_html = info.get('load_more_widget_html') or feed_html - m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) - ids = orderedSet(m.group(1) for m in m_ids) - feed_entries.extend( - self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in ids) - mobj = re.search( - r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)', - load_more_widget_html) - if mobj is None: + page = self._download_webpage( + 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) + + # The extraction process is the same as for playlists, but 
the regex + # for the video ids doesn't contain an index + ids = [] + more_widget_html = content_html = page + for page_num in itertools.count(1): + matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + + # 'recommended' feed has infinite 'load more' and each new portion spins + # the same videos in (sometimes) slightly different order, so we'll check + # for unicity and break when portion has no new videos + new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches)) + if not new_ids: break - paging = mobj.group('paging') - return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) + ids.extend(new_ids) -class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_NAME = 'youtube:recommended' - IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' - _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) + if not mobj: + break + + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) + content_html = more['content_html'] + more_widget_html = more['load_more_widget_html'] + + return self.playlist_result( + self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE) class YoutubeWatchLaterIE(YoutubePlaylistIE): @@ -1655,15 +1682,6 @@ class YoutubeWatchLaterIE(YoutubePlaylistIE): return self._extract_playlist('WL') -class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): - IE_NAME = 'youtube:history' - IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' - _FEED_NAME = 'history' - _PERSONAL_FEED = True - _PLAYLIST_TITLE = 'Youtube Watch History' - - class 
YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:favorites' IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' @@ -1676,42 +1694,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): return self.url_result(playlist_id, 'YoutubePlaylist') -class YoutubeSubscriptionsIE(YoutubePlaylistIE): - IE_NAME = 'youtube:subscriptions' - IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' - _TESTS = [] - - def _real_extract(self, url): - title = 'Youtube Subscriptions' - page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title) - - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page +class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _FEED_NAME = 'recommended' + _PLAYLIST_TITLE = 'Youtube Recommended videos' - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - new_ids = orderedSet(matches) - ids.extend(new_ids) - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break +class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' 
+ _FEED_NAME = 'subscriptions' + _PLAYLIST_TITLE = 'Youtube Subscriptions' - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), title, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - return { - '_type': 'playlist', - 'title': title, - 'entries': self._ids_to_results(ids), - } +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' + _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' + _FEED_NAME = 'history' + _PLAYLIST_TITLE = 'Youtube History' class YoutubeTruncatedURLIE(InfoExtractor):