X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=fcdbfe0bc959a011bebf8656184fe164b3eca84a;hb=1ebc05df91b769452da50c5fad0b413550d5e1de;hp=9096a29756ca6e1a66ecd442a92977fa1b999b31;hpb=25f14e9f93295a787e0cb436a5f6179d6174733d;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9096a2975..fcdbfe0bc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1126,12 +1126,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_warning( 'Skipping DASH manifest: %r' % e, video_id) else: - # Hide the formats we found through non-DASH + # Remove the formats we found through non-DASH, they + # contain less info and it can be wrong, because we use + # fixed values (for example the resolution). See + # https://github.com/rg3/youtube-dl/issues/5774 for an + # example. dash_keys = set(df['format_id'] for df in dash_formats) - for f in formats: - if f['format_id'] in dash_keys: - f['format_id'] = 'nondash-%s' % f['format_id'] - f['preference'] = f.get('preference', 0) - 10000 + formats = [f for f in formats if f['format_id'] not in dash_keys] formats.extend(dash_formats) # Check for malformed aspect ratio @@ -1398,6 +1399,26 @@ class YoutubeChannelIE(InfoExtractor): channel_id = self._match_id(url) url = self._TEMPLATE_URL % channel_id + + # Channel by page listing is restricted to 35 pages of 30 items, i.e. 
1050 videos total (see #5778) + # Workaround by extracting as a playlist if managed to obtain channel playlist URL + # otherwise fallback on channel by page extraction + channel_page = self._download_webpage( + url + '?view=57', channel_id, + 'Downloading channel page', fatal=False) + channel_playlist_id = self._search_regex( + [r'<meta itemprop="channelId" content="([^"]+)">', + r'data-channel-external-id="([^"]+)"'], + channel_page, 'channel id', default=None) + if channel_playlist_id and channel_playlist_id.startswith('UC'): + playlist_id = 'UU' + channel_playlist_id[2:] + channel_playlist = unescapeHTML(self._search_regex( + r'href="/?(watch\?v=[0-9A-Za-z_-]{11}&list=%s)"' % playlist_id, + channel_page, 'channel playlist URL', default=None)) + if channel_playlist: + return self.url_result( + compat_urlparse.urljoin(url, '/%s' % channel_playlist), 'YoutubePlaylist') + channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') autogenerated = re.search(r'''(?x) class="[^"]*?(?: @@ -1621,10 +1642,16 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): # for the video ids doesn't contain an index ids = [] more_widget_html = content_html = page - for page_num in itertools.count(1): matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - new_ids = orderedSet(matches) + + # 'recommended' feed has infinite 'load more' and each new portion spins + # the same videos in (sometimes) slightly different order, so we'll check + # for unicity and break when portion has no new videos + new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches)) + if not new_ids: + break + ids.extend(new_ids) mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)