X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=3da83e3a840c0f50c18a8e021e5a13e6d85cbd84;hb=fccae2b911970d0ffa97800b27e70b1937cd3058;hp=6216028cf127852d18f4aedd9ac466342b302063;hpb=4bb4a18876f5489db77365528638da8d46890a38;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6216028cf..3da83e3a8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -418,6 +418,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'upload_date': '20140605', }, }, + # Age-gate video with encrypted signature + { + 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU', + 'info_dict': { + 'id': '6kLq3WMV1nU', + 'ext': 'mp4', + 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', + 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', + 'uploader': 'LloydVEVO', + 'uploader_id': 'LloydVEVO', + 'upload_date': '20110629', + }, + }, # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) { 'url': '__2ABJjxzNo', @@ -433,7 +446,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'expected_warnings': [ 'DASH manifest missing', ] - } + }, + # Olympics (https://github.com/rg3/youtube-dl/issues/4431) + { + 'url': 'lqQg6PlCWgI', + 'info_dict': { + 'id': 'lqQg6PlCWgI', + 'ext': 'mp4', + 'upload_date': '20120731', + 'uploader_id': 'olympic', + 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', + 'uploader': 'Olympics', + 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', + }, + 'params': { + 'skip_download': 'requires avconv', + } + }, ] def __init__(self, *args, **kwargs): @@ -462,7 +491,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P[a-z]+)$', + r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -511,8 +540,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return 's[%s%s%s]' % (starts, ends, steps) step = None - start = '(Never used)' # Quelch pyflakes warnings - start will be - # set as soon as step is set + # Quelch pyflakes warnings - start will be set when step is set + start = '(Never used)' for i, prev in zip(idxs[1:], idxs[:-1]): if step is not None: if i - prev == step: @@ -750,11 +779,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube + url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') data = compat_urllib_parse.urlencode({ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''), + r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data video_info_webpage = self._download_webpage( @@ -857,7 +888,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): m_cat_container = self._search_regex( r'(?s)]*>\s*Category\s*\s*]*>(.*?)', - video_webpage, 'categories', fatal=False) + video_webpage, 'categories', default=None) if m_cat_container: category = self._html_search_regex( r'(?s)(.*?)', m_cat_container, 'category', @@ -935,7 +966,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'url': video_info['conn'][0], 'player_url': player_url, }] - elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1: + elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1: encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) @@ -952,11 +983,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): elif 's' in url_data: encrypted_sig = url_data['s'][0] - if not age_gate: - jsplayer_url_json = self._search_regex( - r'"assets":.+?"js":\s*("[^"]+")', - video_webpage, 'JS player URL') - player_url = json.loads(jsplayer_url_json) + jsplayer_url_json = self._search_regex( + r'"assets":.+?"js":\s*("[^"]+")', + embed_webpage if age_gate else video_webpage, 'JS player URL') + player_url = json.loads(jsplayer_url_json) if player_url is None: player_url_json = self._search_regex( r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', @@ -1001,9 +1031,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): dash_mpd = video_info.get('dashmpd') - if not dash_mpd: - self.report_warning('%s: DASH manifest missing' % video_id) - else: + if dash_mpd: dash_manifest_url = dash_mpd[0] try: dash_formats = self._parse_dash_manifest( @@ -1058,7 +1086,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _MORE_PAGES_INDICATOR = r'data-link-type="next"' _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)' IE_NAME = 'youtube:playlist' _TESTS = [{ @@ -1115,6 +1142,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'info_dict': { 'title': 'JODA7', } + }, { + 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', + 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', + 'info_dict': { + 'title': 'Uploads from Interstellar Movie', + }, + 'playlist_mincout': 21, }] def _real_initialize(self): @@ -1199,6 +1233,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'Downloading page #%s' % page_num, transform_source=uppercase_escape) content_html = more['content_html'] + if not content_html.strip(): + # Some webpages show a "Load more" button but they don't + # have more videos + break more_widget_html = more['load_more_widget_html'] playlist_title = self._html_search_regex( @@ -1255,8 +1293,6 @@ class YoutubeTopListIE(YoutubePlaylistIE): class YoutubeChannelIE(InfoExtractor): IE_DESC = 'YouTube.com channels' _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P[0-9A-Za-z_-]+)' - _MORE_PAGES_INDICATOR = 'yt-uix-load-more' - _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = 'youtube:channel' _TESTS = [{ 'note': 'paginated channel', @@ -1293,20 +1329,27 @@ class YoutubeChannelIE(InfoExtractor): return self.playlist_result(entries, channel_id) def _entries(): + more_widget_html = content_html = channel_page for pagenum in itertools.count(1): - url = self._MORE_PAGES_URL % (pagenum, channel_id) - page = self._download_json( - url, channel_id, note='Downloading page #%s' % pagenum, - transform_source=uppercase_escape) - ids_in_page = self.extract_videos_from_page(page['content_html']) + ids_in_page = self.extract_videos_from_page(content_html) for video_id in ids_in_page: yield self.url_result( video_id, 'Youtube', video_id=video_id) - if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: + mobj = re.search( + r'data-uix-load-more-href="/?(?P[^"]+)"', + more_widget_html) + if not mobj: break + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), channel_id, + 'Downloading page #%s' % (pagenum + 1), + transform_source=uppercase_escape) + content_html = more['content_html'] + more_widget_html = more['load_more_widget_html'] + return self.playlist_result(_entries(), channel_id) @@ -1537,9 +1580,11 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): feed_entries = [] paging = 0 for i in itertools.count(1): - info = self._download_json(self._FEED_TEMPLATE % paging, - '%s feed' % self._FEED_NAME, - 'Downloading page %s' % i) + info = self._download_json( + self._FEED_TEMPLATE % paging, + '%s feed' % self._FEED_NAME, + 'Downloading page %s' % i, + transform_source=uppercase_escape) feed_html = info.get('feed_html') or info.get('content_html') load_more_widget_html = info.get('load_more_widget_html') or feed_html m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)