X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=52909b0daa945170e2d3ebec6892b9f906f3087e;hb=8f02ad4f12549865a2a4436328075f4b20b906ef;hp=35ef4c30359cb70fa58b69eff16d75406190c5ed;hpb=024c53694d674f046fdad593e03bfaeec17cc559;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 35ef4c303..52909b0da 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -11,7 +11,6 @@ import time import traceback from .common import InfoExtractor, SearchInfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( @@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return -class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): +class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ ( @@ -496,7 +495,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader': '孫艾倫', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, - } + }, + # url_encoded_fmt_stream_map is empty string + { + 'url': 'qEJwOuvDf7I', + 'info_dict': { + 'id': 'qEJwOuvDf7I', + 'ext': 'mp4', + 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', + 'description': '', + 'upload_date': '20150404', + 'uploader_id': 'spbelect', + 'uploader': 'Наблюдатели Петербурга', + }, + 'params': { + 'skip_download': 'requires avconv', + } + }, ] def __init__(self, *args, **kwargs): @@ -541,26 +556,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) + download_note = ( + 'Downloading player %s' % player_url + if self._downloader.params.get('verbose') else + 'Downloading %s player %s' % (player_type, player_id) + ) if player_type == 'js': code = self._download_webpage( player_url, video_id, - note='Downloading %s player %s' % (player_type, player_id), + note=download_note, errnote='Download of %s failed' % player_url) res = self._parse_sig_js(code) elif player_type == 'swf': urlh = self._request_webpage( player_url, video_id, - note='Downloading %s player %s' % (player_type, player_id), + note=download_note, errnote='Download of %s failed' % player_url) code = urlh.read() res = self._parse_sig_swf(code) else: assert False, 'Invalid player type %r' % player_type - if cache_spec is None: - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] + test_string = ''.join(map(compat_chr, range(len(example_sig)))) + cache_res = res(test_string) + cache_spec = [ord(c) for c in cache_res] self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) return res @@ -644,7 +663,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): try: subs_doc = self._download_xml( 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -658,23 +677,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): lang = track.attrib['lang_code'] if lang in sub_lang_list: continue - params = compat_urllib_parse.urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), - 'name': track.attrib['name'].encode('utf-8'), - }) - url = 'https://www.youtube.com/api/timedtext?' + params - sub_lang_list[lang] = url + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': ext, + 'name': track.attrib['name'].encode('utf-8'), + }) + sub_formats.append({ + 'url': 'https://www.youtube.com/api/timedtext?' + params, + 'ext': ext, + }) + sub_lang_list[lang] = sub_formats if not sub_lang_list: self._downloader.report_warning('video doesn\'t have subtitles') return {} return sub_lang_list - def _get_available_automatic_caption(self, video_id, webpage): + def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_format = self._downloader.params.get('subtitlesformat', 'srt') self.to_screen('%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) err_msg = 'Couldn\'t find automatic captions for %s' % video_id @@ -704,14 +727,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): sub_lang_list = {} for lang_node in caption_list.findall('target'): sub_lang = lang_node.attrib['lang_code'] - params = compat_urllib_parse.urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_lang_list[sub_lang] = caption_url + '&' + params + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': ext, + 'ts': timestamp, + 'kind': caption_kind, + }) + sub_formats.append({ + 'url': caption_url + '&' + params, + 'ext': ext, + }) + sub_lang_list[sub_lang] = sub_formats return sub_lang_list # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles @@ -759,33 +788,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): errnote='Could not download DASH manifest') formats = [] - for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): - url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') - if url_el is None: - continue - format_id = r.attrib['id'] - video_url = url_el.text - filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) - f = { - 'format_id': format_id, - 'url': video_url, - 'width': int_or_none(r.attrib.get('width')), - 'height': int_or_none(r.attrib.get('height')), - 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), - 'asr': int_or_none(r.attrib.get('audioSamplingRate')), - 'filesize': filesize, - 'fps': int_or_none(r.attrib.get('frameRate')), - } - try: - existing_format = next( - fo for fo in formats - if fo['format_id'] == format_id) - except StopIteration: - full_info = self._formats.get(format_id, {}).copy() - full_info.update(f) - formats.append(full_info) - else: - existing_format.update(f) + for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'): + mime_type = a.attrib.get('mimeType') + for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'): + url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') + if url_el is None: + continue + if mime_type == 'text/vtt': + # TODO implement WebVTT downloading + pass + elif mime_type.startswith('audio/') or mime_type.startswith('video/'): + format_id = r.attrib['id'] + video_url = url_el.text + filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) + f = { + 'format_id': format_id, + 'url': video_url, + 'width': int_or_none(r.attrib.get('width')), + 'height': int_or_none(r.attrib.get('height')), + 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), + 'asr': int_or_none(r.attrib.get('audioSamplingRate')), + 'filesize': filesize, + 'fps': int_or_none(r.attrib.get('frameRate')), + } + try: + existing_format = next( + fo for fo in formats + if fo['format_id'] == format_id) + except StopIteration: + full_info = self._formats.get(format_id, {}).copy() + full_info.update(f) + formats.append(full_info) + else: + existing_format.update(f) + else: + self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats def _real_extract(self, url): @@ -842,7 +879,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): args = ytplayer_config['args'] # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) - if 'url_encoded_fmt_stream_map' not in args: + if not args.get('url_encoded_fmt_stream_map'): raise ValueError('No stream_map present') # caught below except ValueError: # We fallback to the get_video_info pages (used by the embed page) @@ -966,10 +1003,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, video_webpage) - return + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) if 'length_seconds' not in video_info: self._downloader.report_warning('unable to extract video duration') @@ -1118,6 +1152,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'description': video_description, 'categories': video_categories, 'subtitles': video_subtitles, + 'automatic_captions': automatic_captions, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, @@ -1142,13 +1177,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): | p/ ) ( - (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) + ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)' @@ -1233,7 +1268,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): for vid_id in ids] def _extract_mix(self, playlist_id): - # The mixes are generated from a a single video + # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage( @@ -1252,27 +1287,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self.playlist_result(url_results, playlist_id, title) - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - if 'v' in query_dict: - video_id = query_dict['v'][0] - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - - if playlist_id.startswith('RD'): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - + def _extract_playlist(self, playlist_id): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) more_widget_html = content_html = page @@ -1316,6 +1331,29 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) + def _real_extract(self, url): + # Extract playlist id + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError('Invalid URL: %s' % url) + playlist_id = mobj.group(1) or mobj.group(2) + + # Check if it's a video-specific URL + query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if 'v' in query_dict: + video_id = query_dict['v'][0] + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(video_id, 'Youtube', video_id=video_id) + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + + if playlist_id.startswith('RD') or playlist_id.startswith('UL'): + # Mixes require a custom extraction process + return self._extract_mix(playlist_id) + + return self._extract_playlist(playlist_id) + class YoutubeChannelIE(InfoExtractor): IE_DESC = 'YouTube.com channels' @@ -1332,15 +1370,22 @@ class YoutubeChannelIE(InfoExtractor): def extract_videos_from_page(self, page): ids_in_page = [] - for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - return ids_in_page + titles_in_page = [] + for mobj in re.finditer(r'(?:title="(?P[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page): + video_id = mobj.group('id') + video_title = unescapeHTML(mobj.group('title')) + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + return zip(ids_in_page, titles_in_page) def _real_extract(self, url): channel_id = self._match_id(url) - video_ids = [] url = 'https://www.youtube.com/channel/%s/videos' % channel_id channel_page = self._download_webpage(url, channel_id) autogenerated = re.search(r'''(?x) @@ -1352,20 +1397,21 @@ class YoutubeChannelIE(InfoExtractor): if autogenerated: # The videos are contained in a single page # the ajax pages can't be used, they are empty - video_ids = self.extract_videos_from_page(channel_page) entries = [ - self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in video_ids] + self.url_result( + video_id, 'Youtube', video_id=video_id, + video_title=video_title) + for video_id, video_title in self.extract_videos_from_page(channel_page)] return self.playlist_result(entries, channel_id) def _entries(): more_widget_html = content_html = channel_page for pagenum in itertools.count(1): - ids_in_page = self.extract_videos_from_page(content_html) - for video_id in ids_in_page: + for video_id, video_title in self.extract_videos_from_page(content_html): yield self.url_result( - video_id, 'Youtube', video_id=video_id) + video_id, 'Youtube', video_id=video_id, + video_title=video_title) mobj = re.search( r'data-uix-load-more-href="/?(?P<more>[^"]+)"', @@ -1521,7 +1567,7 @@ class YoutubeSearchURLIE(InfoExtractor): webpage = self._download_webpage(url, query) result_code = self._search_regex( - r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML') + r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML') part_codes = re.findall( r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code) @@ -1632,21 +1678,26 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_NAME = 'youtube:recommended' IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' -class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): +class YoutubeWatchLaterIE(YoutubePlaylistIE): + IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' - _FEED_NAME = 'watch_later' - _PLAYLIST_TITLE = 'Youtube Watch Later' - _PERSONAL_FEED = True + _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' + + _TESTS = [] # override PlaylistIE tests + + def _real_extract(self, url): + return self._extract_playlist('WL') class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_NAME = 'youtube:history' IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' _FEED_NAME = 'history'