X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=5488101e1997d1078c44a73ef0af5487aa3dcea9;hb=b7a2268e7b52fbedd1630ad101460d76cca9dcdd;hp=3d3d43491c293c79334ec2c1a8a42fec93063796;hpb=785521bf4fbd99b2916bdab5d847d84424196c1d;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3d3d43491..5488101e1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -11,7 +11,6 @@ import time import traceback from .common import InfoExtractor, SearchInfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( @@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return -class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): +class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ ( @@ -648,7 +647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): try: subs_doc = self._download_xml( 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -662,23 +661,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): lang = track.attrib['lang_code'] if lang in sub_lang_list: continue - params = compat_urllib_parse.urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), - 'name': track.attrib['name'].encode('utf-8'), - }) - url = 'https://www.youtube.com/api/timedtext?' + params - sub_lang_list[lang] = url + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': ext, + 'name': track.attrib['name'].encode('utf-8'), + }) + sub_formats.append({ + 'url': 'https://www.youtube.com/api/timedtext?' + params, + 'ext': ext, + }) + sub_lang_list[lang] = sub_formats if not sub_lang_list: self._downloader.report_warning('video doesn\'t have subtitles') return {} return sub_lang_list - def _get_available_automatic_caption(self, video_id, webpage): + def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_format = self._downloader.params.get('subtitlesformat', 'srt') self.to_screen('%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) err_msg = 'Couldn\'t find automatic captions for %s' % video_id @@ -708,14 +711,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): sub_lang_list = {} for lang_node in caption_list.findall('target'): sub_lang = lang_node.attrib['lang_code'] - params = compat_urllib_parse.urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_lang_list[sub_lang] = caption_url + '&' + params + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': ext, + 'ts': timestamp, + 'kind': caption_kind, + }) + sub_formats.append({ + 'url': caption_url + '&' + params, + 'ext': ext, + }) + sub_lang_list[sub_lang] = sub_formats return sub_lang_list # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles @@ -970,10 +979,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, video_webpage) - return + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) if 'length_seconds' not in video_info: self._downloader.report_warning('unable to extract video duration') @@ -1122,6 +1128,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'description': video_description, 'categories': video_categories, 'subtitles': video_subtitles, + 'automatic_captions': automatic_captions, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, @@ -1146,13 +1153,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): | p/ ) ( - (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) + ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)' @@ -1237,7 +1244,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): for vid_id in ids] def _extract_mix(self, playlist_id): - # The mixes are generated from a a single video + # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage( @@ -1256,27 +1263,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self.playlist_result(url_results, playlist_id, title) - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - if 'v' in query_dict: - video_id = query_dict['v'][0] - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - - if playlist_id.startswith('RD'): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - + def _extract_playlist(self, playlist_id): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) more_widget_html = content_html = page @@ -1320,6 +1307,29 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) + def _real_extract(self, url): + # Extract playlist id + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError('Invalid URL: %s' % url) + playlist_id = mobj.group(1) or mobj.group(2) + + # Check if it's a video-specific URL + query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if 'v' in query_dict: + video_id = query_dict['v'][0] + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(video_id, 'Youtube', video_id=video_id) + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + + if playlist_id.startswith('RD') or playlist_id.startswith('UL'): + # Mixes require a custom extraction process + return self._extract_mix(playlist_id) + + return self._extract_playlist(playlist_id) + class YoutubeChannelIE(InfoExtractor): IE_DESC = 'YouTube.com channels' @@ -1525,7 +1535,7 @@ class YoutubeSearchURLIE(InfoExtractor): webpage = self._download_webpage(url, query) result_code = self._search_regex( - r'(?s)
    ', webpage, 'result HTML') + r'(?s)]+class="item-section"(.*?)
', webpage, 'result HTML') part_codes = re.findall( r'(?s)

(.*?)

', result_code) @@ -1636,21 +1646,26 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_NAME = 'youtube:recommended' IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' -class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): +class YoutubeWatchLaterIE(YoutubePlaylistIE): + IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' - _FEED_NAME = 'watch_later' - _PLAYLIST_TITLE = 'Youtube Watch Later' - _PERSONAL_FEED = True + _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' + + _TESTS = [] # override PlaylistIE tests + + def _real_extract(self, url): + return self._extract_playlist('WL') class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_NAME = 'youtube:history' IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' _FEED_NAME = 'history'