return
-class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
- # Extract the video ids from the playlist pages
+class YoutubeEntryListBaseInfoExtractor(InfoExtractor):
+ # Extract entries from page with "Load more" button
def _entries(self, page, playlist_id):
more_widget_html = content_html = page
for page_num in itertools.count(1):
- for video_id, video_title in self.extract_videos_from_page(content_html):
- yield self.url_result(
- video_id, 'Youtube', video_id=video_id,
- video_title=video_title)
+ for entry in self._process_page(content_html):
+ yield entry
mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
if not mobj:
break
more_widget_html = more['load_more_widget_html']
+
+class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+    def _process_page(self, content):
+        """Yield a 'Youtube' url_result for every video found in *content*.
+
+        *content* is one chunk of playlist HTML; the (id, title) pairs are
+        taken from extract_videos_from_page().
+        """
+        videos = self.extract_videos_from_page(content)
+        for video_id, video_title in videos:
+            yield self.url_result(
+                video_id, 'Youtube',
+                video_id=video_id, video_title=video_title)
+
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
return zip(ids_in_page, titles_in_page)
-class YoutubePlaylistsBaseInfoExtractor(InfoExtractor):
+class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+    def _process_page(self, content):
+        """Yield one 'YoutubePlaylist' url_result per distinct playlist in *content*.
+
+        *content* is one chunk of HTML from a playlists listing. Playlist ids
+        are taken from every href pointing at /playlist?list=...; each id is
+        yielded only once per chunk, in order of first appearance.
+        """
+        seen = set()
+        for playlist_id in re.findall(r'href="/?playlist\?list=(.+?)"', content):
+            # The same playlist can be linked several times in one chunk
+            # (NOTE(review): presumably thumbnail + title anchors - confirm
+            # against live markup); without this check each extra link would
+            # become a duplicate playlist entry.
+            if playlist_id in seen:
+                continue
+            seen.add(playlist_id)
+            yield self.url_result(
+                'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
+
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
- entries = [
- self.url_result(compat_urlparse.urljoin(url, playlist), 'YoutubePlaylist')
- for playlist in re.findall(r'href="(/playlist\?list=.+?)"', webpage)]
title = self._og_search_title(webpage, fatal=False)
- return self.playlist_result(entries, playlist_id, title)
+ return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
class YoutubeIE(YoutubeBaseInfoExtractor):
{
'url': 'http://vid.plus/FlRa-iH7PGw',
'only_matching': True,
- }
+ },
+ {
+ # Title with JS-like syntax "};"
+ 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
+ 'info_dict': {
+ 'id': 'lsguqyKfVQg',
+ 'ext': 'mp4',
+ 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
+ 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
+ 'upload_date': '20151119',
+ 'uploader_id': 'IronSoulElf',
+ 'uploader': 'IronSoulElf',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
]
def __init__(self, *args, **kwargs):
return {}
return sub_lang_list
+    def _get_ytplayer_config(self, video_id, webpage):
+        """Return the ytplayer.config object embedded in *webpage*, or None.
+
+        None is returned when neither pattern matches. The JSON is parsed
+        with fatal=False (presumably yielding None on malformed JSON as
+        well - confirm against _parse_json).
+        """
+        raw_config = self._search_regex(
+            (
+                # User-supplied text (e.g. a title containing "};") can end
+                # the lazy match of the generic pattern too early, so the
+                # stricter pattern anchored on the following ";ytplayer" is
+                # listed first and the generic one kept as a fallback.
+                r';ytplayer\.config\s*=\s*({.+?});ytplayer',
+                r';ytplayer\.config\s*=\s*({.+?});',
+            ),
+            webpage, 'ytplayer.config', default=None)
+        if not raw_config:
+            return None
+        return self._parse_json(
+            uppercase_escape(raw_config), video_id, fatal=False)
+
def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
self.to_screen('%s: Looking for automatic captions' % video_id)
- mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
+ player_config = self._get_ytplayer_config(video_id, webpage)
err_msg = 'Couldn\'t find automatic captions for %s' % video_id
- if mobj is None:
+ if not player_config:
self._downloader.report_warning(err_msg)
return {}
- player_config = json.loads(mobj.group(1))
try:
args = player_config['args']
caption_url = args['ttsurl']
age_gate = False
video_info = None
# Try looking directly into the video webpage
- mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
- if mobj:
- json_code = uppercase_escape(mobj.group(1))
- ytplayer_config = json.loads(json_code)
+ ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
+ if ytplayer_config:
args = ytplayer_config['args']
if args.get('url_encoded_fmt_stream_map'):
# Convert to the same format returned by compat_parse_qs
_VALID_URL = r'https?://(?:\w+\.)?youtube\.com/user/(?P<id>[^/]+)/playlists'
IE_NAME = 'youtube:user:playlists'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
'playlist_mincount': 4,
'info_dict': {
'id': 'ThirstForScience',
'title': 'Thirst for Science',
},
- }
+ }, {
+ # with "Load more" button
+ 'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
+ 'playlist_mincount': 70,
+ 'info_dict': {
+ 'id': 'igorkle1',
+ 'title': 'Игорь Клейнер',
+ },
+ }]
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
}
-class YoutubeShowIE(InfoExtractor):
+class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
IE_DESC = 'YouTube.com (multi-season) shows'
_VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
IE_NAME = 'youtube:show'
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
- webpage = self._download_webpage(
- 'https://www.youtube.com/show/%s/playlists' % playlist_id, playlist_id, 'Downloading show webpage')
- # There's one playlist for each season of the show
- m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
- self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
- entries = [
- self.url_result(
- 'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
- for season in m_seasons
- ]
- title = self._og_search_title(webpage, fatal=False)
-
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'title': title,
- 'entries': entries,
- }
+        # Extract a show by delegating to the generic playlists-page extractor
+        # on the show's "/playlists" page (one playlist per season).
+        playlist_id = self._match_id(url)
+        result = super(YoutubeShowIE, self)._real_extract(
+            'https://www.youtube.com/show/%s/playlists' % playlist_id)
+        # The parent re-derives the id from the URL it is given, and this
+        # class's _VALID_URL id group ([^?#]*) also matches "/", so the id it
+        # reports is "<show>/playlists". Restore the plain show id, as the
+        # result reported before this refactoring.
+        # NOTE(review): assumes the parent returns a mapping (playlist_result).
+        return dict(result, id=playlist_id)
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):