From: Sergey M․ Date: Sun, 12 Apr 2015 17:19:00 +0000 (+0600) Subject: [youtube] Extract video titles for channel playlist if possible (Closes #4971) X-Git-Url: http://git.bitcoin.ninja/?a=commitdiff_plain;h=fb69240ca0934299583bf6c7a855d5c602a4a7e0;p=youtube-dl [youtube] Extract video titles for channel playlist if possible (Closes #4971) --- diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2774ec30b..791e1fe62 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1370,10 +1370,18 @@ class YoutubeChannelIE(InfoExtractor): def extract_videos_from_page(self, page): ids_in_page = [] - for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - return ids_in_page + titles_in_page = [] + for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page): + video_id = mobj.group('id') + video_title = unescapeHTML(mobj.group('title')) + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + return zip(ids_in_page, titles_in_page) def _real_extract(self, url): channel_id = self._match_id(url) @@ -1390,10 +1398,12 @@ class YoutubeChannelIE(InfoExtractor): if autogenerated: # The videos are contained in a single page # the ajax pages can't be used, they are empty - video_ids = self.extract_videos_from_page(channel_page) + videos = self.extract_videos_from_page(channel_page) entries = [ - self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in video_ids] + self.url_result( + video_id, 'Youtube', video_id=video_id, + video_title=video_title) + for video_id, video_title in videos] return self.playlist_result(entries, channel_id) def _entries(): @@ -1401,9 +1411,10 @@ class YoutubeChannelIE(InfoExtractor): for pagenum 
in itertools.count(1): ids_in_page = self.extract_videos_from_page(content_html) - for video_id in ids_in_page: + for video_id, video_title in ids_in_page: yield self.url_result( - video_id, 'Youtube', video_id=video_id) + video_id, 'Youtube', video_id=video_id, + video_title=video_title) mobj = re.search( r'data-uix-load-more-href="/?(?P<more>[^"]+)"',