[youtube] Generalize playlist entries extraction (Closes #6699, closes #6992)

author Sergey M․ <dstftw@gmail.com>

Sat, 17 Oct 2015 18:11:34 +0000 (00:11 +0600)

committer Sergey M․ <dstftw@gmail.com>

Sat, 17 Oct 2015 18:11:34 +0000 (00:11 +0600)
author Sergey M․ <dstftw@gmail.com>
Sat, 17 Oct 2015 18:11:34 +0000 (00:11 +0600)
committer Sergey M․ <dstftw@gmail.com>
Sat, 17 Oct 2015 18:11:34 +0000 (00:11 +0600)
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index b252e36e1162406dedfcc531d7d038e6bd357348..08e821362eaf04c9fac3fbb948774abe87554a20 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
              return
  
  
+class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
+    # Extract the video ids from the playlist pages
+    def _entries(self, page, playlist_id):
+        more_widget_html = content_html = page
+        for page_num in itertools.count(1):
+            for video_id, video_title in self.extract_videos_from_page(content_html):
+                yield self.url_result(
+                    video_id, 'Youtube', video_id=video_id,
+                    video_title=video_title)
+
+            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            if not mobj:
+                break
+
+            more = self._download_json(
+                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                'Downloading page #%s' % page_num,
+                transform_source=uppercase_escape)
+            content_html = more['content_html']
+            if not content_html.strip():
+                # Some webpages show a "Load more" button but they don't
+                # have more videos
+                break
+            more_widget_html = more['load_more_widget_html']
+
+    def extract_videos_from_page(self, page):
+        ids_in_page = []
+        titles_in_page = []
+        for mobj in re.finditer(self._VIDEO_RE, page):
+            # The link with index 0 is not the first video of the playlist (not sure if still actual)
+            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
+                continue
+            video_id = mobj.group('id')
+            video_title = unescapeHTML(mobj.group('title'))
+            if video_title:
+                video_title = video_title.strip()
+            try:
+                idx = ids_in_page.index(video_id)
+                if video_title and not titles_in_page[idx]:
+                    titles_in_page[idx] = video_title
+            except ValueError:
+                ids_in_page.append(video_id)
+                titles_in_page.append(video_title)
+        return zip(ids_in_page, titles_in_page)
+
+
  class YoutubeIE(YoutubeBaseInfoExtractor):
      IE_DESC = 'YouTube.com'
      _VALID_URL = r"""(?x)^
@@ -1419,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          }
  
  
-class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor):
      IE_DESC = 'YouTube.com playlists'
      _VALID_URL = r"""(?x)(?:
                          (?:https?://)?
@@ -1440,7 +1486,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                          ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                       )"""
      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
-    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
+    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
      IE_NAME = 'youtube:playlist'
      _TESTS = [{
          'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
@@ -1557,37 +1603,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
              else:
                  self.report_warning('Youtube gives an alert message: ' + match)
  
-        # Extract the video ids from the playlist pages
-        def _entries():
-            more_widget_html = content_html = page
-            for page_num in itertools.count(1):
-                matches = re.finditer(self._VIDEO_RE, content_html)
-                # We remove the duplicates and the link with index 0
-                # (it's not the first video of the playlist)
-                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
-                for vid_id in new_ids:
-                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
-
-                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-                if not mobj:
-                    break
-
-                more = self._download_json(
-                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
-                    'Downloading page #%s' % page_num,
-                    transform_source=uppercase_escape)
-                content_html = more['content_html']
-                if not content_html.strip():
-                    # Some webpages show a "Load more" button but they don't
-                    # have more videos
-                    break
-                more_widget_html = more['load_more_widget_html']
-
          playlist_title = self._html_search_regex(
              r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
              page, 'title')
  
-        return self.playlist_result(_entries(), playlist_id, playlist_title)
+        return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
  
      def _real_extract(self, url):
          # Extract playlist id
@@ -1613,10 +1633,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
          return self._extract_playlist(playlist_id)
  
  
-class YoutubeChannelIE(InfoExtractor):
+class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
      IE_DESC = 'YouTube.com channels'
      _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
      _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
+    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
      IE_NAME = 'youtube:channel'
      _TESTS = [{
          'note': 'paginated channel',
@@ -1627,22 +1648,6 @@ class YoutubeChannelIE(InfoExtractor):
          }
      }]
  
-    @staticmethod
-    def extract_videos_from_page(page):
-        ids_in_page = []
-        titles_in_page = []
-        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
-            video_id = mobj.group('id')
-            video_title = unescapeHTML(mobj.group('title'))
-            try:
-                idx = ids_in_page.index(video_id)
-                if video_title and not titles_in_page[idx]:
-                    titles_in_page[idx] = video_title
-            except ValueError:
-                ids_in_page.append(video_id)
-                titles_in_page.append(video_title)
-        return zip(ids_in_page, titles_in_page)
-
      def _real_extract(self, url):
          channel_id = self._match_id(url)
  
@@ -1685,29 +1690,7 @@ class YoutubeChannelIE(InfoExtractor):
                  for video_id, video_title in self.extract_videos_from_page(channel_page)]
              return self.playlist_result(entries, channel_id)
  
-        def _entries():
-            more_widget_html = content_html = channel_page
-            for pagenum in itertools.count(1):
-
-                for video_id, video_title in self.extract_videos_from_page(content_html):
-                    yield self.url_result(
-                        video_id, 'Youtube', video_id=video_id,
-                        video_title=video_title)
-
-                mobj = re.search(
-                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
-                    more_widget_html)
-                if not mobj:
-                    break
-
-                more = self._download_json(
-                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
-                    'Downloading page #%s' % (pagenum + 1),
-                    transform_source=uppercase_escape)
-                content_html = more['content_html']
-                more_widget_html = more['load_more_widget_html']
-
-        return self.playlist_result(_entries(), channel_id)
+        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
  
  
  class YoutubeUserIE(YoutubeChannelIE):
author	Sergey M․ <dstftw@gmail.com>
	Sat, 17 Oct 2015 18:11:34 +0000 (00:11 +0600)
committer	Sergey M․ <dstftw@gmail.com>
	Sat, 17 Oct 2015 18:11:34 +0000 (00:11 +0600)