[youtube:user] Work around 35-page limitation (Closes #5778)

author Sergey M․ <dstftw@gmail.com>

Sat, 30 May 2015 12:29:16 +0000 (18:29 +0600)

committer Sergey M․ <dstftw@gmail.com>

Sat, 30 May 2015 12:29:16 +0000 (18:29 +0600)
author Sergey M․ <dstftw@gmail.com>
Sat, 30 May 2015 12:29:16 +0000 (18:29 +0600)
committer Sergey M․ <dstftw@gmail.com>
Sat, 30 May 2015 12:29:16 +0000 (18:29 +0600)
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 0301682b8dd228cab336bc1e68eaf868660fd5c7..fcdbfe0bc959a011bebf8656184fe164b3eca84a 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1399,6 +1399,26 @@ class YoutubeChannelIE(InfoExtractor):
          channel_id = self._match_id(url)
  
          url = self._TEMPLATE_URL % channel_id
+
+        # Channel-by-page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778).
+        # Work around this by extracting the channel as a playlist if the channel playlist URL can be obtained;
+        # otherwise fall back on channel-by-page extraction.
+        channel_page = self._download_webpage(
+            url + '?view=57', channel_id,
+            'Downloading channel page', fatal=False)
+        channel_playlist_id = self._search_regex(
+            [r'<meta itemprop="channelId" content="([^"]+)">',
+             r'data-channel-external-id="([^"]+)"'],
+            channel_page, 'channel id', default=None)
+        if channel_playlist_id and channel_playlist_id.startswith('UC'):
+            playlist_id = 'UU' + channel_playlist_id[2:]
+            channel_playlist = unescapeHTML(self._search_regex(
+                r'href="/?(watch\?v=[0-9A-Za-z_-]{11}&amp;list=%s)"' % playlist_id,
+                channel_page, 'channel playlist URL', default=None))
+            if channel_playlist:
+                return self.url_result(
+                    compat_urlparse.urljoin(url, '/%s' % channel_playlist), 'YoutubePlaylist')
+
          channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
          autogenerated = re.search(r'''(?x)
                  class="[^"]*?(?:
author	Sergey M․ <dstftw@gmail.com>
	Sat, 30 May 2015 12:29:16 +0000 (18:29 +0600)
committer	Sergey M․ <dstftw@gmail.com>
	Sat, 30 May 2015 12:29:16 +0000 (18:29 +0600)