Add infrastructure for paged lists
author Philipp Hagemeister <phihag@phihag.de>
Mon, 20 Jan 2014 10:36:47 +0000 (11:36 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Mon, 20 Jan 2014 10:36:47 +0000 (11:36 +0100)
This commit allows downloading pages in playlists on demand instead of all at once.
Before this commit,
    youtube-dl http://www.youtube.com/user/ANNnewsCH/videos --playlist-end 2 --skip-download
took quite some time - now it's almost instantaneous.
As an example, the youtube:user extractor has been converted.
Fixes #2175
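
As an illustration of the pattern (a sketch, not part of this commit; the page
function and page size below are made up), an extractor can now hand back a
PagedList instead of a fully materialized list of entries:

    def _fetch_page(pagenum):
        # Hypothetical per-page fetch; only invoked for pages that the
        # requested slice actually touches.
        for video_id in api_page(pagenum):  # api_page is a stand-in
            yield {'_type': 'url', 'url': video_id, 'ie_key': 'Youtube'}

    entries = PagedList(_fetch_page, 50)  # 50 ids per page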

test/test_utils.py
youtube_dl/YoutubeDL.py
youtube_dl/extractor/youtube.py
youtube_dl/utils.py

diff --git a/test/test_utils.py b/test/test_utils.py
index bee355ee0e0605a5134dc37b8556e9e233728902..349c1107f4c123fd043682428b36def8690708cb 100644
@@ -18,6 +18,7 @@ from youtube_dl.utils import (
     find_xpath_attr,
     get_meta_content,
     orderedSet,
+    PagedList,
     parse_duration,
     sanitize_filename,
     shell_quote,
@@ -200,5 +201,26 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(parse_duration('9:12:43'), 33163)
         self.assertEqual(parse_duration('x:y'), None)
 
+    def test_paged_list(self):
+        def testPL(size, pagesize, sliceargs, expected):
+            def get_page(pagenum):
+                firstid = pagenum * pagesize
+                upto = min(size, pagenum * pagesize + pagesize)
+                for i in range(firstid, upto):
+                    yield i
+
+            pl = PagedList(get_page, pagesize)
+            got = pl.getslice(*sliceargs)
+            self.assertEqual(got, expected)
+
+        testPL(5, 2, (), [0, 1, 2, 3, 4])
+        testPL(5, 2, (1,), [1, 2, 3, 4])
+        testPL(5, 2, (2,), [2, 3, 4])
+        testPL(5, 2, (4,), [4])
+        testPL(5, 2, (0, 3), [0, 1, 2])
+        testPL(5, 2, (1, 4), [1, 2, 3])
+        testPL(5, 2, (2, 99), [2, 3, 4])
+        testPL(5, 2, (20, 99), [])
+
 if __name__ == '__main__':
     unittest.main()
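
The tests above pin down the slice semantics; the laziness itself can be seen
by counting page fetches (a minimal sketch using the same PagedList):

    fetched = []

    def get_page(pagenum):
        fetched.append(pagenum)
        return range(pagenum * 2, pagenum * 2 + 2)  # endless 2-item pages

    pl = PagedList(get_page, 2)
    assert pl.getslice(0, 3) == [0, 1, 2]
    assert fetched == [0, 1]  # pages 2, 3, ... are never requested
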
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index a0ab89b3d6c0193fa35eee2e56cbc5b86a314e68..2ad6f10286784585865e1445d223f39a2e5d31d7 100644
@@ -39,6 +39,7 @@ from .utils import (
     locked_file,
     make_HTTPS_handler,
     MaxDownloadsReached,
+    PagedList,
     PostProcessingError,
     platform_name,
     preferredencoding,
@@ -575,19 +576,27 @@ class YoutubeDL(object):
 
             playlist_results = []
 
-            n_all_entries = len(ie_result['entries'])
             playliststart = self.params.get('playliststart', 1) - 1
             playlistend = self.params.get('playlistend', None)
             # For backwards compatibility, interpret -1 as whole list
             if playlistend == -1:
                 playlistend = None
 
-            entries = ie_result['entries'][playliststart:playlistend]
-            n_entries = len(entries)
-
-            self.to_screen(
-                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
-                (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            if isinstance(ie_result['entries'], list):
+                n_all_entries = len(ie_result['entries'])
+                entries = ie_result['entries'][playliststart:playlistend]
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
+                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            else:
+                assert isinstance(ie_result['entries'], PagedList)
+                entries = ie_result['entries'].getslice(
+                    playliststart, playlistend)
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Downloading %d videos" %
+                    (ie_result['extractor'], playlist, n_entries))
 
             for i, entry in enumerate(entries, 1):
                 self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
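
For reference, the index translation in the branch above (a sketch with
hypothetical parameter values): playliststart is 1-based on the command line
but 0-based for getslice, and playlistend passes through as an exclusive bound:

    params = {'playliststart': 1, 'playlistend': 2}      # --playlist-end 2
    playliststart = params.get('playliststart', 1) - 1   # 0, inclusive
    playlistend = params.get('playlistend', None)        # 2, exclusive
    # entries.getslice(0, 2) then touches only the first page, which is
    # why the youtube:user command from the commit message is fast now.
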
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 248b30ffb329d3870e0249e4d775c3f969849ce8..dd1a58f3fc0f5448fdf1266ef0b51840a6307d8e 100644
@@ -27,6 +27,7 @@ from ..utils import (
     get_element_by_id,
     get_element_by_attribute,
     ExtractorError,
+    PagedList,
     RegexNotFoundError,
     unescapeHTML,
     unified_strdate,
@@ -1580,44 +1581,35 @@ class YoutubeUserIE(InfoExtractor):
         # page by page until there are no video ids - it means we got
         # all of them.
 
-        url_results = []
-
-        for pagenum in itertools.count(0):
+        def download_page(pagenum):
             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
 
             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
-            page = self._download_webpage(gdata_url, username,
-                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
+            page = self._download_webpage(
+                gdata_url, username,
+                u'Downloading video ids from %d to %d' % (
+                    start_index, start_index + self._GDATA_PAGE_SIZE))
 
             try:
                 response = json.loads(page)
             except ValueError as err:
                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
             if 'entry' not in response['feed']:
-                # Number of videos is a multiple of self._MAX_RESULTS
-                break
+                return
 
             # Extract video identifiers
             entries = response['feed']['entry']
             for entry in entries:
                 title = entry['title']['$t']
                 video_id = entry['id']['$t'].split('/')[-1]
-                url_results.append({
+                yield {
                     '_type': 'url',
                     'url': video_id,
                     'ie_key': 'Youtube',
                     'id': video_id,
                     'title': title,
-                })
-
-            # A little optimization - if current page is not
-            # "full", ie. does not contain PAGE_SIZE video ids then
-            # we can assume that this page is the last one - there
-            # are no more ids on further pages - no need to query
-            # again.
-
-            if len(entries) < self._GDATA_PAGE_SIZE:
-                break
+                }
+        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
 
         return self.playlist_result(url_results, playlist_title=username)
 
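
The same conversion pattern in isolation (a toy sketch, not youtube-dl code):
the page function is a generator that yields one page's results and simply
returns when the source is exhausted, where the old loop used break:

    def download_page(pagenum):
        DATA = list(range(7))  # stands in for the GData feed
        PAGE_SIZE = 3
        start = pagenum * PAGE_SIZE
        if start >= len(DATA):
            return  # empty page: PagedList stops iterating here
        for item in DATA[start:start + PAGE_SIZE]:
            yield {'_type': 'url', 'url': str(item), 'ie_key': 'Youtube'}

    results = PagedList(download_page, 3)
    print(results.getslice(0, 4))  # downloads pages 0 and 1 only
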
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 73fe1ad0a3a27165d3dffc61927733beb9c5ed33..ff124d9e8cbd42d04904e527750b7c3d94e74f59 100644
@@ -6,6 +6,7 @@ import datetime
 import email.utils
 import errno
 import gzip
+import itertools
 import io
 import json
 import locale
@@ -1161,3 +1162,46 @@ def check_executable(exe, args=[]):
     except OSError:
         return False
     return exe
+
+
+class PagedList(object):
+    def __init__(self, pagefunc, pagesize):
+        self._pagefunc = pagefunc
+        self._pagesize = pagesize
+
+    def getslice(self, start=0, end=None):
+        res = []
+        for pagenum in itertools.count(start // self._pagesize):
+            firstid = pagenum * self._pagesize
+            nextfirstid = pagenum * self._pagesize + self._pagesize
+            if start >= nextfirstid:
+                continue
+
+            page_results = list(self._pagefunc(pagenum))
+
+            startv = (
+                start % self._pagesize
+                if firstid <= start < nextfirstid
+                else 0)
+
+            endv = (
+                ((end - 1) % self._pagesize) + 1
+                if (end is not None and firstid <= end <= nextfirstid)
+                else None)
+
+            if startv != 0 or endv is not None:
+                page_results = page_results[startv:endv]
+            res.extend(page_results)
+
+            # A little optimization - if the current page is not "full",
+            # i.e. does not contain pagesize videos, we can assume that this
+            # page is the last one - there are no more ids on further pages,
+            # so there is no need to query again.
+            if len(page_results) + startv < self._pagesize:
+                break
+
+            # If we got the whole page, but the next page is not interesting,
+            # break out early as well
+            if end == nextfirstid:
+                break
+        return res
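
To make the startv/endv arithmetic concrete, a worked trace (a sketch reusing
the paging scheme from the tests: page size 2 over the values 0..4):

    pl = PagedList(lambda n: range(n * 2, min(5, n * 2 + 2)), 2)
    # pagenum 0: startv = 1 % 2 = 1, endv = None       -> [0, 1][1:] == [1]
    # pagenum 1: startv = 0, endv = ((4 - 1) % 2) + 1  -> [2, 3][:2] == [2, 3]
    # end == nextfirstid (4), so page 2 is never downloaded
    assert pl.getslice(1, 4) == [1, 2, 3]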