[vimeo:likes] Support large like lists (Fixes #3847)

[youtube-dl] / youtube_dl / extractor / vimeo.py
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py

index bc01d7fbf583eb44ccc708c7fb00b4b113f82bc0..403d0bb28ebb4e3d3f1d93af46452485dfe44bb6 100644 (file)
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -8,17 +8,19 @@ import itertools
  from .common import InfoExtractor
  from .subtitles import SubtitlesInfoExtractor
  from ..utils import (
+    clean_html,
      compat_HTTPError,
      compat_urllib_parse,
      compat_urllib_request,
-    clean_html,
-    get_element_by_attribute,
+    compat_urlparse,
      ExtractorError,
+    get_element_by_attribute,
+    InAdvancePagedList,
+    int_or_none,
      RegexNotFoundError,
      std_headers,
      unsmuggle_url,
      urlencode_postdata,
-    int_or_none,
  )
  
  
@@ -529,3 +531,72 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
  
      def _real_extract(self, url):
          return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater')
+
+
+class VimeoLikesIE(InfoExtractor):
+    """Extract the videos liked by a Vimeo user as a playlist."""
+
+    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'
+    IE_NAME = 'vimeo:likes'
+    IE_DESC = 'Vimeo user likes'
+    _TEST = {
+        'url': 'https://vimeo.com/user755559/likes/',
+        'playlist_mincount': 293,
+        "info_dict": {
+            "description": "See all the videos urza likes",
+            "title": 'Videos urza likes',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Build the playlist of liked videos for the user in *url*.
+
+        The likes page itself provides the title, the description and
+        the page count; every further page is downloaded inside
+        ``_get_page``, which is handed to ``InAdvancePagedList``.
+        """
+        user_id = self._match_id(url)
+        webpage = self._download_webpage(url, user_id)
+        # The page count is the data-page value of the link right before
+        # the "pagination_next" item.  When that markup is missing
+        # (presumably a like list that fits on one page -- confirm against
+        # the live site), assume a single page instead of aborting.
+        page_count = self._int(
+            self._search_regex(
+                r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)">
+                    .*?</a></li>\s*<li\s+class="pagination_next">
+                ''', webpage, 'page count', default=1),
+            'page count', fatal=True)
+        PAGE_SIZE = 12
+        title = self._html_search_regex(
+            r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False)
+        description = self._html_search_meta('description', webpage)
+
+        def _get_page(idx):
+            # The URL uses idx + 1 as page number (idx is presumably zero-based).
+            page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % (
+                self.http_scheme(), user_id, idx + 1)
+            webpage = self._download_webpage(
+                page_url, user_id,
+                note='Downloading page %d/%d' % (idx + 1, page_count))
+            video_list = self._search_regex(
+                r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>',
+                webpage, 'video content')
+            paths = re.findall(
+                r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list)
+            for path in paths:
+                yield {
+                    '_type': 'url',
+                    # The href may be relative, so resolve it against the page URL.
+                    'url': compat_urlparse.urljoin(page_url, path),
+                }
+
+        pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
+
+        return {
+            '_type': 'playlist',
+            'id': 'user%s_likes' % user_id,
+            'title': title,
+            'description': description,
+            'entries': pl,
+        }