[internetvideoarchive] extract all formats

[youtube-dl] / youtube_dl / extractor / vimeo.py
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py

index 1f163d6a4ce1b8b0069ff5d2bce2bfa889a49709..7e854f3265eac3312f1b63199ce8633a17eb7d04 100644 (file)
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -16,6 +16,7 @@ from ..utils import (
      ExtractorError,
      InAdvancePagedList,
      int_or_none,
+    NO_DEFAULT,
      RegexNotFoundError,
      sanitized_Request,
      smuggle_url,
@@ -56,6 +57,26 @@ class VimeoBaseInfoExtractor(InfoExtractor):
          self._set_vimeo_cookie('vuid', vuid)
          self._download_webpage(login_request, None, False, 'Wrong login info')
  
+    def _verify_video_password(self, url, video_id, webpage):
+        password = self._downloader.params.get('videopassword')
+        if password is None:
+            raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
+        token, vuid = self._extract_xsrft_and_vuid(webpage)
+        data = urlencode_postdata({
+            'password': password,
+            'token': token,
+        })
+        if url.startswith('http://'):
+            # vimeo only supports https now, but the user can give an http url
+            url = url.replace('http://', 'https://')
+        password_request = sanitized_Request(url + '/password', data)
+        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        password_request.add_header('Referer', url)
+        self._set_vimeo_cookie('vuid', vuid)
+        return self._download_webpage(
+            password_request, video_id,
+            'Verifying the password', 'Wrong password')
+
      def _extract_xsrft_and_vuid(self, webpage):
          xsrft = self._search_regex(
              r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
@@ -146,7 +167,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                              \.
                          )?
                          vimeo(?P<pro>pro)?\.com/
-                        (?!channels/[^/?#]+/?(?:$|[?#])|[^/]+/review/|(?:album|ondemand)/)
+                        (?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
                          (?:.*?/)?
                          (?:
                              (?:
@@ -314,6 +335,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
              'url': 'https://vimeo.com/groups/travelhd/videos/22439234',
              'only_matching': True,
          },
+        {
+            'url': 'https://vimeo.com/album/2632481/video/79010983',
+            'only_matching': True,
+        },
          {
              # source file returns 403: Forbidden
              'url': 'https://vimeo.com/7809605',
@@ -339,26 +364,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
              r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
          if mobj:
              return mobj.group(1)
-
-    def _verify_video_password(self, url, video_id, webpage):
-        password = self._downloader.params.get('videopassword')
-        if password is None:
-            raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
-        token, vuid = self._extract_xsrft_and_vuid(webpage)
-        data = urlencode_postdata({
-            'password': password,
-            'token': token,
-        })
-        if url.startswith('http://'):
-            # vimeo only supports https now, but the user can give an http url
-            url = url.replace('http://', 'https://')
-        password_request = sanitized_Request(url + '/password', data)
-        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        password_request.add_header('Referer', url)
-        self._set_vimeo_cookie('vuid', vuid)
-        return self._download_webpage(
-            password_request, video_id,
-            'Verifying the password', 'Wrong password')
+        # Also look for a non-standard embedded Vimeo player
+        mobj = re.search(
+            r'<video[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)(?P=q1)', webpage)
+        if mobj:
+            return mobj.group('url')
  
      def _verify_player_video_password(self, url, video_id):
          password = self._downloader.params.get('videopassword')
@@ -651,8 +661,21 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
                  webpage = self._login_list_password(page_url, list_id, webpage)
                  yield self._extract_list_title(webpage)
  
-            for video_id in re.findall(r'id="clip_(\d+?)"', webpage):
-                yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo')
+            # Try extracting the href first, since not all videos are available via
+            # the short https://vimeo.com/<id> URL (e.g. https://vimeo.com/channels/tributes/6213729)
+            clips = re.findall(
+                r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)', webpage)
+            if clips:
+                for video_id, video_url in clips:
+                    yield self.url_result(
+                        compat_urlparse.urljoin(base_url, video_url),
+                        VimeoIE.ie_key(), video_id=video_id)
+            # More relaxed fallback
+            else:
+                for video_id in re.findall(r'id=["\']clip_(\d+)', webpage):
+                    yield self.url_result(
+                        'https://vimeo.com/%s' % video_id,
+                        VimeoIE.ie_key(), video_id=video_id)
  
              if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
                  break
@@ -689,7 +712,7 @@ class VimeoUserIE(VimeoChannelIE):
  
  class VimeoAlbumIE(VimeoChannelIE):
      IE_NAME = 'vimeo:album'
-    _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)'
+    _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)(?:$|[?#]|/(?!video))'
      _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
      _TESTS = [{
          'url': 'https://vimeo.com/album/2632481',
@@ -709,6 +732,13 @@ class VimeoAlbumIE(VimeoChannelIE):
          'params': {
              'videopassword': 'youtube-dl',
          }
+    }, {
+        'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail',
+        'only_matching': True,
+    }, {
+        # TODO: respect page number
+        'url': 'https://vimeo.com/album/2632481/page:2/sort:plays/format:thumbnail',
+        'only_matching': True,
      }]
  
      def _page_url(self, base_url, pagenum):
@@ -767,12 +797,39 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
              'thumbnail': 're:^https?://.*\.jpg$',
              'uploader_id': 'user22258446',
          }
+    }, {
+        'note': 'Password protected',
+        'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde',
+        'info_dict': {
+            'id': '138823582',
+            'ext': 'mp4',
+            'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1',
+            'uploader': 'TMB',
+            'uploader_id': 'user37284429',
+        },
+        'params': {
+            'videopassword': 'holygrail',
+        },
      }]
  
+    def _real_initialize(self):
+        self._login()
+
+    def _get_config_url(self, webpage_url, video_id, video_password_verified=False):
+        webpage = self._download_webpage(webpage_url, video_id)
+        config_url = self._html_search_regex(
+            r'data-config-url="([^"]+)"', webpage, 'config URL',
+            default=NO_DEFAULT if video_password_verified else None)
+        if config_url is None:
+            self._verify_video_password(webpage_url, video_id, webpage)
+            config_url = self._get_config_url(
+                webpage_url, video_id, video_password_verified=True)
+        return config_url
+
      def _real_extract(self, url):
          video_id = self._match_id(url)
-        config = self._download_json(
-            'https://player.vimeo.com/video/%s/config' % video_id, video_id)
+        config_url = self._get_config_url(url, video_id)
+        config = self._download_json(config_url, video_id)
          info_dict = self._parse_config(config, video_id)
          self._vimeo_sort_formats(info_dict['formats'])
          info_dict['id'] = video_id