X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fvimeo.py;h=309a47bf010f301872c1ed4d28e21360146c1a8b;hp=71c30d2cde54f11802f1e187160ae48c0ea88423;hb=dcdb292fddc82ae11f4c0b647815a45c88a6b6d5;hpb=49dea4913bea3b8e5c7d65dd932aa68ada526088 diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 71c30d2cd..309a47bf0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import json @@ -8,14 +8,15 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_str, compat_urlparse, ) from ..utils import ( determine_ext, - encode_dict, ExtractorError, InAdvancePagedList, int_or_none, + NO_DEFAULT, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -25,6 +26,7 @@ from ..utils import ( urlencode_postdata, unescapeHTML, parse_filesize, + try_get, ) @@ -42,19 +44,39 @@ class VimeoBaseInfoExtractor(InfoExtractor): self.report_login() webpage = self._download_webpage(self._LOGIN_URL, None, False) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata(encode_dict({ + data = urlencode_postdata({ 'action': 'login', 'email': username, 'password': password, 'service': 'vimeo', 'token': token, - })) + }) login_request = sanitized_Request(self._LOGIN_URL, data) login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_request.add_header('Referer', self._LOGIN_URL) self._set_vimeo_cookie('vuid', vuid) self._download_webpage(login_request, None, False, 'Wrong login info') + def _verify_video_password(self, url, video_id, webpage): + password = self._downloader.params.get('videopassword') + if password is None: + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + token, vuid = self._extract_xsrft_and_vuid(webpage) + data = urlencode_postdata({ + 'password': password, + 'token': token, + }) + if url.startswith('http://'): + # vimeo only supports https now, but the user can give an http url + url = url.replace('http://', 'https://') + password_request = sanitized_Request(url + '/password', data) + password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Referer', url) + self._set_vimeo_cookie('vuid', vuid) + return self._download_webpage( + password_request, video_id, + 'Verifying the password', 'Wrong password') + def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( r'(?:(?P["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P["\'])(?P.+?)(?P=q)', @@ -67,6 +89,69 @@ class VimeoBaseInfoExtractor(InfoExtractor): def _set_vimeo_cookie(self, name, value): self._set_cookie('vimeo.com', name, value) + def _vimeo_sort_formats(self, formats): + # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps + # at the same time without actual units specified. This lead to wrong sorting. + self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id')) + + def _parse_config(self, config, video_id): + # Extract title + video_title = config['video']['title'] + + # Extract uploader, uploader_url and uploader_id + video_uploader = config['video'].get('owner', {}).get('name') + video_uploader_url = config['video'].get('owner', {}).get('url') + video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None + + # Extract video thumbnail + video_thumbnail = config['video'].get('thumbnail') + if video_thumbnail is None: + video_thumbs = config['video'].get('thumbs') + if video_thumbs and isinstance(video_thumbs, dict): + _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] + + # Extract video duration + video_duration = int_or_none(config['video'].get('duration')) + + formats = [] + config_files = config['video'].get('files') or config['request'].get('files', {}) + for f in config_files.get('progressive', []): + video_url = f.get('url') + if not video_url: + continue + formats.append({ + 'url': video_url, + 'format_id': 'http-%s' % f.get('quality'), + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'fps': int_or_none(f.get('fps')), + 'tbr': int_or_none(f.get('bitrate')), + }) + m3u8_url = config_files.get('hls', {}).get('url') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + subtitles = {} + text_tracks = config['request'].get('text_tracks') + if text_tracks: + for tt in text_tracks: + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': 'https://vimeo.com' + tt['url'], + }] + + return { + 'title': video_title, + 'uploader': video_uploader, + 'uploader_id': video_uploader_id, + 'uploader_url': video_uploader_url, + 'thumbnail': video_thumbnail, + 'duration': video_duration, + 'formats': formats, + 'subtitles': subtitles, + } + class VimeoIE(VimeoBaseInfoExtractor): """Information extractor for vimeo.com.""" @@ -82,7 +167,7 @@ class VimeoIE(VimeoBaseInfoExtractor): \. )? vimeo(?Ppro)?\.com/ - (?!channels/[^/?#]+/?(?:$|[?#])|(?:album|ondemand)/) + (?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?:.*?/)? (?: (?: @@ -91,6 +176,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? (?:videos?/)? (?P[0-9]+) + (?:/[\da-f]+)? /?(?:[?&].*)?(?:[#].*)?$ ''' IE_NAME = 'vimeo' @@ -153,7 +239,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people\u2026', + 'description': 'This is "youtube-dl password protected test video" by on Vimeo, the home for high quality videos and the people who love them.', }, 'params': { 'videopassword': 'youtube-dl', @@ -162,8 +248,6 @@ class VimeoIE(VimeoBaseInfoExtractor): { 'url': 'http://vimeo.com/channels/keypeele/75629013', 'md5': '2f86a05afe9d7abc0b9126d229bbe15d', - 'note': 'Video is freely available via original URL ' - 'and protected with password when accessed via http://vimeo.com/75629013', 'info_dict': { 'id': '75629013', 'ext': 'mp4', @@ -207,7 +291,7 @@ class VimeoIE(VimeoBaseInfoExtractor): { # contains original format 'url': 'https://vimeo.com/33951933', - 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + 'md5': '2d9f5475e0537f013d0073e812ab89e6', 'info_dict': { 'id': '33951933', 'ext': 'mp4', @@ -219,6 +303,29 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': 'md5:ae23671e82d05415868f7ad1aec21147', }, }, + { + # only available via https://vimeo.com/channels/tributes/6213729 and + # not via https://vimeo.com/6213729 + 'url': 'https://vimeo.com/channels/tributes/6213729', + 'info_dict': { + 'id': '6213729', + 'ext': 'mp4', + 'title': 'Vimeo Tribute: The Shining', + 'uploader': 'Casey Donahue', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/caseydonahue', + 'uploader_id': 'caseydonahue', + 'upload_date': '20090821', + 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, + { + 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', + 'only_matching': True, + }, { 'url': 'https://vimeo.com/109815029', 'note': 'Video not completely processed, "failed" seed status', @@ -228,13 +335,25 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/groups/travelhd/videos/22439234', 'only_matching': True, }, + { + 'url': 'https://vimeo.com/album/2632481/video/79010983', + 'only_matching': True, + }, { # source file returns 403: Forbidden 'url': 'https://vimeo.com/7809605', 'only_matching': True, }, + { + 'url': 'https://vimeo.com/160743502/abd0e13fb4', + 'only_matching': True, + } ] + @staticmethod + def _smuggle_referrer(url, referrer_url): + return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) + @staticmethod def _extract_vimeo_url(url, webpage): # Look for embedded (iframe) Vimeo player @@ -242,46 +361,30 @@ class VimeoIE(VimeoBaseInfoExtractor): r']+?src=(["\'])(?P(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) if mobj: player_url = unescapeHTML(mobj.group('url')) - surl = smuggle_url(player_url, {'http_headers': {'Referer': url}}) - return surl + return VimeoIE._smuggle_referrer(player_url, url) # Look for embedded (swf embed) Vimeo player mobj = re.search( r']+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) if mobj: return mobj.group(1) - - def _verify_video_password(self, url, video_id, webpage): - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata(encode_dict({ - 'password': password, - 'token': token, - })) - if url.startswith('http://'): - # vimeo only supports https now, but the user can give an http url - url = url.replace('http://', 'https://') - password_request = sanitized_Request(url + '/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) - self._set_vimeo_cookie('vuid', vuid) - return self._download_webpage( - password_request, video_id, - 'Verifying the password', 'Wrong password') + # Look more for non-standard embedded Vimeo player + mobj = re.search( + r']+src=(?P[\'"])(?P(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)(?P=q1)', webpage) + if mobj: + return mobj.group('url') def _verify_player_video_password(self, url, video_id): password = self._downloader.params.get('videopassword') if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') - data = urlencode_postdata(encode_dict({'password': password})) + data = urlencode_postdata({'password': password}) pass_url = url + '/check-password' password_request = sanitized_Request(pass_url, data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Referer', url) return self._download_json( password_request, video_id, - 'Verifying the password', - 'Wrong password') + 'Verifying the password', 'Wrong password') def _real_initialize(self): self._login() @@ -300,7 +403,7 @@ class VimeoIE(VimeoBaseInfoExtractor): orig_url = url if mobj.group('pro') or mobj.group('player'): url = 'https://player.vimeo.com/video/' + video_id - else: + elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information @@ -378,28 +481,24 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password(url, video_id) - if '>You rented this title.<' in webpage: + def is_rented(): + if '>You rented this title.<' in webpage: + return True + if config.get('user', {}).get('purchased'): + return True + label = try_get( + config, lambda x: x['video']['vod']['purchase_options'][0]['label_string'], compat_str) + if label and label.startswith('You rented this'): + return True + return False + + if is_rented(): feature_id = config.get('video', {}).get('vod', {}).get('feature_id') if feature_id and not data.get('force_feature_id', False): return self.url_result(smuggle_url( 'https://player.vimeo.com/player/%s' % feature_id, {'force_feature_id': True}), 'Vimeo') - # Extract title - video_title = config['video']['title'] - - # Extract uploader, uploader_url and uploader_id - video_uploader = config['video'].get('owner', {}).get('name') - video_uploader_url = config['video'].get('owner', {}).get('url') - video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None - - # Extract video thumbnail - video_thumbnail = config['video'].get('thumbnail') - if video_thumbnail is None: - video_thumbs = config['video'].get('thumbs') - if video_thumbs and isinstance(video_thumbs, dict): - _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] - # Extract video description video_description = self._html_search_regex( @@ -419,9 +518,6 @@ class VimeoIE(VimeoBaseInfoExtractor): if not video_description and not mobj.group('player'): self._downloader.report_warning('Cannot find video description') - # Extract video duration - video_duration = int_or_none(config['video'].get('duration')) - # Extract upload date video_upload_date = None mobj = re.search(r']+datetime="([^"]+)"', webpage) @@ -459,53 +555,22 @@ class VimeoIE(VimeoBaseInfoExtractor): 'format_id': source_name, 'preference': 1, }) - config_files = config['video'].get('files') or config['request'].get('files', {}) - for f in config_files.get('progressive', []): - video_url = f.get('url') - if not video_url: - continue - formats.append({ - 'url': video_url, - 'format_id': 'http-%s' % f.get('quality'), - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'fps': int_or_none(f.get('fps')), - 'tbr': int_or_none(f.get('bitrate')), - }) - m3u8_url = config_files.get('hls', {}).get('url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps - # at the same time without actual units specified. This lead to wrong sorting. - self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id')) - subtitles = {} - text_tracks = config['request'].get('text_tracks') - if text_tracks: - for tt in text_tracks: - subtitles[tt['lang']] = [{ - 'ext': 'vtt', - 'url': 'https://vimeo.com' + tt['url'], - }] - - return { + info_dict = self._parse_config(config, video_id) + formats.extend(info_dict['formats']) + self._vimeo_sort_formats(formats) + info_dict.update({ 'id': video_id, - 'uploader': video_uploader, - 'uploader_url': video_uploader_url, - 'uploader_id': video_uploader_id, + 'formats': formats, 'upload_date': video_upload_date, - 'title': video_title, - 'thumbnail': video_thumbnail, 'description': video_description, - 'duration': video_duration, - 'formats': formats, 'webpage_url': url, 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, - 'subtitles': subtitles, - } + }) + + return info_dict class VimeoOndemandIE(VimeoBaseInfoExtractor): @@ -523,6 +588,20 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/gumfilms', 'uploader_id': 'gumfilms', }, + }, { + # requires Referer to be passed along with og:video:url + 'url': 'https://vimeo.com/ondemand/36938/126682985', + 'info_dict': { + 'id': '126682985', + 'ext': 'mp4', + 'title': 'Rävlock, rätt läte på rätt plats', + 'uploader': 'Lindroth & Norin', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user14430847', + 'uploader_id': 'user14430847', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, @@ -537,7 +616,12 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - return self.url_result(self._og_search_video_url(webpage), VimeoIE.ie_key()) + return self.url_result( + # Some videos require Referer to be passed along with og:video:url + # similarly to generic vimeo embeds (e.g. + # https://vimeo.com/ondemand/36938/126682985). + VimeoIE._smuggle_referrer(self._og_search_video_url(webpage), url), + VimeoIE.ie_key()) class VimeoChannelIE(VimeoBaseInfoExtractor): @@ -575,7 +659,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): token, vuid = self._extract_xsrft_and_vuid(webpage) fields['token'] = token fields['password'] = password - post = urlencode_postdata(encode_dict(fields)) + post = urlencode_postdata(fields) password_path = self._search_regex( r'action="([^"]+)"', login_form, 'password URL') password_url = compat_urlparse.urljoin(page_url, password_path) @@ -599,8 +683,21 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): webpage = self._login_list_password(page_url, list_id, webpage) yield self._extract_list_title(webpage) - for video_id in re.findall(r'id="clip_(\d+?)"', webpage): - yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo') + # Try extracting href first since not all videos are available via + # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729) + clips = re.findall( + r'id="clip_(\d+)"[^>]*>\s*]+href="(/(?:[^/]+/)*\1)', webpage) + if clips: + for video_id, video_url in clips: + yield self.url_result( + compat_urlparse.urljoin(base_url, video_url), + VimeoIE.ie_key(), video_id=video_id) + # More relaxed fallback + else: + for video_id in re.findall(r'id=["\']clip_(\d+)', webpage): + yield self.url_result( + 'https://vimeo.com/%s' % video_id, + VimeoIE.ie_key(), video_id=video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break @@ -637,7 +734,7 @@ class VimeoUserIE(VimeoChannelIE): class VimeoAlbumIE(VimeoChannelIE): IE_NAME = 'vimeo:album' - _VALID_URL = r'https://vimeo\.com/album/(?P\d+)' + _VALID_URL = r'https://vimeo\.com/album/(?P\d+)(?:$|[?#]|/(?!video))' _TITLE_RE = r'