X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fvimeo.py;h=fb2bd225ab0b3c21b16b9a717475cfc42232d4e5;hb=ffa8f0df0a878463078467709f615b1e57c61ec1;hp=ef90fecc07b596c70f4ff6f3ff2cbb7c6fc86092;hpb=93b22c7828911668c503e868d6be053e8a0deb7c;p=youtube-dl diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ef90fecc0..fb2bd225a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -20,14 +20,14 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)/?(?:[?].*)?$' + _VALID_URL = r'(?Phttps?://)?(?:(?:www|(?Pplayer))\.)?vimeo(?Ppro)?\.com/(?:.*?/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)/?(?:[?].*)?(?:#.*)?$' _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' _TESTS = [ { - u'url': u'http://vimeo.com/56015672', + u'url': u'http://vimeo.com/56015672#at=0', u'file': u'56015672.mp4', - u'md5': u'ae7a1d8b183758a0506b0622f37dfa14', + u'md5': u'8879b6cc097e987f02484baf890129e5', u'info_dict': { u"upload_date": u"20121220", u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", @@ -128,11 +128,9 @@ class VimeoIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') - if not mobj.group('proto'): - url = 'https://' + url - elif mobj.group('pro'): + if mobj.group('pro') or mobj.group('player'): url = 'http://player.vimeo.com/video/' + video_id - elif mobj.group('direct_link'): + else: url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information @@ -153,7 +151,7 @@ class VimeoIE(InfoExtractor): config = json.loads(config_json) except RegexNotFoundError: # For pro videos or player.vimeo.com urls - config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], + config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'], webpage, u'info section', flags=re.DOTALL) config = json.loads(config) except Exception as e: @@ -198,6 +196,16 @@ class VimeoIE(InfoExtractor): if mobj is not None: video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) + try: + view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, u'view count')) + like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, u'like count')) + comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, u'comment count')) + except RegexNotFoundError: + # This info is only available in vimeo.com/{id} urls + view_count = None + like_count = None + comment_count = None + # Vimeo specific: extract request signature and timestamp sig = config['request']['signature'] timestamp = config['request']['timestamp'] @@ -205,7 +213,7 @@ class VimeoIE(InfoExtractor): # Vimeo specific: extract video codec and quality information # First consider quality, then codecs, then take everything codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')] - files = { 'hd': [], 'sd': [], 'other': []} + files = {'hd': [], 'sd': [], 'other': []} config_files = config["video"].get("files") or config["request"].get("files") for codec_name, codec_extension in codecs: for quality in config_files.get(codec_name, []): @@ -234,7 +242,7 @@ class VimeoIE(InfoExtractor): if len(formats) == 0: raise ExtractorError(u'No known codec found') - return [{ + return { 'id': video_id, 'uploader': video_uploader, 'uploader_id': video_uploader_id, @@ -243,32 +251,88 @@ class VimeoIE(InfoExtractor): 'thumbnail': video_thumbnail, 'description': video_description, 'formats': formats, - }] + 'webpage_url': url, + 'view_count': view_count, + 'like_count': like_count, + 'comment_count': comment_count, + } class VimeoChannelIE(InfoExtractor): IE_NAME = u'vimeo:channel' _VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P[^/]+)' _MORE_PAGES_INDICATOR = r']+?title="(.*?)"' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - video_ids = [] + def _page_url(self, base_url, pagenum): + return '%s/videos/page:%d/' % (base_url, pagenum) + + def _extract_list_title(self, webpage): + return self._html_search_regex(self._TITLE_RE, webpage, u'list title') + def _extract_videos(self, list_id, base_url): + video_ids = [] for pagenum in itertools.count(1): - webpage = self._download_webpage('http://vimeo.com/channels/%s/videos/page:%d' % (channel_id, pagenum), - channel_id, u'Downloading page %s' % pagenum) + webpage = self._download_webpage( + self._page_url(base_url, pagenum) ,list_id, + u'Downloading page %s' % pagenum) video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage)) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') for video_id in video_ids] - channel_title = self._html_search_regex(r'(.*?)' % channel_id, - webpage, u'channel title') return {'_type': 'playlist', - 'id': channel_id, - 'title': channel_title, + 'id': list_id, + 'title': self._extract_list_title(webpage), 'entries': entries, } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id) + + +class VimeoUserIE(VimeoChannelIE): + IE_NAME = u'vimeo:user' + _VALID_URL = r'(?:https?://)?vimeo.\com/(?P[^/]+)' + _TITLE_RE = r']+?class="user">([^<>]+?)' + + @classmethod + def suitable(cls, url): + if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url) or VimeoAlbumIE.suitable(url) or VimeoGroupsIE.suitable(url): + return False + return super(VimeoUserIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + return self._extract_videos(name, 'http://vimeo.com/%s' % name) + + +class VimeoAlbumIE(VimeoChannelIE): + IE_NAME = u'vimeo:album' + _VALID_URL = r'(?:https?://)?vimeo.\com/album/(?P\d+)' + _TITLE_RE = r'