X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fvk.py;h=b50d4f170328728fbfc75b75e5a5ed6dcf281f84;hb=4c76aa06665621c7689938afd7bbdbc797b5c7ea;hp=9f7a593efed4a3851fe81cf4f184dfa57950307a;hpb=475f8a458099c64d367356471069bd0ff2bd1b0d;p=youtube-dl diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 9f7a593ef..b50d4f170 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -1,9 +1,8 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import collections import re -import json import sys from .common import InfoExtractor @@ -20,12 +19,13 @@ from ..utils import ( remove_start, str_to_int, unescapeHTML, - unified_strdate, + unified_timestamp, urlencode_postdata, ) from .dailymotion import DailymotionIE from .pladform import PladformIE from .vimeo import VimeoIE +from .youtube import YoutubeIE class VKBaseIE(InfoExtractor): @@ -67,7 +67,7 @@ class VKBaseIE(InfoExtractor): login_page = self._download_webpage( 'https://login.vk.com/?act=login', None, - note='Logging in as %s' % username, + note='Logging in', data=urlencode_postdata(login_form)) if re.search(r'onLoginFailed', login_page): @@ -99,13 +99,14 @@ class VKIE(VKBaseIE): _TESTS = [ { 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', - 'md5': '0deae91935c54e00003c2a00646315f0', + 'md5': '7babad3b85ea2e91948005b1b8b0cb84', 'info_dict': { 'id': '162222515', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 'duration': 195, + 'timestamp': 1329060660, 'upload_date': '20120212', 'view_count': int, }, @@ -119,6 +120,7 @@ class VKIE(VKBaseIE): 'uploader': 'Tom Cruise', 'title': 'No name', 'duration': 9, + 'timestamp': 1374374880, 'upload_date': '20130721', 'view_count': int, } @@ -195,6 +197,7 @@ class VKIE(VKBaseIE): 'upload_date': '20150709', 'view_count': int, }, + 'skip': 'Removed', }, { # youtube embed @@ -237,12 +240,13 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': 'S-Dance, репетиции к The way show', 'uploader': 'THE WAY SHOW | 17 апреля', + 'timestamp': 1454870100, 'upload_date': '20160207', 'view_count': int, }, }, { - # finished live stream, live_mp4 + # finished live stream, postlive_mp4 'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2', 'md5': '90d22d051fccbbe9becfccc615be6791', 'info_dict': { @@ -251,10 +255,11 @@ class VKIE(VKBaseIE): 'title': 'ИгроМир 2016 — день 1', 'uploader': 'Игромания', 'duration': 5239, + 'view_count': int, }, }, { - # live stream, hls and rtmp links,most likely already finished live + # live stream, hls and rtmp links, most likely already finished live # stream by the time you are reading this comment 'url': 'https://vk.com/video-140332_456239111', 'only_matching': True, @@ -277,6 +282,11 @@ class VKIE(VKBaseIE): { 'url': 'http://new.vk.com/video205387401_165548505', 'only_matching': True, + }, + { + # This video is no longer available, because its author has been blocked. + 'url': 'https://vk.com/video-10639516_456240611', + 'only_matching': True, } ] @@ -308,9 +318,14 @@ class VKIE(VKBaseIE): 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', expected=True) + ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' + ERRORS = { r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': - 'Video %s has been removed from public access due to rightholder complaint.', + ERROR_COPYRIGHT, + + r'>The video .*? was removed from public access by request of the copyright holder.<': + ERROR_COPYRIGHT, r'Please log in or <': 'Video %s is only available for registered users, ' @@ -324,19 +339,23 @@ class VKIE(VKBaseIE): r'Access denied': 'Access denied to video %s.', + + r'Видеозапись недоступна, так как её автор был заблокирован.': + 'Video %s is no longer available, because its author has been blocked.', + + r'This video is no longer available, because its author has been blocked.': + 'Video %s is no longer available, because its author has been blocked.', } for error_re, error_msg in ERRORS.items(): if re.search(error_re, info_page): raise ExtractorError(error_msg % video_id, expected=True) - youtube_url = self._search_regex( - r']+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', - info_page, 'youtube iframe', default=None) + youtube_url = YoutubeIE._extract_url(info_page) if youtube_url: - return self.url_result(youtube_url, 'Youtube') + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) - vimeo_url = VimeoIE._extract_vimeo_url(url, info_page) + vimeo_url = VimeoIE._extract_url(url, info_page) if vimeo_url is not None: return self.url_result(vimeo_url) @@ -364,34 +383,50 @@ class VKIE(VKBaseIE): opts_url = 'http:' + opts_url return self.url_result(opts_url) - data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars') - data = json.loads(data_json) + # vars does not look to be served anymore since 24.10.2016 + data = self._parse_json( + self._search_regex( + r'var\s+vars\s*=\s*({.+?});', info_page, 'vars', default='{}'), + video_id, fatal=False) + + # is served instead + if not data: + data = self._parse_json( + self._search_regex( + r'\s*({.+?})\s*', info_page, 'json', default='{}'), + video_id) + if data: + data = data['player']['params'][0] + + if not data: + data = self._parse_json( + self._search_regex( + r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page, + 'player params'), + video_id)['params'][0] title = unescapeHTML(data['md_title']) - if data.get('live') == 2: + # 2 = live + # 3 = post live (finished live) + is_live = data.get('live') == 2 + if is_live: title = self._live_title(title) - # Extract upload date - upload_date = None - mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page) - if mobj is not None: - mobj.group(1) + ' ' + mobj.group(2) - upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2)) - - view_count = None - views = self._html_search_regex( - r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', - info_page, 'view count', default=None) - if views: - view_count = str_to_int(self._search_regex( - r'([\d,.]+)', views, 'view count', fatal=False)) + timestamp = unified_timestamp(self._html_search_regex( + r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, + 'upload date', fatal=False)) + + view_count = str_to_int(self._search_regex( + r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', + info_page, 'view count', default=None)) formats = [] for format_id, format_url in data.items(): if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')): continue - if format_id.startswith(('url', 'cache')) or format_id in ('extra_data', 'live_mp4'): + if (format_id.startswith(('url', 'cache')) or + format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): height = int_or_none(self._search_regex( r'^(?:url|cache)(\d+)', format_id, 'height', default=None)) formats.append({ @@ -401,8 +436,8 @@ class VKIE(VKBaseIE): }) elif format_id == 'hls': formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id=format_id, - fatal=False, live=True)) + format_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False, live=is_live)) elif format_id == 'rtmp': formats.append({ 'format_id': format_id, @@ -418,8 +453,9 @@ class VKIE(VKBaseIE): 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), 'duration': data.get('duration'), - 'upload_date': upload_date, + 'timestamp': timestamp, 'view_count': view_count, + 'is_live': is_live, }