X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fvk.py;h=105e172d539d4376e4534df31266c489f3d62518;hb=757984af90ce924f917ce9d940ebd120e95a4c4e;hp=3cfbd97af48d1bd8028394e5cbde00dba2c34df6;hpb=3aa3953d28dae68b87aa83682043b5eec0973ddc;p=youtube-dl diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 3cfbd97af..105e172d5 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -1,9 +1,8 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import collections import re -import json import sys from .common import InfoExtractor @@ -20,12 +19,13 @@ from ..utils import ( remove_start, str_to_int, unescapeHTML, - unified_strdate, + unified_timestamp, urlencode_postdata, ) from .dailymotion import DailymotionIE from .pladform import PladformIE from .vimeo import VimeoIE +from .youtube import YoutubeIE class VKBaseIE(InfoExtractor): @@ -106,6 +106,7 @@ class VKIE(VKBaseIE): 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 'duration': 195, + 'timestamp': 1329060660, 'upload_date': '20120212', 'view_count': int, }, @@ -119,6 +120,7 @@ class VKIE(VKBaseIE): 'uploader': 'Tom Cruise', 'title': 'No name', 'duration': 9, + 'timestamp': 1374374880, 'upload_date': '20130721', 'view_count': int, } @@ -195,6 +197,7 @@ class VKIE(VKBaseIE): 'upload_date': '20150709', 'view_count': int, }, + 'skip': 'Removed', }, { # youtube embed @@ -237,12 +240,13 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': 'S-Dance, репетиции к The way show', 'uploader': 'THE WAY SHOW | 17 апреля', + 'timestamp': 1454870100, 'upload_date': '20160207', 'view_count': int, }, }, { - # finished live stream, live_mp4 + # finished live stream, postlive_mp4 'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2', 'md5': '90d22d051fccbbe9becfccc615be6791', 'info_dict': { @@ -251,10 +255,11 @@ class VKIE(VKBaseIE): 'title': 'ИгроМир 2016 — день 1', 'uploader': 'Игромания', 'duration': 5239, + 'view_count': int, }, }, { - # live stream, hls and rtmp links,most likely already finished live + # live stream, hls and rtmp links, most likely already finished live # stream by the time you are reading this comment 'url': 'https://vk.com/video-140332_456239111', 'only_matching': True, @@ -277,6 +282,11 @@ class VKIE(VKBaseIE): { 'url': 'http://new.vk.com/video205387401_165548505', 'only_matching': True, + }, + { + # This video is no longer available, because its author has been blocked. + 'url': 'https://vk.com/video-10639516_456240611', + 'only_matching': True, } ] @@ -324,19 +334,23 @@ class VKIE(VKBaseIE): r'Access denied': 'Access denied to video %s.', + + r'Видеозапись недоступна, так как её автор был заблокирован.': + 'Video %s is no longer available, because its author has been blocked.', + + r'This video is no longer available, because its author has been blocked.': + 'Video %s is no longer available, because its author has been blocked.', } for error_re, error_msg in ERRORS.items(): if re.search(error_re, info_page): raise ExtractorError(error_msg % video_id, expected=True) - youtube_url = self._search_regex( - r']+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', - info_page, 'youtube iframe', default=None) + youtube_url = YoutubeIE._extract_url(info_page) if youtube_url: - return self.url_result(youtube_url, 'Youtube') + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) - vimeo_url = VimeoIE._extract_vimeo_url(url, info_page) + vimeo_url = VimeoIE._extract_url(url, info_page) if vimeo_url is not None: return self.url_result(vimeo_url) @@ -364,27 +378,50 @@ class VKIE(VKBaseIE): opts_url = 'http:' + opts_url return self.url_result(opts_url) - data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars') - data = json.loads(data_json) + # vars does not look to be served anymore since 24.10.2016 + data = self._parse_json( + self._search_regex( + r'var\s+vars\s*=\s*({.+?});', info_page, 'vars', default='{}'), + video_id, fatal=False) + + # is served instead + if not data: + data = self._parse_json( + self._search_regex( + r'\s*({.+?})\s*', info_page, 'json', default='{}'), + video_id) + if data: + data = data['player']['params'][0] + + if not data: + data = self._parse_json( + self._search_regex( + r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page, + 'player params'), + video_id)['params'][0] title = unescapeHTML(data['md_title']) - if data.get('live') == 2: + # 2 = live + # 3 = post live (finished live) + is_live = data.get('live') == 2 + if is_live: title = self._live_title(title) - # Extract upload date - upload_date = unified_strdate(self._html_search_regex( - r'class="mv_info_date[^>]*>([^<]*)<', info_page, 'upload date', default=None)) + timestamp = unified_timestamp(self._html_search_regex( + r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, + 'upload date', fatal=False)) - view_count = str_to_int(self._html_search_regex( - r'class="mv_views_count[^>]*>([\d,.]+)', - info_page, 'view count', default=None)) + view_count = str_to_int(self._search_regex( + r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', + info_page, 'view count', fatal=False)) formats = [] for format_id, format_url in data.items(): if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')): continue - if format_id.startswith(('url', 'cache')) or format_id in ('extra_data', 'live_mp4'): + if (format_id.startswith(('url', 'cache')) or + format_id in ('extra_data', 'live_mp4', 'postlive_mp4')): height = int_or_none(self._search_regex( r'^(?:url|cache)(\d+)', format_id, 'height', default=None)) formats.append({ @@ -394,8 +431,8 @@ class VKIE(VKBaseIE): }) elif format_id == 'hls': formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id=format_id, - fatal=False, live=True)) + format_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False, live=is_live)) elif format_id == 'rtmp': formats.append({ 'format_id': format_id, @@ -411,8 +448,9 @@ class VKIE(VKBaseIE): 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), 'duration': data.get('duration'), - 'upload_date': upload_date, + 'timestamp': timestamp, 'view_count': view_count, + 'is_live': is_live, }