X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fvoicerepublic.py;h=59e1359c48628af9b4c53bedc337fa6b9b3d1396;hb=ec85ded83cbfa652ba94cb080aab52d8b270212a;hp=960974e167a18bd3b4688d130a333bbe7474cdc0;hpb=03f760b1c0478c1f65cf6e978d7592be46873313;p=youtube-dl diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 960974e16..59e1359c4 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -1,53 +1,100 @@ -# coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor +import re +from .common import InfoExtractor from ..compat import ( - compat_urllib_request, + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + determine_ext, + int_or_none, + sanitized_Request, ) class VoiceRepublicIE(InfoExtractor): - _VALID_URL = r'https?://voicerepublic\.com/talks/(?P[0-9a-z-]+)' - _TEST = { - 'url': 'https://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', - 'md5': '0554a24d1657915aa8e8f84e15dc9353', + _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P[0-9a-z-]+)' + _TESTS = [{ + 'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', + 'md5': 'b9174d651323f17783000876347116e3', 'info_dict': { 'id': '2296', + 'display_id': 'watching-the-watchers-building-a-sousveillance-state', 'ext': 'm4a', 'title': 'Watching the Watchers: Building a Sousveillance State', - 'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png', - 'description': 'md5:715ba964958afa2398df615809cfecb1', + 'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.', + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', + 'duration': 1800, + 'view_count': int, } - } + }, { + 'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) - req = compat_urllib_request.Request(url) + + req = sanitized_Request( + compat_urlparse.urljoin(url, '/talks/%s' % display_id)) # Older versions of Firefox get redirected to an "upgrade browser" page req.add_header('User-Agent', 'youtube-dl') webpage = self._download_webpage(req, display_id) - thumbnail = self._og_search_thumbnail(webpage) - video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id') - if '
Queued for processing, please stand by...<' in webpage: + raise ExtractorError( + 'Audio is still queued for processing', expected=True) + + config = self._search_regex( + r'(?s)return ({.+?});\s*\n', webpage, + 'data', default=None) + data = self._parse_json(config, display_id, fatal=False) if config else None + if data: + title = data['title'] + description = data.get('teaser') + talk_id = compat_str(data.get('talk_id') or display_id) + talk = data['talk'] + duration = int_or_none(talk.get('duration')) formats = [{ - 'url': 'https://voicerepublic.com/vrmedia/{}-clean.{}'.format(video_id, ext), - 'ext': ext, - 'format_id': ext, + 'url': compat_urlparse.urljoin(url, talk_url), + 'format_id': format_id, + 'ext': determine_ext(talk_url) or format_id, 'vcodec': 'none', - } for ext in ['m4a', 'mp3', 'ogg']] - self._sort_formats(formats) + } for format_id, talk_url in talk['links'].items()] else: - # Audio is still queued for processing - formats = [] + title = self._og_search_title(webpage) + description = self._html_search_regex( + r"(?s)
]*>(.+?)
", + webpage, 'description', fatal=False) + talk_id = self._search_regex( + [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"], + webpage, 'talk id', default=None) or display_id + duration = None + player = self._search_regex( + r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player') + formats = [{ + 'url': compat_urlparse.urljoin(url, talk_url), + 'format_id': format_id, + 'ext': determine_ext(talk_url) or format_id, + 'vcodec': 'none', + } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)] + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + view_count = int_or_none(self._search_regex( + r"class='play-count[^']*'>\s*(\d+) plays", + webpage, 'play count', fatal=False)) return { - 'id': video_id, - 'title': self._og_search_title(webpage), - 'formats': formats, - 'url': self._og_search_url(webpage), + 'id': talk_id, + 'display_id': display_id, + 'title': title, + 'description': description, 'thumbnail': thumbnail, - 'description': self._og_search_description(webpage), + 'duration': duration, + 'view_count': view_count, + 'formats': formats, }