X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fvoicerepublic.py;h=59e1359c48628af9b4c53bedc337fa6b9b3d1396;hb=2f483bc1c389709623117079439708783122b5ec;hp=1106c655b871367abf3aad2ae65867d167a36353;hpb=a6762c4a22325b5b69770de82df8725d2eb5c3df;p=youtube-dl diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 1106c655b..59e1359c4 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -4,13 +4,14 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_request, + compat_str, compat_urlparse, ) from ..utils import ( ExtractorError, determine_ext, int_or_none, + sanitized_Request, ) @@ -18,14 +19,14 @@ class VoiceRepublicIE(InfoExtractor): _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)' _TESTS = [{ 'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', - 'md5': '0554a24d1657915aa8e8f84e15dc9353', + 'md5': 'b9174d651323f17783000876347116e3', 'info_dict': { 'id': '2296', 'display_id': 'watching-the-watchers-building-a-sousveillance-state', 'ext': 'm4a', 'title': 'Watching the Watchers: Building a Sousveillance State', - 'description': 'md5:715ba964958afa2398df615809cfecb1', - 'thumbnail': 're:^https?://.*\.(?:png|jpg)$', + 'description': 'Secret surveillance programs have metadata too. 
The people and companies that operate secret surveillance programs can be surveilled.', + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', 'duration': 1800, 'view_count': int, } @@ -37,7 +38,7 @@ class VoiceRepublicIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - req = compat_urllib_request.Request( + req = sanitized_Request( compat_urlparse.urljoin(url, '/talks/%s' % display_id)) # Older versions of Firefox get redirected to an "upgrade browser" page req.add_header('User-Agent', 'youtube-dl') @@ -47,16 +48,14 @@ class VoiceRepublicIE(InfoExtractor): raise ExtractorError( 'Audio is still queued for processing', expected=True) - data = self._parse_json( - self._search_regex( - r'(?s)return ({.+?});\s*\n', webpage, - 'data', default=None), - display_id, fatal=False) - + config = self._search_regex( + r'(?s)return ({.+?});\s*\n', webpage, + 'data', default=None) + data = self._parse_json(config, display_id, fatal=False) if config else None if data: title = data['title'] description = data.get('teaser') - talk_id = data.get('talk_id') or display_id + talk_id = compat_str(data.get('talk_id') or display_id) talk = data['talk'] duration = int_or_none(talk.get('duration')) formats = [{ @@ -74,12 +73,14 @@ class VoiceRepublicIE(InfoExtractor): [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"], webpage, 'talk id', default=None) or display_id duration = None + player = self._search_regex( + r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player') formats = [{ 'url': compat_urlparse.urljoin(url, talk_url), 'format_id': format_id, 'ext': determine_ext(talk_url) or format_id, 'vcodec': 'none', - } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", webpage)] + } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)] self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage)