X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fhark.py;h=342a6130ea10325d4b7e7ecee7ee86b130e90173;hb=54fc90aabfb71968f28af68dfe3f7a3544cc2f0b;hp=ab0a69697ed0dcc07ecd50ec04c3ed4ef264a4d6;hpb=6dc63025992404ddb4f7e3f9d42957e83621011d;p=youtube-dl diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index ab0a69697..342a6130e 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -1,35 +1,33 @@ -# -*- coding: utf-8 -*- - -import re +# coding: utf-8 +from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import determine_ext + class HarkIE(InfoExtractor): - _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+' + _VALID_URL = r'https?://(?:www\.)?hark\.com/clips/(?P.+?)-.+' _TEST = { - u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', - u'file': u'mmbzyhkgny.mp3', - u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', - u'info_dict': { - u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ", + 'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', + 'md5': '6783a58491b47b92c7c1af5a77d4cbee', + 'info_dict': { + 'id': 'mmbzyhkgny', + 'ext': 'mp3', + 'title': 'Obama: \'Beyond The Afghan Theater, We Only Target Al Qaeda\' on May 23, 2013', + 'description': 'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', + 'duration': 11, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) - embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id) - webpage = self._download_webpage(embed_url, video_id) + video_id = self._match_id(url) + data = self._download_json( + 'http://www.hark.com/clips/%s.json' % video_id, video_id) - final_url = self._search_regex(r'src="(.+?).mp3"', - webpage, 'video url')+'.mp3' - title = self._html_search_regex(r'(.+?)', - webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace( - 'Sound Clip , Quote, MP3, and Ringtone - Hark','') - - return {'id': video_id, - 'url' : final_url, - 'title': title, - 'ext': determine_ext(final_url), - } + return { + 'id': video_id, + 'url': data['url'], + 'title': data['name'], + 'description': data.get('description'), + 'thumbnail': data.get('image_original'), + 'duration': data.get('duration'), + }