X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2Fextractor%2Fyahoo.py;h=f9afbdbab611e233c7f7014ae7d66e996f2b7c31;hb=6dd94d3a79353f8e694efaf2fa27f4bb40227aff;hp=eed9d4325e6e59fb3c5eca20d264f2623de1077b;hpb=85fab7e47b4d4d69a8d105e1d3a07d8a0dd67685;p=youtube-dl diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index eed9d4325..f9afbdbab 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -15,16 +15,18 @@ from ..utils import ( unescapeHTML, ExtractorError, int_or_none, + mimetype2ext, ) +from .nbc import NBCSportsVPlayerIE + class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' - _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+?)-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)' + _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)' _TESTS = [ { 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - 'md5': '4962b075c08be8690a922ee026d05e69', 'info_dict': { 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', 'ext': 'mp4', @@ -130,12 +132,24 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', 'only_matching': True, + }, { + 'note': 'NBC Sports embeds', + 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'flv', + 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + } + }, { + 'url': 'https://tw.news.yahoo.com/-100120367.html', + 'only_matching': True, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or self._match_id(url) 
page_id = mobj.group('id') url = mobj.group('url') host = mobj.group('host') @@ -152,6 +166,10 @@ class YahooIE(InfoExtractor): items = json.loads(items_json) video_id = items[0]['id'] return self._get_info(video_id, display_id, webpage) + # Look for NBCSports iframes + nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) + if nbc_sports_url: + return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, @@ -176,17 +194,15 @@ class YahooIE(InfoExtractor): region = self._search_regex( r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', webpage, 'region', fatal=False, default='US') - query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"' - ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="%s"' - ' AND protocol="http"' % (video_id, region)) data = compat_urllib_parse.urlencode({ - 'q': query, - 'env': 'prod', - 'format': 'json', + 'protocol': 'http', + 'region': region, }) + query_url = ( + 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + '{id}?{data}'.format(id=video_id, data=data)) query_result = self._download_json( - 'http://video.query.yahoo.com/v1/public/yql?' 
+ data, - display_id, 'Downloading video info') + query_url, display_id, 'Downloading video info') info = query_result['query']['results']['mediaObj'][0] meta = info.get('meta') @@ -221,6 +237,22 @@ class YahooIE(InfoExtractor): self._sort_formats(formats) + closed_captions = self._html_search_regex( + r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions', + default='[]') + + cc_json = self._parse_json(closed_captions, video_id, fatal=False) + subtitles = {} + if cc_json: + for closed_caption in cc_json: + lang = closed_caption['lang'] + if lang not in subtitles: + subtitles[lang] = [] + subtitles[lang].append({ + 'url': closed_caption['url'], + 'ext': mimetype2ext(closed_caption['content_type']), + }) + return { 'id': video_id, 'display_id': display_id, @@ -229,6 +261,7 @@ class YahooIE(InfoExtractor): 'description': clean_html(meta['description']), 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage), 'duration': int_or_none(meta.get('duration')), + 'subtitles': subtitles, }