X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2Fextractor%2Ftheplatform.py;h=25edc310008ef0da7b407c97db86ce5e49c78a50;hb=c3c9f879541b99a5456991d887ed03a2aea5dcff;hp=f02e0f58d5752000ab0036be78c293cbc4b4ce62;hpb=26e1c3514f4af1ed60cd1114a653fe49e1fa8d11;p=youtube-dl diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index f02e0f58d..25edc3100 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -1,7 +1,7 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals import re -import json import time import hmac import binascii @@ -9,6 +9,10 @@ import hashlib from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( determine_ext, ExtractorError, @@ -24,7 +28,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(InfoExtractor): - def _extract_theplatform_smil_formats(self, smil_url, video_id, note='Downloading SMIL data'): + def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml(smil_url, video_id, note=note) try: error_msg = next( @@ -50,12 +54,13 @@ class ThePlatformBaseIE(InfoExtractor): self._sort_formats(formats) - return formats + subtitles = self._parse_smil_subtitles(meta, default_ns) + + return formats, subtitles def get_metadata(self, path, video_id): info_url = 'http://link.theplatform.com/s/%s?format=preview' % path - info_json = self._download_webpage(info_url, video_id) - info = json.loads(info_json) + info = self._download_json(info_url, video_id) subtitles = {} captions = info.get('captions') @@ -120,6 +125,20 @@ class ThePlatformIE(ThePlatformBaseIE): }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', 'only_matching': True, + }, { + 'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701', + 'md5': '734f3790fb5fc4903da391beeebc4836', + 'info_dict': { + 'id': 'tdy_or_siri_150701', + 'ext': 'mp4', + 'title': 'iPhone Siri’s sassy response to a math question has people talking', + 'description': 'md5:a565d1deadd5086f3331d57298ec6333', + 'duration': 83.0, + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1435752600, + 'upload_date': '20150701', + 'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"], + }, }] @staticmethod @@ -154,6 +173,24 @@ class ThePlatformIE(ThePlatformBaseIE): path += '/media' path += '/' + video_id + qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + if 'guid' in qs_dict: + webpage = self._download_webpage(url, video_id) + scripts = re.findall(r']+src="([^"]+)"', webpage) + feed_id = None + # feed id usually locates in the last script. + # Seems there's no pattern for the interested script filename, so + # I try one by one + for script in reversed(scripts): + feed_script = self._download_webpage(script, video_id, 'Downloading feed script') + feed_id = self._search_regex(r'defaultFeedId\s*:\s*"([^"]+)"', feed_script, 'default feed id', default=None) + if feed_id is not None: + break + if feed_id is None: + raise ExtractorError('Unable to find feed id') + return self.url_result('http://feed.theplatform.com/f/%s/%s?byGuid=%s' % ( + provider_id, feed_id, qs_dict['guid'][0])) + if smuggled_data.get('force_smil_url', False): smil_url = url elif mobj.group('config'): @@ -173,12 +210,14 @@ class ThePlatformIE(ThePlatformBaseIE): if sig: smil_url = self._sign_url(smil_url, sig['key'], sig['secret']) - formats = self._extract_theplatform_smil_formats(smil_url, video_id) + formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) ret = self.get_metadata(path, video_id) + combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) ret.update({ 'id': video_id, 'formats': formats, + 'subtitles': combined_subtitles, }) return ret @@ -216,6 +255,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): entry = feed['entries'][0] formats = [] + subtitles = {} first_video_id = None duration = None for item in entry['media$content']: @@ -224,7 +264,9 @@ class ThePlatformFeedIE(ThePlatformBaseIE): if first_video_id is None: first_video_id = cur_video_id duration = float_or_none(item.get('plfile$duration')) - formats.extend(self._extract_theplatform_smil_formats(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)) + cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id) + formats.extend(cur_formats) + subtitles = self._merge_subtitles(subtitles, cur_subtitles) self._sort_formats(formats) @@ -238,9 +280,11 @@ class ThePlatformFeedIE(ThePlatformBaseIE): categories = [item['media$name'] for item in entry.get('media$categories', [])] ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id) + subtitles = self._merge_subtitles(subtitles, ret['subtitles']) ret.update({ 'id': video_id, 'formats': formats, + 'subtitles': subtitles, 'thumbnails': thumbnails, 'duration': duration, 'timestamp': timestamp,