X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fted.py;h=cf8851438bb74000abb2692c34607f3137505f1d;hb=64f08d4ff2392135be07774f2d5371f111f21592;hp=cd4af96fdb02b2f4ea392fe19482371b6a02acad;hpb=98e1d289820ef513c3727d27148ca557c165a72a;p=youtube-dl diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index cd4af96fd..cf8851438 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -3,17 +3,17 @@ from __future__ import unicode_literals import json import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor -from ..utils import ( - compat_str, -) +from ..compat import compat_str +from ..utils import int_or_none -class TEDIE(SubtitlesInfoExtractor): +class TEDIE(InfoExtractor): + IE_NAME = 'ted' _VALID_URL = r'''(?x) (?Phttps?://) - (?Pwww|embed)(?P\.ted\.com/ + (?Pwww|embed(?:-ssl)?)(?P\.ted\.com/ ( (?Pplaylists(?:/\d+)?) # We have a playlist | @@ -33,11 +33,12 @@ class TEDIE(SubtitlesInfoExtractor): 'ext': 'mp4', 'title': 'The illusion of consciousness', 'description': ('Philosopher Dan Dennett makes a compelling ' - 'argument that not only don\'t we understand our own ' - 'consciousness, but that half the time our brains are ' - 'actively fooling us.'), + 'argument that not only don\'t we understand our own ' + 'consciousness, but that half the time our brains are ' + 'actively fooling us.'), 'uploader': 'Dan Dennett', 'width': 854, + 'duration': 1308, } }, { 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms', @@ -57,6 +58,7 @@ class TEDIE(SubtitlesInfoExtractor): 'title': 'Be passionate. Be courageous. Be your best.', 'uploader': 'Gabby Giffords and Mark Kelly', 'description': 'md5:5174aed4d0f16021b704120360f72b92', + 'duration': 1128, }, }, { 'url': 'http://www.ted.com/playlists/who_are_the_hackers', @@ -71,7 +73,7 @@ class TEDIE(SubtitlesInfoExtractor): 'add_ie': ['Youtube'], 'info_dict': { 'id': '_ZG8HBuDjgc', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Douglas Adams: Parrots the Universe and Everything', 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af', 'uploader': 'University of California Television (UCTV)', @@ -81,6 +83,22 @@ class TEDIE(SubtitlesInfoExtractor): 'params': { 'skip_download': True, }, + }, { + # YouTube video + 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond', + 'add_ie': ['Youtube'], + 'info_dict': { + 'id': 'aFBIPO-P7LM', + 'ext': 'mp4', + 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville', + 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1', + 'uploader': 'TEDx Talks', + 'uploader_id': 'TEDxTalks', + 'upload_date': '20111216', + }, + 'params': { + 'skip_download': True, + }, }] _NATIVE_FORMATS = { @@ -91,12 +109,12 @@ class TEDIE(SubtitlesInfoExtractor): def _extract_info(self, webpage): info_json = self._search_regex(r'q\("\w+.init",({.+})\)', - webpage, 'info json') + webpage, 'info json') return json.loads(info_json) def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) - if m.group('type') == 'embed': + if m.group('type').startswith('embed'): desktop_url = m.group('proto') + 'www' + m.group('urlmain') return self.url_result(desktop_url, 'TED') name = m.group('name') @@ -111,7 +129,7 @@ class TEDIE(SubtitlesInfoExtractor): '''Returns the videos of the playlist''' webpage = self._download_webpage(url, name, - 'Downloading playlist webpage') + 'Downloading playlist webpage') info = self._extract_info(webpage) playlist_info = info['playlist'] @@ -130,11 +148,16 @@ class TEDIE(SubtitlesInfoExtractor): talk_info = self._extract_info(webpage)['talks'][0] - if talk_info.get('external') is not None: - self.to_screen('Found video from %s' % talk_info['external']['service']) + external = talk_info.get('external') + if external: + service = external['service'] + self.to_screen('Found video from %s' % service) + ext_url = None + if service.lower() == 'youtube': + ext_url = external.get('code') return { '_type': 'url', - 'url': talk_info['external']['uri'], + 'url': ext_url or external['uri'], } formats = [{ @@ -147,25 +170,54 @@ class TEDIE(SubtitlesInfoExtractor): finfo = self._NATIVE_FORMATS.get(f['format_id']) if finfo: f.update(finfo) - else: - # Use rtmp downloads - formats = [{ - 'format_id': f['name'], - 'url': talk_info['streamer'], - 'play_path': f['file'], - 'ext': 'flv', - 'width': f['width'], - 'height': f['height'], - 'tbr': f['bitrate'], - } for f in talk_info['resources']['rtmp']] + + for format_id, resources in talk_info['resources'].items(): + if format_id == 'h264': + for resource in resources: + bitrate = int_or_none(resource.get('bitrate')) + formats.append({ + 'url': resource['file'], + 'format_id': '%s-%sk' % (format_id, bitrate), + 'tbr': bitrate, + }) + elif format_id == 'rtmp': + streamer = talk_info.get('streamer') + if not streamer: + continue + for resource in resources: + formats.append({ + 'format_id': '%s-%s' % (format_id, resource.get('name')), + 'url': streamer, + 'play_path': resource['file'], + 'ext': 'flv', + 'width': int_or_none(resource.get('width')), + 'height': int_or_none(resource.get('height')), + 'tbr': int_or_none(resource.get('bitrate')), + }) + elif format_id == 'hls': + hls_formats = self._extract_m3u8_formats( + resources.get('stream'), video_name, 'mp4', m3u8_id=format_id) + for f in hls_formats: + if f.get('format_id') == 'hls-meta': + continue + if not f.get('height'): + f['vcodec'] = 'none' + else: + f['acodec'] = 'none' + formats.extend(hls_formats) + + audio_download = talk_info.get('audioDownload') + if audio_download: + formats.append({ + 'url': audio_download, + 'format_id': 'audio', + 'vcodec': 'none', + 'preference': -0.5, + }) + self._sort_formats(formats) video_id = compat_str(talk_info['id']) - # subtitles - video_subtitles = self.extract_subtitles(video_id, talk_info) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, talk_info) - return thumbnail = talk_info['thumb'] if not thumbnail.startswith('http'): @@ -176,28 +228,34 @@ class TEDIE(SubtitlesInfoExtractor): 'uploader': talk_info['speaker'], 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), - 'subtitles': video_subtitles, + 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats, + 'duration': talk_info.get('duration'), } - def _get_available_subtitles(self, video_id, talk_info): + def _get_subtitles(self, video_id, talk_info): languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] if languages: sub_lang_list = {} for l in languages: - url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) - sub_lang_list[l] = url + sub_lang_list[l] = [ + { + 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), + 'ext': ext, + } + for ext in ['ted', 'srt'] + ] return sub_lang_list else: - self._downloader.report_warning('video doesn\'t have subtitles') return {} def _watch_info(self, url, name): webpage = self._download_webpage(url, name) config_json = self._html_search_regex( - r"data-config='([^']+)", webpage, 'config') - config = json.loads(config_json) + r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*', + webpage, 'config') + config = json.loads(config_json)['config'] video_url = config['video']['url'] thumbnail = config.get('image', {}).get('url')