1 from __future__ import unicode_literals
6 from .common import InfoExtractor
8 from ..compat import compat_str
9 from ..utils import int_or_none
12 class TEDIE(InfoExtractor):
16 (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
18 (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
20 ((?P<type_talk>talks)) # We have a simple talk
22 (?P<type_watch>watch)/[^/]+/[^/]+
24 (/lang/(.*?))? # The url may contain the language
25 /(?P<name>[\w-]+) # Here goes the name and then ".html"
29 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
30 'md5': 'fc94ac279feebbce69f21c0c6ee82810',
34 'title': 'The illusion of consciousness',
35 'description': ('Philosopher Dan Dennett makes a compelling '
36 'argument that not only don\'t we understand our own '
37 'consciousness, but that half the time our brains are '
38 'actively fooling us.'),
39 'uploader': 'Dan Dennett',
44 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
45 'md5': '226f4fb9c62380d11b7995efa4c87994',
47 'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
49 'title': 'Vishal Sikka: The beauty and power of algorithms',
50 'thumbnail': 're:^https?://.+\.jpg',
51 'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
54 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
58 'title': 'Be passionate. Be courageous. Be your best.',
59 'uploader': 'Gabby Giffords and Mark Kelly',
60 'description': 'md5:5174aed4d0f16021b704120360f72b92',
64 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
67 'title': 'Who are the hackers?',
69 'playlist_mincount': 6,
71 # contains a youtube video
72 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
73 'add_ie': ['Youtube'],
77 'title': 'Douglas Adams: Parrots the Universe and Everything',
78 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
79 'uploader': 'University of California Television (UCTV)',
80 'uploader_id': 'UCtelevision',
81 'upload_date': '20080522',
84 'skip_download': True,
88 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
89 'add_ie': ['Youtube'],
93 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
94 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
95 'uploader': 'TEDx Talks',
96 'uploader_id': 'TEDxTalks',
97 'upload_date': '20111216',
100 'skip_download': True,
105 'low': {'width': 320, 'height': 180},
106 'medium': {'width': 512, 'height': 288},
107 'high': {'width': 854, 'height': 480},
def _extract_info(self, webpage):
    """Return the decoded JSON argument of the page's ``q("<name>.init",{...})`` call.

    The payload is the object literal passed as the second argument of the
    ``.init`` call that is immediately followed by ``)</script>``.

    :param webpage: HTML text of the page to search.
    :returns: the captured payload parsed with ``json.loads``.
    :raises ValueError: if the captured text is not valid JSON.

    NOTE(review): no default is passed to ``_search_regex``, so a page
    without the call presumably fails inside that helper -- confirm
    against the base class.
    """
    # The dot before ``init`` is escaped so that it matches a literal '.'
    # only (unescaped it would accept any character).  ``{.+}`` is
    # deliberately greedy: it runs to the last ``})</script>`` on the line so
    # that nested objects inside the payload are captured in full.
    info_json = self._search_regex(
        r'q\("\w+\.init",({.+})\)</script>', webpage, 'info json')
    return json.loads(info_json)
def _real_extract(self, url):
    """Dispatch *url* to the handler for its kind: embed, talk, watch page or playlist."""
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)

    # Embed URLs are not extracted directly: rebuild the equivalent
    # ``www`` URL and delegate it to the 'TED' extractor.
    if mobj.group('type').startswith('embed'):
        return self.url_result(
            mobj.group('proto') + 'www' + mobj.group('urlmain'), 'TED')

    slug = mobj.group('name')
    if mobj.group('type_talk'):
        return self._talk_info(url, slug)
    if mobj.group('type_watch'):
        return self._watch_info(url, slug)
    # Neither a talk nor a watch page: handle it as a playlist.
    return self._playlist_videos_info(url, slug)
def _playlist_videos_info(self, url, name):
    '''Return a playlist result holding one url entry per talk of the playlist page'''
    page = self._download_webpage(
        url, name, 'Downloading playlist webpage')
    data = self._extract_info(page)
    meta = data['playlist']

    # Each talk is referenced by its slug and delegated back to this
    # extractor's own key.
    talk_entries = []
    for talk in data['talks']:
        talk_url = 'http://www.ted.com/talks/' + talk['slug']
        talk_entries.append(self.url_result(talk_url, self.ie_key()))

    return self.playlist_result(
        talk_entries,
        playlist_id=compat_str(meta['id']),
        playlist_title=meta['title'])
145 def _talk_info(self, url, video_name):
146 webpage = self._download_webpage(url, video_name)
147 self.report_extraction(video_name)
149 talk_info = self._extract_info(webpage)['talks'][0]
151 external = talk_info.get('external')
153 service = external['service']
154 self.to_screen('Found video from %s' % service)
156 if service.lower() == 'youtube':
157 ext_url = external.get('code')
160 'url': ext_url or external['uri'],
165 'format_id': format_id,
167 } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
170 finfo = self._NATIVE_FORMATS.get(f['format_id'])
175 for format_id, resources in talk_info['resources'].items():
176 if format_id == 'h264':
177 for resource in resources:
178 h264_url = resource.get('file')
181 bitrate = int_or_none(resource.get('bitrate'))
184 'format_id': '%s-%sk' % (format_id, bitrate),
187 if re.search('\d+k', h264_url):
189 elif format_id == 'rtmp':
190 streamer = talk_info.get('streamer')
193 for resource in resources:
195 'format_id': '%s-%s' % (format_id, resource.get('name')),
197 'play_path': resource['file'],
199 'width': int_or_none(resource.get('width')),
200 'height': int_or_none(resource.get('height')),
201 'tbr': int_or_none(resource.get('bitrate')),
203 elif format_id == 'hls':
204 formats.extend(self._extract_m3u8_formats(
205 resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
207 m3u8_formats = list(filter(
208 lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
211 for m3u8_format in m3u8_formats:
212 bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
215 f = m3u8_format.copy()
217 'url': re.sub(r'\d+k', bitrate, http_url),
218 'format_id': m3u8_format['format_id'].replace('hls', 'http'),
223 audio_download = talk_info.get('audioDownload')
226 'url': audio_download,
227 'format_id': 'audio',
231 self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id'))
233 video_id = compat_str(talk_info['id'])
235 thumbnail = talk_info['thumb']
236 if not thumbnail.startswith('http'):
237 thumbnail = 'http://' + thumbnail
240 'title': talk_info['title'].strip(),
241 'uploader': talk_info['speaker'],
242 'thumbnail': thumbnail,
243 'description': self._og_search_description(webpage),
244 'subtitles': self._get_subtitles(video_id, talk_info),
246 'duration': talk_info.get('duration'),
249 def _get_subtitles(self, video_id, talk_info):
250 languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
256 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
259 for ext in ['ted', 'srt']
265 def _watch_info(self, url, name):
266 webpage = self._download_webpage(url, name)
268 config_json = self._html_search_regex(
269 r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
271 config = json.loads(config_json)['config']
272 video_url = config['video']['url']
273 thumbnail = config.get('image', {}).get('url')
275 title = self._html_search_regex(
276 r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
277 description = self._html_search_regex(
279 r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
280 r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
282 webpage, 'description', fatal=False)
288 'thumbnail': thumbnail,
289 'description': description,