1 from __future__ import unicode_literals
6 from .common import InfoExtractor
8 from ..compat import compat_str
9 from ..utils import int_or_none
12 class TEDIE(InfoExtractor):
15 (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
17 (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
19 ((?P<type_talk>talks)) # We have a simple talk
21 (?P<type_watch>watch)/[^/]+/[^/]+
23 (/lang/(.*?))? # The url may contain the language
24 /(?P<name>[\w-]+) # Here goes the name and then ".html"
28 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
29 'md5': 'fc94ac279feebbce69f21c0c6ee82810',
33 'title': 'The illusion of consciousness',
34 'description': ('Philosopher Dan Dennett makes a compelling '
35 'argument that not only don\'t we understand our own '
36 'consciousness, but that half the time our brains are '
37 'actively fooling us.'),
38 'uploader': 'Dan Dennett',
43 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
44 'md5': '226f4fb9c62380d11b7995efa4c87994',
46 'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
48 'title': 'Vishal Sikka: The beauty and power of algorithms',
49 'thumbnail': 're:^https?://.+\.jpg',
50 'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
53 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
57 'title': 'Be passionate. Be courageous. Be your best.',
58 'uploader': 'Gabby Giffords and Mark Kelly',
59 'description': 'md5:5174aed4d0f16021b704120360f72b92',
63 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
66 'title': 'Who are the hackers?',
68 'playlist_mincount': 6,
70 # contains a youtube video
71 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
72 'add_ie': ['Youtube'],
76 'title': 'Douglas Adams: Parrots the Universe and Everything',
77 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
78 'uploader': 'University of California Television (UCTV)',
79 'uploader_id': 'UCtelevision',
80 'upload_date': '20080522',
83 'skip_download': True,
87 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
88 'add_ie': ['Youtube'],
92 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
93 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
94 'uploader': 'TEDx Talks',
95 'uploader_id': 'TEDxTalks',
96 'upload_date': '20111216',
99 'skip_download': True,
104 'low': {'preference': 1, 'width': 320, 'height': 180},
105 'medium': {'preference': 2, 'width': 512, 'height': 288},
106 'high': {'preference': 3, 'width': 854, 'height': 480},
def _extract_info(self, webpage):
    '''Return the decoded JSON object embedded in a TED page.

    The data is the second argument of a ``q("<name>.init", {...})`` call
    that is immediately followed by ``</script>``.  That argument is
    captured with a regular expression and parsed with ``json.loads``.
    '''
    # NOTE(review): the '.' in front of "init" is not escaped, so it matches
    # any single character, not only a literal dot.
    init_call_re = r'q\("\w+.init",({.+})\)</script>'
    raw_payload = self._search_regex(init_call_re, webpage, 'info json')
    decoded = json.loads(raw_payload)
    return decoded
def _real_extract(self, url):
    '''Route a URL to the extraction routine that fits its kind.

    The URL is matched against ``_VALID_URL`` (verbose mode).  Embed URLs
    are rewritten to the ``www`` host and returned as a ``url_result``
    tagged ``'TED'``.  Otherwise the ``name`` group is passed, together
    with the URL, to ``_talk_info``, ``_watch_info`` or
    ``_playlist_videos_info``.
    '''
    match = re.match(self._VALID_URL, url, re.VERBOSE)

    # Embed pages carry no data of their own: point at the desktop page.
    if match.group('type').startswith('embed'):
        www_url = ''.join(
            (match.group('proto'), 'www', match.group('urlmain')))
        return self.url_result(www_url, 'TED')

    slug = match.group('name')
    if match.group('type_talk'):
        handler = self._talk_info
    elif match.group('type_watch'):
        handler = self._watch_info
    else:
        # Neither a talk nor a "watch" page: treat it as a playlist.
        handler = self._playlist_videos_info
    return handler(url, slug)
# Build a playlist result for a TED playlist page.
# NOTE(review): the embedded line numbers skip 129, 134-135, 138 and 140, so
# some statements of this method are not visible in this excerpt.
127 def _playlist_videos_info(self, url, name):
128 '''Returns the videos of the playlist'''
# Fetch the playlist page and decode the data embedded in it.
130 webpage = self._download_webpage(url, name,
131 'Downloading playlist webpage')
132 info = self._extract_info(webpage)
133 playlist_info = info['playlist']
# One url_result per entry of info['talks']; the talk page URL is built from
# the entry's 'slug' and tagged with this extractor's own ie_key().
136 self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
137 for talk in info['talks']
# The playlist id (coerced with compat_str) and title come from info['playlist'].
139 return self.playlist_result(
141 playlist_id=compat_str(playlist_info['id']),
142 playlist_title=playlist_info['title'])
# Build the result for a single talk page from the first entry of the 'talks'
# list embedded in that page.
# NOTE(review): the embedded line numbers (144-222) have gaps, so several
# statements of this method are not visible in this excerpt.
144 def _talk_info(self, url, video_name):
145 webpage = self._download_webpage(url, video_name)
146 self.report_extraction(video_name)
148 talk_info = self._extract_info(webpage)['talks'][0]
# 'external' describes a talk hosted by another service.  For YouTube the
# 'code' value is tried first; external['uri'] is the fallback URL.
150 external = talk_info.get('external')
152 service = external['service']
153 self.to_screen('Found video from %s' % service)
155 if service.lower() == 'youtube':
156 ext_url = external.get('code')
159 'url': ext_url or external['uri'],
# One entry per (format_id, format_url) pair of talk_info['nativeDownloads'],
# skipping pairs whose URL is None.
164 'format_id': format_id,
166 } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
# Look up the entry for this format id in self._NATIVE_FORMATS.
169 finfo = self._NATIVE_FORMATS.get(f['format_id'])
# talk_info['resources'] is keyed by delivery type; 'h264', 'rtmp' and 'hls'
# are the types handled below.
173 for format_id, resources in talk_info['resources'].items():
# h264: format ids have the form 'h264-<bitrate>k'.
174 if format_id == 'h264':
175 for resource in resources:
176 bitrate = int_or_none(resource.get('bitrate'))
178 'url': resource['file'],
179 'format_id': '%s-%sk' % (format_id, bitrate),
# rtmp: format ids have the form 'rtmp-<resource name>' and the resource's
# 'file' value is used as play_path.
182 elif format_id == 'rtmp':
183 streamer = talk_info.get('streamer')
186 for resource in resources:
188 'format_id': '%s-%s' % (format_id, resource.get('name')),
190 'play_path': resource['file'],
192 'width': int_or_none(resource.get('width')),
193 'height': int_or_none(resource.get('height')),
194 'tbr': int_or_none(resource.get('bitrate')),
# hls: the 'stream' value is expanded through _extract_m3u8_formats, with the
# delivery type passed as m3u8_id.
196 elif format_id == 'hls':
197 formats.extend(self._extract_m3u8_formats(
198 resources.get('stream'), video_name, 'mp4', m3u8_id=format_id))
# The 'audioDownload' value is used as the URL of a format with id 'audio'
# (presumably only when it is present -- the guarding line is not visible).
200 audio_download = talk_info.get('audioDownload')
203 'url': audio_download,
204 'format_id': 'audio',
207 self._sort_formats(formats)
209 video_id = compat_str(talk_info['id'])
# Thumbnail values that do not start with 'http' get an 'http://' prefix.
211 thumbnail = talk_info['thumb']
212 if not thumbnail.startswith('http'):
213 thumbnail = 'http://' + thumbnail
# The description is read from the webpage via _og_search_description; the
# other visible fields come from talk_info.
216 'title': talk_info['title'].strip(),
217 'uploader': talk_info['speaker'],
218 'thumbnail': thumbnail,
219 'description': self._og_search_description(webpage),
220 'subtitles': self._get_subtitles(video_id, talk_info),
222 'duration': talk_info.get('duration'),
# Build subtitle entries for a talk.
# NOTE(review): the embedded line numbers skip 227-231, 233-234 and everything
# after 235, so most of this method is not visible in this excerpt.
225 def _get_subtitles(self, video_id, talk_info):
# Language codes listed under talk_info['languages'] (empty list when the key is missing).
226 languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
# Each subtitle URL is built from the video id, a language code (l) and a
# format name (ext); the formats requested are 'ted' and 'srt'.
232 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
235 for ext in ['ted', 'srt']
241 def _watch_info(self, url, name):
242 webpage = self._download_webpage(url, name)
244 config_json = self._html_search_regex(
245 r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
247 config = json.loads(config_json)['config']
248 video_url = config['video']['url']
249 thumbnail = config.get('image', {}).get('url')
251 title = self._html_search_regex(
252 r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
253 description = self._html_search_regex(
255 r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
256 r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
258 webpage, 'description', fatal=False)
264 'thumbnail': thumbnail,
265 'description': description,