[ted] extract all http formats
[youtube-dl] / youtube_dl / extractor / ted.py
1 from __future__ import unicode_literals
2
3 import json
4 import re
5
6 from .common import InfoExtractor
7
8 from ..compat import compat_str
9 from ..utils import int_or_none
10
11
class TEDIE(InfoExtractor):
    """Extractor for ted.com talk, playlist and "watch" pages.

    URLs on the ``embed`` / ``embed-ssl`` hosts are redirected to the
    equivalent ``www`` URL and handled by this same extractor.
    """

    IE_NAME = 'ted'
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is not a valid string escape.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'webm',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Dimensions attached to the entries of a talk's "nativeDownloads" map,
    # keyed by the quality name used there.
    _NATIVE_FORMATS = {
        'low': {'width': 320, 'height': 180},
        'medium': {'width': 512, 'height': 288},
        'high': {'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON argument of the page's q("<x>.init", {...}) call."""
        info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
                                       webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk, watch or playlist handler."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            # Embed pages are handled by re-running on the regular www URL.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one url entry per talk of the playlist."""
        webpage = self._download_webpage(url, name,
                                         'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_formats(self, talk_info, video_name):
        """Build the (unsorted) list of format dicts for a talk.

        Reads the "nativeDownloads", "resources" (h264 / rtmp / hls),
        "streamer" and "audioDownload" entries of *talk_info*; any of them
        may be missing or null, in which case it contributes no formats.
        """
        formats = []

        # Direct downloads, keyed by quality name.
        for format_id, format_url in (talk_info.get('nativeDownloads') or {}).items():
            if format_url is None:
                continue
            native_format = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            native_format.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(native_format)

        # An h264 URL containing a "<digits>k" bitrate token; used below as a
        # template for deriving further HTTP URLs from the HLS variants.
        http_url = None
        for format_id, resources in (talk_info.get('resources') or {}).items():
            if not resources:
                continue
            if format_id == 'h264':
                for resource in resources:
                    h264_url = resource.get('file')
                    if not h264_url:
                        continue
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': h264_url,
                        # Avoid ids like "h264-Nonek" when the bitrate is unknown.
                        'format_id': (
                            '%s-%sk' % (format_id, bitrate)
                            if bitrate is not None else format_id),
                        'tbr': bitrate,
                    })
                    if re.search(r'\d+k', h264_url):
                        http_url = h264_url
            elif format_id == 'rtmp':
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources:
                    play_path = resource.get('file')
                    if not play_path:
                        # Same policy as the h264 branch: skip incomplete entries.
                        continue
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': play_path,
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                stream_url = resources.get('stream')
                if not stream_url:
                    continue
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id, fatal=False))

        if http_url:
            # For every HLS video variant whose URL carries a bitrate token,
            # add an HTTP format by substituting that token into http_url.
            m3u8_formats = [
                f for f in formats
                if f.get('protocol') == 'm3u8' and
                f.get('vcodec') != 'none' and
                f.get('resolution') != 'multiple']
            for m3u8_format in m3u8_formats:
                bitrate = self._search_regex(
                    r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
                if not bitrate:
                    continue
                http_format = m3u8_format.copy()
                http_format.update({
                    'url': re.sub(r'\d+k', bitrate, http_url),
                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                    'protocol': 'http',
                })
                formats.append(http_format)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
            })

        return formats

    def _talk_info(self, url, video_name):
        """Extract a single talk page.

        Returns a url result when the talk is hosted on an external service,
        otherwise a full info dict with formats and subtitles.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        formats = self._talk_formats(talk_info, video_name)
        self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id'))

        video_id = compat_str(talk_info['id'])

        # Thumbnail is optional metadata; do not fail the extraction without it.
        thumbnail = talk_info.get('thumb')
        if thumbnail and not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info.get('speaker'),
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        """Map each language code of *talk_info* to its 'ted' and 'srt' subtitle URLs.

        Returns an empty dict when the talk lists no languages.
        """
        sub_lang_list = {}
        for language in talk_info.get('languages') or []:
            lang_code = language['languageCode']
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        """Extract a "watch" page from its "pages.jwplayer" config and HTML."""
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # "image" may be absent or null.
        thumbnail = (config.get('image') or {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }