[ted] Extract all formats (Closes #5397)
[youtube-dl] / youtube_dl / extractor / ted.py
1 from __future__ import unicode_literals
2
3 import json
4 import re
5
6 from .common import InfoExtractor
7
8 from ..compat import compat_str
9 from ..utils import int_or_none
10
11
class TEDIE(InfoExtractor):
    """Extractor for ted.com talks, playlists and "watch" pages.

    ``_VALID_URL`` decides which kind of page a URL points to.  URLs on the
    ``embed`` / ``embed-ssl`` hosts are rewritten to the ``www`` host and
    handed back to this same extractor.
    """

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is not a valid escape in a normal string literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra metadata merged into a native download format, keyed by the
    # format id found in the page's ``nativeDownloads`` mapping.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON argument of the ``q("<name>.init", {...})``
        call found in *webpage*."""
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk, watch-page or playlist extraction."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            # Embed hosts are not parsed directly: rebuild the www URL and
            # let this extractor run again on it.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk of the playlist."""
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _ted_native_formats(self, talk_info):
        """Build the formats listed in the ``nativeDownloads`` mapping.

        A missing or null mapping yields no formats; entries without a URL
        are skipped.
        """
        formats = []
        for format_id, format_url in (talk_info.get('nativeDownloads') or {}).items():
            if not format_url:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Add preference/width/height for the format ids we know about.
            fmt.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(fmt)
        return formats

    def _ted_resource_formats(self, talk_info, video_name):
        """Build the h264, rtmp and hls formats from the ``resources`` mapping.

        A missing or null mapping yields no formats.  Individual resources
        that carry no file / stream URL are skipped instead of producing a
        format without a usable URL.
        """
        formats = []
        for format_id, resources in (talk_info.get('resources') or {}).items():
            if not resources:
                continue
            if format_id == 'h264':
                for resource in resources:
                    h264_url = resource.get('file')
                    if not h264_url:
                        continue
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': h264_url,
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                    })
            elif format_id == 'rtmp':
                # rtmp resources only hold play paths; without the streamer
                # URL none of them can be used.
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources:
                    play_path = resource.get('file')
                    if not play_path:
                        continue
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': play_path,
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                stream_url = resources.get('stream')
                if not stream_url:
                    continue
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id))
        return formats

    def _talk_info(self, url, video_name):
        """Extract a single talk.

        Returns a ``url`` result when the talk is hosted by an external
        service, otherwise an info dict with all formats found on the page
        (native downloads, h264/rtmp/hls resources and the audio download).
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        formats = self._ted_native_formats(talk_info)
        formats.extend(self._ted_resource_formats(talk_info, video_name))

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                # Audio-only download: say so explicitly so it is not
                # treated as a video format of unknown codec.
                'vcodec': 'none',
            })

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        thumbnail = talk_info.get('thumb')
        if thumbnail:
            if thumbnail.startswith('//'):
                # Protocol-relative URL: only the scheme is missing.
                thumbnail = 'http:' + thumbnail
            elif not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info.get('speaker'),
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        """Return a dict mapping each language code of the talk to its
        subtitle entries (one per format: ``ted`` and ``srt``).

        Returns an empty dict when the talk lists no languages.
        """
        sub_lang_list = {}
        # "languages" may be absent or null; both mean "no subtitles".
        for language in talk_info.get('languages') or []:
            lang_code = language.get('languageCode')
            if not lang_code:
                continue
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        """Extract a "watch" page from its embedded ``pages.jwplayer`` config.

        The page name is used as the video id.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # "image" may be absent or null; the thumbnail is optional.
        thumbnail = (config.get('image') or {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }