[ted] Clarify audio/video-only formats
[youtube-dl] / youtube_dl / extractor / ted.py
1 from __future__ import unicode_literals
2
3 import json
4 import re
5
6 from .common import InfoExtractor
7
8 from ..compat import compat_str
9 from ..utils import int_or_none
10
11
12 class TEDIE(InfoExtractor):
    # Verbose-mode pattern for the URLs this extractor handles: hosts
    # www.ted.com, embed.ted.com and embed-ssl.ted.com, followed by a
    # playlist, talk or "watch" path.
    # Named groups read by _real_extract:
    #   proto       -- URL scheme including '://'
    #   type        -- host prefix ('www', 'embed' or 'embed-ssl')
    #   urlmain     -- everything from '.ted.com/' to the end of the URL
    #   type_talk   -- set when the path starts with 'talks'
    #   type_watch  -- set when the path starts with 'watch'
    #   name        -- last matched path component, used as display name
    # type_playlist is captured as well, but _real_extract does not read it.
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
27     _TESTS = [{
28         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
29         'md5': 'fc94ac279feebbce69f21c0c6ee82810',
30         'info_dict': {
31             'id': '102',
32             'ext': 'mp4',
33             'title': 'The illusion of consciousness',
34             'description': ('Philosopher Dan Dennett makes a compelling '
35                             'argument that not only don\'t we understand our own '
36                             'consciousness, but that half the time our brains are '
37                             'actively fooling us.'),
38             'uploader': 'Dan Dennett',
39             'width': 854,
40             'duration': 1308,
41         }
42     }, {
43         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
44         'md5': '226f4fb9c62380d11b7995efa4c87994',
45         'info_dict': {
46             'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
47             'ext': 'mp4',
48             'title': 'Vishal Sikka: The beauty and power of algorithms',
49             'thumbnail': 're:^https?://.+\.jpg',
50             'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
51         }
52     }, {
53         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
54         'info_dict': {
55             'id': '1972',
56             'ext': 'mp4',
57             'title': 'Be passionate. Be courageous. Be your best.',
58             'uploader': 'Gabby Giffords and Mark Kelly',
59             'description': 'md5:5174aed4d0f16021b704120360f72b92',
60             'duration': 1128,
61         },
62     }, {
63         'url': 'http://www.ted.com/playlists/who_are_the_hackers',
64         'info_dict': {
65             'id': '10',
66             'title': 'Who are the hackers?',
67         },
68         'playlist_mincount': 6,
69     }, {
70         # contains a youtube video
71         'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
72         'add_ie': ['Youtube'],
73         'info_dict': {
74             'id': '_ZG8HBuDjgc',
75             'ext': 'mp4',
76             'title': 'Douglas Adams: Parrots the Universe and Everything',
77             'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
78             'uploader': 'University of California Television (UCTV)',
79             'uploader_id': 'UCtelevision',
80             'upload_date': '20080522',
81         },
82         'params': {
83             'skip_download': True,
84         },
85     }, {
86         # YouTube video
87         'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
88         'add_ie': ['Youtube'],
89         'info_dict': {
90             'id': 'aFBIPO-P7LM',
91             'ext': 'mp4',
92             'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
93             'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
94             'uploader': 'TEDx Talks',
95             'uploader_id': 'TEDxTalks',
96             'upload_date': '20111216',
97         },
98         'params': {
99             'skip_download': True,
100         },
101     }]
102
103     _NATIVE_FORMATS = {
104         'low': {'preference': 1, 'width': 320, 'height': 180},
105         'medium': {'preference': 2, 'width': 512, 'height': 288},
106         'high': {'preference': 3, 'width': 854, 'height': 480},
107     }
108
109     def _extract_info(self, webpage):
110         info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
111                                        webpage, 'info json')
112         return json.loads(info_json)
113
114     def _real_extract(self, url):
115         m = re.match(self._VALID_URL, url, re.VERBOSE)
116         if m.group('type').startswith('embed'):
117             desktop_url = m.group('proto') + 'www' + m.group('urlmain')
118             return self.url_result(desktop_url, 'TED')
119         name = m.group('name')
120         if m.group('type_talk'):
121             return self._talk_info(url, name)
122         elif m.group('type_watch'):
123             return self._watch_info(url, name)
124         else:
125             return self._playlist_videos_info(url, name)
126
127     def _playlist_videos_info(self, url, name):
128         '''Returns the videos of the playlist'''
129
130         webpage = self._download_webpage(url, name,
131                                          'Downloading playlist webpage')
132         info = self._extract_info(webpage)
133         playlist_info = info['playlist']
134
135         playlist_entries = [
136             self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
137             for talk in info['talks']
138         ]
139         return self.playlist_result(
140             playlist_entries,
141             playlist_id=compat_str(playlist_info['id']),
142             playlist_title=playlist_info['title'])
143
144     def _talk_info(self, url, video_name):
145         webpage = self._download_webpage(url, video_name)
146         self.report_extraction(video_name)
147
148         talk_info = self._extract_info(webpage)['talks'][0]
149
150         external = talk_info.get('external')
151         if external:
152             service = external['service']
153             self.to_screen('Found video from %s' % service)
154             ext_url = None
155             if service.lower() == 'youtube':
156                 ext_url = external.get('code')
157             return {
158                 '_type': 'url',
159                 'url': ext_url or external['uri'],
160             }
161
162         formats = [{
163             'url': format_url,
164             'format_id': format_id,
165             'format': format_id,
166         } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
167         if formats:
168             for f in formats:
169                 finfo = self._NATIVE_FORMATS.get(f['format_id'])
170                 if finfo:
171                     f.update(finfo)
172
173         for format_id, resources in talk_info['resources'].items():
174             if format_id == 'h264':
175                 for resource in resources:
176                     bitrate = int_or_none(resource.get('bitrate'))
177                     formats.append({
178                         'url': resource['file'],
179                         'format_id': '%s-%sk' % (format_id, bitrate),
180                         'tbr': bitrate,
181                     })
182             elif format_id == 'rtmp':
183                 streamer = talk_info.get('streamer')
184                 if not streamer:
185                     continue
186                 for resource in resources:
187                     formats.append({
188                         'format_id': '%s-%s' % (format_id, resource.get('name')),
189                         'url': streamer,
190                         'play_path': resource['file'],
191                         'ext': 'flv',
192                         'width': int_or_none(resource.get('width')),
193                         'height': int_or_none(resource.get('height')),
194                         'tbr': int_or_none(resource.get('bitrate')),
195                     })
196             elif format_id == 'hls':
197                 hls_formats = self._extract_m3u8_formats(
198                     resources.get('stream'), video_name, 'mp4', m3u8_id=format_id)
199                 for f in hls_formats:
200                     f['acodec'] = 'none'
201                 formats.extend(hls_formats)
202
203         audio_download = talk_info.get('audioDownload')
204         if audio_download:
205             formats.append({
206                 'url': audio_download,
207                 'format_id': 'audio',
208                 'vcodec': 'none',
209             })
210
211         self._sort_formats(formats)
212
213         video_id = compat_str(talk_info['id'])
214
215         thumbnail = talk_info['thumb']
216         if not thumbnail.startswith('http'):
217             thumbnail = 'http://' + thumbnail
218         return {
219             'id': video_id,
220             'title': talk_info['title'].strip(),
221             'uploader': talk_info['speaker'],
222             'thumbnail': thumbnail,
223             'description': self._og_search_description(webpage),
224             'subtitles': self._get_subtitles(video_id, talk_info),
225             'formats': formats,
226             'duration': talk_info.get('duration'),
227         }
228
229     def _get_subtitles(self, video_id, talk_info):
230         languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
231         if languages:
232             sub_lang_list = {}
233             for l in languages:
234                 sub_lang_list[l] = [
235                     {
236                         'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
237                         'ext': ext,
238                     }
239                     for ext in ['ted', 'srt']
240                 ]
241             return sub_lang_list
242         else:
243             return {}
244
245     def _watch_info(self, url, name):
246         webpage = self._download_webpage(url, name)
247
248         config_json = self._html_search_regex(
249             r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
250             webpage, 'config')
251         config = json.loads(config_json)['config']
252         video_url = config['video']['url']
253         thumbnail = config.get('image', {}).get('url')
254
255         title = self._html_search_regex(
256             r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
257         description = self._html_search_regex(
258             [
259                 r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
260                 r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
261             ],
262             webpage, 'description', fatal=False)
263
264         return {
265             'id': name,
266             'url': video_url,
267             'title': title,
268             'thumbnail': thumbnail,
269             'description': description,
270         }