[ted] Remove md5sum from test
[youtube-dl] / youtube_dl / extractor / ted.py
1 from __future__ import unicode_literals
2
3 import json
4 import re
5
6 from .subtitles import SubtitlesInfoExtractor
7
8 from ..utils import (
9     compat_str,
10 )
11
12
class TEDIE(SubtitlesInfoExtractor):
    '''Information extractor for ted.com talks, playlists and "watch" pages.

    ``_real_extract`` matches the URL against ``_VALID_URL`` and dispatches
    to ``_talk_info``, ``_watch_info`` or ``_playlist_videos_info`` depending
    on which named group matched.  URLs on the ``embed`` host are first
    redirected to the equivalent ``www`` URL.
    '''

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': (
                'Philosopher Dan Dennett makes a compelling '
                'argument that not only don\'t we understand our own '
                'consciousness, but that half the time our brains are '
                'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: a bare '\.' in a normal literal is an invalid
            # escape sequence; the resulting value is unchanged.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
        },
    }]

    # Extra fields merged into a native-download format whose format id
    # (the key in talk_info['nativeDownloads']) matches one of these keys.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the decoded JSON object embedded in the page.

        The object is the second argument of the ``q("<name>.init", {...})``
        call that is immediately followed by ``</script>`` in *webpage*.
        '''
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        '''Dispatch *url* to the handler matching its kind.

        Embed URLs are rewritten to the ``www`` host and handed back as a
        url result for the 'TED' extractor; talk and watch URLs go to
        ``_talk_info`` / ``_watch_info``; anything else that matched
        ``_VALID_URL`` is treated as a playlist.
        '''
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            # 'urlmain' holds everything from ".ted.com/" to the end of the
            # URL, so only the host prefix is swapped.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Return a playlist result with one url result per talk.

        Each entry points at ``http://www.ted.com/talks/<slug>``; the
        playlist id and title come from the page's ``playlist`` object.
        '''
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        '''Return the info dict for a single talk page.

        Formats are built from the talk's ``nativeDownloads`` mapping; when
        that yields nothing, the RTMP resources are used instead.  Returns
        ``None`` after listing subtitles when the ``listsubtitles``
        downloader parameter is set.
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        # A missing or null 'nativeDownloads' is treated like an empty one so
        # that the RTMP fallback below is still reached instead of raising.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = []
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Add preference/width/height for the known format ids.
            finfo = self._NATIVE_FORMATS.get(format_id)
            if finfo:
                fmt.update(finfo)
            formats.append(fmt)

        if not formats:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        thumbnail = talk_info['thumb']
        if not thumbnail.startswith('http'):
            # The value carries no scheme in this case; prepend one.
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        '''Return a dict mapping language code to its SRT subtitle URL.

        Language codes are read from ``talk_info['languages']``.  When there
        are none (key missing, null or empty) a warning is reported through
        the downloader and an empty dict is returned.
        '''
        lang_codes = [
            lang['languageCode']
            for lang in (talk_info.get('languages') or [])]
        if not lang_codes:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

        # dict() over a generator rather than a dict comprehension, keeping
        # to the syntax level used elsewhere in this file.
        return dict(
            (code, 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, code))
            for code in lang_codes)

    def _watch_info(self, url, name):
        '''Return the info dict for a ted.com/watch/... page.

        The video URL and optional thumbnail come from the JSON stored in the
        page's ``data-config`` attribute; title and description are scraped
        from the HTML.  The description lookup is non-fatal.
        '''
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }