[ted] Remove superfluous u prefixes
[youtube-dl] / youtube_dl / extractor / ted.py
1 from __future__ import unicode_literals
2
3 import json
4 import re
5
6 from .subtitles import SubtitlesInfoExtractor
7
8 from ..utils import (
9     compat_str,
10 )
11
12
class TEDIE(SubtitlesInfoExtractor):
    """Extractor for ted.com talk, playlist and "watch" pages.

    The URL shape matched by ``_VALID_URL`` selects one of three code paths
    in ``_real_extract``: a single talk, a playlist of talks, or a "watch"
    page. ``embed.ted.com`` URLs are redirected to their ``www`` equivalent.
    """

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': '4ea1dada91e4174b53dac2bb8ace429d',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                'argument that not only don\'t we understand our own '
                'consciousness, but that half the time our brains are '
                'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'flv',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:d89e1d8ebafdac8e55df4c219ecdbfe9',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }]

    # Extra metadata merged into a native download format whose id matches
    # one of these keys.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object embedded in *webpage*.

        The object is the second argument of the inline
        ``q("<name>.init", {...})`` script call located by the regex below.
        """
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk, watch or playlist handler."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            # Rebuild the same path on the "www" host and hand it back to
            # this extractor.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk in the playlist."""
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Extract the info dict for a single talk page.

        Direct ("native") download URLs are used when at least one is
        present; otherwise the RTMP resources are used. Returns ``None``
        after listing subtitles when the ``listsubtitles`` option is set.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        # Tolerate a missing or null 'nativeDownloads' entry so that the
        # RTMP fallback below is reached instead of raising.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = []
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Add preference/width/height for the known format ids.
            fmt.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(fmt)

        if not formats:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        # The thumbnail and speaker are optional metadata: a missing field
        # yields None rather than aborting the extraction.
        thumbnail = talk_info.get('thumb')
        if thumbnail and not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info.get('speaker'),
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Return a dict mapping language code to SRT subtitle URL.

        Emits a warning and returns an empty dict when *talk_info* lists no
        languages.
        """
        languages = [
            lang['languageCode']
            for lang in talk_info.get('languages') or []
        ]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

        # dict() over a generator rather than a dict comprehension, which
        # older Python 2 releases do not support.
        return dict(
            (
                lang_code,
                'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, lang_code),
            )
            for lang_code in languages)

    def _watch_info(self, url, name):
        """Extract the info dict for a "watch" page.

        The video URL and optional thumbnail come from the JSON stored in
        the page's ``data-config`` attribute; the title and description are
        scraped from the HTML. *name* is used as the video id.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }