[vgtv] Add new extractor
[youtube-dl] / youtube_dl / extractor / ted.py
1 from __future__ import unicode_literals
2
3 import json
4 import re
5
6 from .subtitles import SubtitlesInfoExtractor
7
8 from ..utils import (
9     compat_str,
10 )
11
12
class TEDIE(SubtitlesInfoExtractor):
    '''Information extractor for ted.com.

    Three kinds of URL are matched by ``_VALID_URL`` and dispatched from
    ``_real_extract``: single talks (``/talks/...``), playlists
    (``/playlists/...``) and TED Institute pages (``/watch/...``).
    ``embed.ted.com`` URLs are redirected to their ``www.ted.com``
    counterpart before extraction.
    '''

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: the pattern contains a backslash escape that is
            # meant for the regex engine, not for the string literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': '49144e345a899b8cb34d315f3b9cfeeb',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
        },
    }]

    # Extra metadata merged into a native download format whose format id
    # (the key of the page's 'nativeDownloads' mapping) matches one of these.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the decoded JSON object embedded in a talk/playlist page.

        The object is the second argument of the ``q("<name>.init", {...})``
        call that closes an inline ``<script>`` element of ``webpage``.
        '''
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        '''Dispatch ``url`` to the talk, watch-page or playlist extractor.'''
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            # Embed pages are not parsed here: rebuild the same path on the
            # www host and hand it back to this extractor.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        # The URL pattern only allows talks, watch pages and playlists, so
        # anything that reaches this point is a playlist.
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Returns the videos of the playlist'''

        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        # Every talk is handed back as a URL result for this same extractor,
        # built from the talk's slug.
        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        '''Return the info dict of a single talk.

        Formats come from the page's ``nativeDownloads`` mapping; when none
        of its entries has a URL, the RTMP resources are used instead.

        Returns ``None`` (after listing the available subtitles) when the
        ``listsubtitles`` downloader parameter is set.
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        # Entries whose URL is None are skipped.
        formats = [{
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
        } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
        if formats:
            for f in formats:
                finfo = self._NATIVE_FORMATS.get(f['format_id'])
                if finfo:
                    f.update(finfo)
        else:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        thumbnail = talk_info['thumb']
        if thumbnail.startswith('//'):
            # Scheme-relative URL: only the scheme is missing. Prepending
            # 'http://' here would produce 'http:////host/...'.
            thumbnail = 'http:' + thumbnail
        elif not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        '''Return a dict mapping language code to SRT subtitle URL.

        The language codes are read from ``talk_info['languages']``. When
        that list is missing or empty a warning is reported and an empty
        dict is returned.
        '''
        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        # dict() over a generator rather than a dict comprehension, so that
        # no syntax newer than what the rest of the file uses is required.
        return dict(
            (lang_code,
             'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, lang_code))
            for lang_code in languages)

    def _watch_info(self, url, name):
        '''Return the info dict of a ``/watch/`` page.

        The video URL and the optional thumbnail are read from the JSON
        stored in the page's ``data-config`` attribute; title and description
        are scraped from the HTML. A missing description is not fatal.
        '''
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }