[ted] Add support for watch/ URLs (Fixes #2637)
[youtube-dl] / youtube_dl / extractor / ted.py
1 from __future__ import unicode_literals
2
3 import json
4 import re
5
6 from .subtitles import SubtitlesInfoExtractor
7
8 from ..utils import (
9     compat_str,
10 )
11
12
class TEDIE(SubtitlesInfoExtractor):
    """Extractor for ted.com talk, playlist, watch and embed URLs.

    ``_real_extract`` matches the URL against ``_VALID_URL`` and dispatches
    to ``_talk_info``, ``_watch_info`` or ``_playlist_videos_info``; embed
    URLs are first rewritten to their ``www`` equivalent.
    """

    # NOTE: the '#' comments inside the pattern are part of the verbose regex
    # itself and are ignored by the regex engine thanks to (?x).
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': '4ea1dada91e4174b53dac2bb8ace429d',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': (
                'Philosopher Dan Dennett makes a compelling '
                'argument that not only don\'t we understand our own '
                'consciousness, but that half the time our brains are '
                'actively fooling us.'),
            'uploader': 'Dan Dennett',
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }]

    # Preference assigned to each known native-download format id; ids that
    # are not listed here get -1 in _talk_info.
    _FORMATS_PREFERENCE = {
        'low': 1,
        'medium': 2,
        'high': 3,
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object embedded in *webpage*.

        The object is the second argument of the page's
        ``q("<name>.init", {...})`` script call.
        """
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the handler matching its type."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            # Rebuild the same path on the "www" host and let this extractor
            # handle the resulting URL.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        # Only the playlist alternative of _VALID_URL is left.
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk.

        Entries point at ``http://www.ted.com/talks/<slug>`` for every item
        of the page info's ``talks`` list; the playlist id and title come
        from its ``playlist`` object.
        """
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result(
                'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Return the info dict for a single talk page.

        Returns None (after listing the available subtitles) when the
        ``listsubtitles`` downloader option is set.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        # The talk being viewed is the first entry of the 'talks' list.
        talk_info = self._extract_info(webpage)['talks'][0]

        formats = [{
            'ext': 'mp4',
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
            'preference': self._FORMATS_PREFERENCE.get(format_id, -1),
        } for (format_id, format_url) in talk_info['nativeDownloads'].items()]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # Subtitles: extract_subtitles / _list_available_subtitles are not
        # defined in this class (presumably inherited from
        # SubtitlesInfoExtractor).
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        thumbnail = talk_info['thumb']
        if not thumbnail.startswith('http'):
            # The page may give the thumbnail without a scheme.
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Map each language code of *talk_info* to its SRT subtitle URL.

        Reports a warning and returns an empty dict when the talk lists no
        languages.
        """
        sub_lang_list = {}
        for lang in talk_info.get('languages', []):
            lang_code = lang['languageCode']
            sub_lang_list[lang_code] = (
                'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt'
                % (video_id, lang_code))

        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
        return sub_lang_list

    def _watch_info(self, url, name):
        """Return the info dict for a ``/watch/`` page.

        The video URL and optional thumbnail are read from the JSON stored
        in the page's ``data-config`` attribute; title and description are
        scraped from the HTML (a missing description is not fatal).
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }