[frontendmasters] Add extractor
[youtube-dl] / youtube_dl / extractor / frontendmaster.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import collections
5 import re
6
7 from .common import InfoExtractor
8 from ..compat import (
9     compat_urlparse)
10 from ..utils import (
11     ExtractorError,
12     urlencode_postdata,
13     qualities, unescapeHTML)
14
15
class FrontEndMasterBaseIE(InfoExtractor):
    """Base class for the Frontend Masters extractors.

    Holds the endpoint URLs, the quality tables, the login flow and the
    course-JSON helpers that the concrete extractors build upon.
    """

    # Endpoint roots used by this class and its subclasses.
    _API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses'
    _VIDEO_BASE = 'http://www.frontendmasters.com/courses'
    _CAPTIONS_BASE = 'https://api.frontendmasters.com/v1/kabuki/transcripts'
    _COOKIES_BASE = 'https://api.frontendmasters.com'
    _LOGIN_URL = 'https://frontendmasters.com/login/'

    # Quality names listed in ascending order (low, medium, high).
    _QUALITIES_PREFERENCE = ('low', 'medium', 'high')
    # Frame dimensions associated with every quality name.
    _QUALITIES = {
        'low': {'width': 480, 'height': 360},
        'medium': {'width': 1280, 'height': 720},
        'high': {'width': 1920, 'height': 1080}
    }

    # Pairs a container extension with the quality names requested for it.
    AllowedQuality = collections.namedtuple('AllowedQuality',
                                            ['ext', 'qualities'])
    _ALLOWED_QUALITIES = [
        AllowedQuality('webm', ['low', 'medium', 'high']),
        AllowedQuality('mp4', ['low', 'medium', 'high'])
    ]

    def _real_initialize(self):
        """Log in once when the extractor is initialized."""
        self._login()

    def _login(self):
        """Submit the login form with the configured credentials.

        Returns without doing anything when no username is available.

        Raises:
            ExtractorError: if the response page contains a
                ``Message MessageAlert`` element; its text is used as the
                error message.
        """
        (username, password) = self._get_login_info()
        if username is None:
            return

        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')

        # Start from the hidden inputs of the page and add the credentials.
        login_form = self._hidden_inputs(login_page)
        login_form.update({
            'username': username,
            'password': password
        })

        # Post to the form's own action, falling back to the login URL.
        post_url = self._search_regex(
            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
            'post_url', default=self._LOGIN_URL, group='url')

        if not post_url.startswith('http'):
            # Relative action: resolve it against the login page URL.
            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)

        response = self._download_webpage(
            post_url, None, 'Logging in',
            data=urlencode_postdata(login_form),
            headers={'Content-Type': 'application/x-www-form-urlencoded'}
        )

        error = self._search_regex(
            r'<div[^>]+class=["\']Message MessageAlert["\'][^>]*>'
            r'([^<]+)'
            r'</div>',
            response, 'error message', default=None)

        if error:
            raise ExtractorError('Unable to login: %s' % unescapeHTML(error),
                                 expected=True)

    def _download_course(self, course_id, url):
        """Download and return the JSON document describing a course.

        Args:
            course_id: course identifier appended to ``_API_BASE``.
            url: page URL, sent as the ``Referer`` header.
        """
        return self._download_json(
            '%s/%s' % (self._API_BASE, course_id), course_id,
            'Downloading course JSON',
            headers={
                'Content-Type': 'application/json;charset=utf-8',
                'Referer': url,
            })

    @staticmethod
    def _pair_section_video_element(lesson_elements):
        """Map every lesson index to the section it belongs to.

        Args:
            lesson_elements: iterable mixing section headings (any non-int
                value) with lesson indices (ints); presumably the
                ``lessonElements`` list of the course JSON. ``None`` and
                empty input are treated as "no elements".

        Returns:
            A dict ``{lesson_index: (section_heading, section_number)}``
            where sections are numbered from 1 in order of appearance.
            Lesson indices seen before any (non-empty) heading are omitted.
        """
        sections = {}
        current_section = None
        current_section_number = 0
        for elem in lesson_elements or ():
            if isinstance(elem, int):
                if current_section:
                    sections[elem] = (current_section, current_section_number)
                continue
            # Keep the heading exactly as decoded: encoding it to UTF-8
            # would yield a byte string on Python 2 and fails outright for
            # non-string values.
            current_section = elem
            current_section_number += 1

        return sections
106
107
class FrontEndMasterIE(FrontEndMasterBaseIE):
    """Extractor for a single lesson of a Frontend Masters course."""

    IE_NAME = 'frontend-masters'
    _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/' \
                 r'(?P<courseid>[a-z\-]+)/' \
                 r'(?P<id>[a-z\-]+)'

    _NETRC_MACHINE = 'frontendmasters'

    _TEST = {
        'url': 'https://frontendmasters.com/courses/web-development/tools',
        'md5': '7f161159710d6b7016a4f4af6fcb05e2',
        'info_dict': {
            'id': 'tools',
            'title': 'Tools',
            'display_id': 'tools',
            'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7',
            'ext': 'mp4'
        },
        'skip': 'Requires FrontendMasters account credentials',
    }

    def _get_subtitles(self, video_hash, video_id):
        """Download the English WebVTT captions of a lesson.

        Returns a subtitles dict with a single ``en`` entry, or an empty
        dict when the captions could not be downloaded.
        """
        captions = self._download_webpage(
            '%s/%s.vtt' % (self._CAPTIONS_BASE, video_hash), video_id,
            fatal=False)
        if not captions:
            return {}
        return {
            'en': [{
                'ext': 'vtt',
                'data': captions
            }]
        }

    def _real_extract(self, url):
        """Build the info dict of the lesson addressed by ``url``.

        Raises:
            ExtractorError: if the lesson slug is not listed in the course
                JSON, or if the course JSON lacks the hash, data, source
                base or title of the lesson.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        course_id = mobj.group('courseid')

        course = self._download_course(course_id=course_id, url=url)

        # Locate the lesson: slug -> position -> hash -> lesson data.
        lesson_slugs = course.get('lessonSlugs') or []
        if video_id not in lesson_slugs:
            raise ExtractorError(
                'Lesson %s not found in course %s' % (video_id, course_id),
                expected=True)
        lesson_position = lesson_slugs.index(video_id)

        try:
            lesson_hash = course['lessonHashes'][lesson_position]
            lesson_data = course['lessonData'][lesson_hash]
            # The source base is needed to request the video links.
            lesson_source_base = lesson_data['sourceBase']
            lesson_title = lesson_data['title']
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(
                'Incomplete course JSON for lesson %s' % video_id)

        # Optional fields.
        lesson_description = lesson_data.get('description')
        lesson_number = lesson_data.get('index')
        lesson_slug = lesson_data.get('slug')
        lesson_thumbnail_url = lesson_data.get('thumbnail')

        # Chapter information is best effort: leave it empty when the
        # section layout is missing or has an unexpected shape.
        try:
            section_by_lesson = self._pair_section_video_element(
                course.get('lessonElements') or [])
        except (AttributeError, TypeError):
            section_by_lesson = {}
        lesson_section, lesson_section_number = (
            section_by_lesson.get(lesson_number) or (None, None))

        video_request_url = '%s/source' % lesson_source_base
        video_request_headers = {
            'origin': 'https://frontendmasters.com',
            'referer': lesson_source_base,
        }

        quality_key = qualities(self._QUALITIES_PREFERENCE)

        formats = []
        for ext, qualities_ in self._ALLOWED_QUALITIES:
            for quality in qualities_:
                dimensions = self._QUALITIES[quality]
                # A single unavailable variant must not abort the whole
                # extraction, hence fatal=False.
                video_response = self._download_json(
                    video_request_url, video_id,
                    query={
                        'r': dimensions['height'],
                        'f': ext
                    },
                    headers=video_request_headers, fatal=False)

                video_url = (
                    video_response.get('url')
                    if isinstance(video_response, dict) else None)
                if not video_url:
                    # Never emit a format without a usable URL.
                    continue

                formats.append({
                    'url': video_url,
                    'ext': ext,
                    'format_id': '%s-%s' % (ext, quality),
                    'quality': quality_key(quality),
                    'width': dimensions['width'],
                    'height': dimensions['height']
                })

        self._sort_formats(formats)

        subtitles = self.extract_subtitles(lesson_hash, video_id)

        return {
            'id': video_id,
            'display_id': lesson_slug,
            'title': lesson_title,
            'description': lesson_description,
            'chapter': lesson_section,
            'chapter_number': lesson_section_number,
            'thumbnail': lesson_thumbnail_url,
            'formats': formats,
            'subtitles': subtitles
        }
224
225
class FrontEndMasterCourseIE(FrontEndMasterBaseIE):
    """Extractor turning a Frontend Masters course page into a playlist."""

    IE_NAME = 'frontend-masters:course'
    _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<courseid>[a-z\-]+)/?$'

    _NETRC_MACHINE = 'frontendmasters'

    _TEST = {
        'url': 'https://frontendmasters.com/courses/javascript-basics/',
        'info_dict': {
            'id': 'javascript-basics',
            'title': 'Introduction to JavaScript Programming',
            'description': 'md5:269412fbb76d86954761599ad8e4cbc9'
        },
        'playlist_count': 19,
        'skip': 'Requires FrontendMasters account credentials'
    }

    @classmethod
    def suitable(cls, url):
        """Match course URLs only; lesson URLs are left to FrontEndMasterIE."""
        if FrontEndMasterIE.suitable(url):
            return False
        # super() must name this class so the lookup starts right after it
        # in the MRO.
        return super(FrontEndMasterCourseIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist with one entry per lesson of the course.

        Entries are ordered by the ``index`` field of each lesson; lessons
        without an index are placed last and lessons without a slug are
        skipped because no URL can be built for them.

        Raises:
            ExtractorError: if the course JSON has no ``lessonData`` mapping.
        """
        course_id = re.match(self._VALID_URL, url).group('courseid')
        course = self._download_course(course_id=course_id, url=url)

        title = course.get('title')
        description = course.get('description')
        # Fall back to the id taken from the URL so lesson URLs never
        # contain the literal string "None".
        course_display_id = course.get('slug') or course_id

        lesson_data = course.get('lessonData')
        if not isinstance(lesson_data, dict):
            raise ExtractorError(
                'Unable to find lessons of course %s' % course_id)

        def sort_key(lesson):
            # None does not compare with ints on Python 3, so rank lessons
            # lacking an index after all the indexed ones.
            index = lesson.get('index')
            return (index is None, index or 0)

        entries = []
        for lesson in sorted(lesson_data.values(), key=sort_key):
            lesson_slug = lesson.get('slug')
            if not lesson_slug:
                continue
            entries.append({
                '_type': 'url_transparent',
                'url': '%s/%s/%s' % (
                    self._VIDEO_BASE, course_display_id, lesson_slug),
                'ie_key': FrontEndMasterIE.ie_key()
            })

        return self.playlist_result(entries, course_id, title, description)