[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / linuxacademy.py
1 from __future__ import unicode_literals
2
3 import json
4 import random
5 import re
6
7 from .common import InfoExtractor
8 from ..compat import (
9     compat_b64decode,
10     compat_HTTPError,
11 )
12 from ..utils import (
13     ExtractorError,
14     orderedSet,
15     unescapeHTML,
16     urlencode_postdata,
17     urljoin,
18 )
19
20
class LinuxAcademyIE(InfoExtractor):
    """Extractor for linuxacademy.com lesson videos and course playlists.

    A lesson URL yields a single video whose id is
    ``<chapter_id>-<lesson_id>``; a ``modules/view/id/<course_id>`` URL
    yields a playlist built from the lesson links found on the course page.
    A login is attempted from ``_real_initialize`` when credentials are
    available.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?linuxacademy\.com/cp/
                        (?:
                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
                            modules/view/id/(?P<course_id>\d+)
                        )
                    '''
    _TESTS = [{
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154',
        'info_dict': {
            'id': '1498-2',
            'ext': 'mp4',
            'title': "Introduction to the Practitioner's Brief",
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
        'only_matching': True,
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/154',
        'info_dict': {
            'id': '154',
            'title': 'AWS Certified Cloud Practitioner',
            'description': 'md5:039db7e60e4aac9cf43630e0a75fa834',
        },
        'playlist_count': 41,
        'skip': 'Requires Linux Academy account credentials',
    }]

    # Endpoints and identifiers used by the login request chain in _login().
    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
    _ORIGIN_URL = 'https://linuxacademy.com'
    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
    _NETRC_MACHINE = 'linuxacademy'

    def _real_initialize(self):
        """Attempt to log in before any extraction takes place."""
        self._login()

    def _login(self):
        """Log in with the configured credentials, if any.

        Returns immediately when no username is available.  Otherwise the
        following requests are issued in order: the authorize page, the
        username/password login endpoint, the login callback and finally
        the token validation page.

        Raises ExtractorError with expected=True carrying the server's
        message when the login request is answered with HTTP 401 and the
        response body provides a 'description' or 'code'; any other
        failure is re-raised unchanged.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        def random_string():
            # 32 random characters, used below for the 'state' and 'nonce'
            # query parameters.
            # NOTE(review): the uppercase run of the alphabet has no 'W';
            # kept as-is, confirm whether that is intentional.
            return ''.join([
                random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
                for _ in range(32)])

        webpage, urlh = self._download_webpage_handle(
            self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
                'client_id': self._CLIENT_ID,
                'response_type': 'token id_token',
                'redirect_uri': self._ORIGIN_URL,
                'scope': 'openid email user_impersonation profile',
                'audience': self._ORIGIN_URL,
                'state': random_string(),
                'nonce': random_string(),
            })

        # The authorize page carries a base64 string inside an atob(...)
        # call; once decoded it is JSON whose 'extraParams' object seeds
        # the login payload.
        login_data = self._parse_json(
            self._search_regex(
                r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
                'login info', group='value'), None,
            transform_source=lambda x: compat_b64decode(x).decode('utf-8')
        )['extraParams']

        login_data.update({
            'client_id': self._CLIENT_ID,
            'redirect_uri': self._ORIGIN_URL,
            'tenant': 'lacausers',
            'connection': 'Username-Password-Authentication',
            'username': username,
            'password': password,
            'sso': 'true',
        })

        # Final URL of the authorize request; sent as the Referer of the
        # two login requests below.
        login_state_url = urlh.geturl()

        try:
            login_page = self._download_webpage(
                'https://login.linuxacademy.com/usernamepassword/login', None,
                'Downloading login page', data=json.dumps(login_data).encode(),
                headers={
                    'Content-Type': 'application/json',
                    'Origin': 'https://login.linuxacademy.com',
                    'Referer': login_state_url,
                })
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                # read() returns bytes; decode explicitly because
                # json.loads() only accepts bytes from Python 3.6 on.
                error = self._parse_json(
                    e.cause.read().decode('utf-8'), None)
                message = error.get('description') or error.get('code')
                if message:
                    raise ExtractorError(
                        '%s said: %s' % (self.IE_NAME, message), expected=True)
            # Not a 401, or the body carried no usable message: surface the
            # original download error instead of masking it.
            raise

        # The hidden form inputs of the login response are posted to the
        # callback endpoint.
        callback_page, urlh = self._download_webpage_handle(
            'https://login.linuxacademy.com/login/callback', None,
            'Downloading callback page',
            data=urlencode_postdata(self._hidden_inputs(login_page)),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'https://login.linuxacademy.com',
                'Referer': login_state_url,
            })

        # The access token is read from the URL the callback request
        # ended up at.
        access_token = self._search_regex(
            r'access_token=([^=&]+)', urlh.geturl(),
            'access token')

        self._download_webpage(
            'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
            % access_token, None, 'Downloading token validation page')

    def _real_extract(self, url):
        """Return a playlist for a course URL or a video info dict for a lesson URL."""
        mobj = re.match(self._VALID_URL, url)
        chapter_id, lesson_id, course_id = mobj.group(
            'chapter_id', 'lesson_id', 'course_id')
        # Course URLs are identified by the course id alone, lesson URLs by
        # '<chapter_id>-<lesson_id>'.
        item_id = course_id if course_id else '%s-%s' % (chapter_id, lesson_id)

        webpage = self._download_webpage(url, item_id)

        # course path
        if course_id:
            # One entry per distinct lesson link, in order of appearance.
            entries = [
                self.url_result(
                    urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key())
                for lesson_url in orderedSet(re.findall(
                    r'<a[^>]+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)',
                    webpage))]
            # Title and description are optional (default=None).
            title = unescapeHTML(self._html_search_regex(
                (r'class=["\']course-title["\'][^>]*>(?P<value>[^<]+)',
                 r'var\s+title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'),
                webpage, 'title', default=None, group='value'))
            description = unescapeHTML(self._html_search_regex(
                r'var\s+description\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
                webpage, 'description', default=None, group='value'))
            return self.playlist_result(entries, course_id, title, description)

        # single video path
        info = self._extract_jwplayer_data(
            webpage, item_id, require_title=False, m3u8_id='hls')
        # No default is given here, so a lesson page without a title is an
        # extraction error.
        title = self._search_regex(
            (r'>Lecture\s*:\s*(?P<value>[^<]+)',
             r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
            'title', group='value')
        info.update({
            'id': item_id,
            'title': title,
        })
        return info