_ Git - youtube-dl/blob - youtube_dl/extractor/frontendmaster.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import collections
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..compat import (
   9     compat_urlparse)
  10 from ..utils import (
  11     ExtractorError,
  12     urlencode_postdata,
  13     qualities, unescapeHTML)
  14
  15
  16 class FrontEndMasterBaseIE(InfoExtractor):
  17     _API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses'
  18     _VIDEO_BASE = 'http://www.frontendmasters.com/courses'
  19     _CAPTIONS_BASE = 'https://api.frontendmasters.com/v1/kabuki/transcripts'
  20     _COOKIES_BASE = 'https://api.frontendmasters.com'
  21     _LOGIN_URL = 'https://frontendmasters.com/login/'
  22
  23     _QUALITIES_PREFERENCE = ('low', 'medium', 'high')
  24     _QUALITIES = {
  25         'low': {'width': 480, 'height': 360},
  26         'medium': {'width': 1280, 'height': 720},
  27         'high': {'width': 1920, 'height': 1080}
  28     }
  29
  30     AllowedQuality = collections.namedtuple('AllowedQuality',
  31                                             ['ext', 'qualities'])
  32     _ALLOWED_QUALITIES = [
  33         AllowedQuality('webm', ['low', 'medium', 'high']),
  34         AllowedQuality('mp4', ['low', 'medium', 'high'])
  35     ]
  36
  37     def _real_initialize(self):
  38         self._login()
  39
  40     def _login(self):
  41         (username, password) = self._get_login_info()
  42         if username is None:
  43             return
  44
  45         login_page = self._download_webpage(
  46             self._LOGIN_URL, None, 'Downloading login page')
  47
  48         login_form = self._hidden_inputs(login_page)
  49
  50         login_form.update({
  51             'username': username,
  52             'password': password
  53         })
  54
  55         post_url = self._search_regex(
  56             r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
  57             'post_url', default=self._LOGIN_URL, group='url')
  58
  59         if not post_url.startswith('http'):
  60             post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
  61
  62         response = self._download_webpage(
  63             post_url, None, 'Logging in',
  64             data=urlencode_postdata(login_form),
  65             headers={'Content-Type': 'application/x-www-form-urlencoded'}
  66         )
  67
  68         error = self._search_regex(
  69             r'<div[^>]+class=["\']Message MessageAlert["\'][^>]*>'
  70             r'([^<]+)'
  71             r'</div>',
  72             response, 'error message', default=None)
  73
  74         if error:
  75             raise ExtractorError('Unable to login: %s' % unescapeHTML(error),
  76                                  expected=True)
  77
  78     def _download_course(self, course_id, url):
  79         response = self._download_json(
  80             '%s/%s' % (self._API_BASE, course_id), course_id,
  81             'Downloading course JSON',
  82             headers={
  83                 'Content-Type': 'application/json;charset=utf-8',
  84                 'Referer': url,
  85             })
  86         return response
  87
  88     @staticmethod
  89     def _pair_section_video_element(lesson_elements):
  90         sections = {}
  91         current_section = None
  92         current_section_number = 0
  93         for elem in lesson_elements:
  94             if not isinstance(elem, int):
  95                 elem_name = elem
  96                 if not isinstance(elem_name, str):
  97                     # convert unicode to str
  98                     elem_name = elem.encode('utf-8')
  99                 (current_section, current_section_number) = \
 100                     (elem_name, current_section_number + 1)
 101             else:
 102                 if current_section:
 103                     sections[elem] = (current_section, current_section_number)
 104
 105         return sections
 106
 107
 108 class FrontEndMasterIE(FrontEndMasterBaseIE):
 109     IE_NAME = 'frontend-masters'
 110     _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/' \
 111                  r'(?P<courseid>[a-z\-]+)/' \
 112                  r'(?P<id>[a-z\-]+)'
 113
 114     _NETRC_MACHINE = 'frontendmasters'
 115
 116     _TEST = {
 117         'url': 'https://frontendmasters.com/courses/web-development/tools',
 118         'md5': '7f161159710d6b7016a4f4af6fcb05e2',
 119         'info_dict': {
 120             'id': 'tools',
 121             'title': 'Tools',
 122             'display_id': 'tools',
 123             'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7',
 124             'ext': 'mp4'
 125         },
 126         'skip': 'Requires FrontendMasters account credentials',
 127     }
 128
 129     def _get_subtitles(self, video_hash, video_id):
 130         captions = self._download_webpage(
 131             '%s/%s.vtt' % (self._CAPTIONS_BASE, video_hash), video_id,
 132             fatal=False)
 133         if captions:
 134             return {
 135                 'en': [{
 136                     'ext': 'vtt',
 137                     'data': captions
 138                 }]
 139             }
 140
 141     def _real_extract(self, url):
 142         mobj = re.match(self._VALID_URL, url)
 143         video_id = mobj.group('id')
 144         course_id = mobj.group('courseid')
 145
 146         course_json_content = self._download_course(course_id=course_id,
 147                                                     url=url)
 148
 149         # Necessary to get mandatory informations like title and video_url
 150         lesson_index = course_json_content.get('lessonSlugs').index(video_id)
 151         lesson_hash = course_json_content.get('lessonHashes')[lesson_index]
 152         lesson_data = course_json_content.get('lessonData')[lesson_hash]
 153         # This is necessary to get the link for the video
 154         lesson_source_base = lesson_data['sourceBase']
 155
 156         lesson_title = lesson_data['title']
 157
 158         # Some optional fields
 159         lesson_description = lesson_data.get('description')
 160         lesson_index = lesson_data.get('index')
 161         lesson_slug = lesson_data.get('slug')
 162         lesson_thumbnail_url = lesson_data.get('thumbnail')
 163         lesson_section_elements = course_json_content.get('lessonElements')
 164
 165         try:
 166             course_sections_pairing = self._pair_section_video_element(
 167                 lesson_section_elements)
 168
 169             lesson_section = \
 170                 course_sections_pairing.get(lesson_index)[0]
 171
 172             lesson_section_number = \
 173                 course_sections_pairing.get(lesson_index)[1]
 174         except Exception:
 175             lesson_section = None
 176             lesson_section_number = None
 177
 178         video_request_url = '%s/source'
 179         video_request_headers = {
 180             'origin': 'https://frontendmasters.com',
 181             'referer': lesson_source_base,
 182         }
 183
 184         quality_key = qualities(self._QUALITIES_PREFERENCE)
 185
 186         formats = []
 187         for ext, qualities_ in self._ALLOWED_QUALITIES:
 188             for quality in qualities_:
 189                 f = self._QUALITIES[quality].copy()
 190                 video_request_params = {
 191                     'r': f['height'],
 192                     'f': ext
 193                 }
 194                 video_response = self._download_json(
 195                     video_request_url % lesson_source_base, video_id,
 196                     query=video_request_params, headers=video_request_headers)
 197
 198                 video_url = video_response.get('url')
 199                 clip_f = f.copy()
 200                 clip_f.update({
 201                     'url': video_url,
 202                     'ext': ext,
 203                     'format_id': '%s-%s' % (ext, quality),
 204                     'quality': quality_key(quality),
 205                     'height': f['height']
 206                 })
 207                 formats.append(clip_f)
 208
 209         self._sort_formats(formats)
 210
 211         subtitles = self.extract_subtitles(lesson_hash, video_id)
 212
 213         return {
 214             'id': video_id,
 215             'display_id': lesson_slug,
 216             'title': lesson_title,
 217             'description': lesson_description,
 218             'chapter': lesson_section,
 219             'chapter_number': lesson_section_number,
 220             'thumbnail': lesson_thumbnail_url,
 221             'formats': formats,
 222             'subtitles': subtitles
 223         }
 224
 225
 226 class FrontEndMasterCourseIE(FrontEndMasterBaseIE):
 227     IE_NAME = 'frontend-masters:course'
 228     _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<courseid>[a-z\-]+)/?$'
 229
 230     _NETRC_MACHINE = 'frontendmasters'
 231
 232     _TEST = {
 233         'url': 'https://frontendmasters.com/courses/javascript-basics/',
 234         'info_dict': {
 235             'id': 'javascript-basics',
 236             'title': 'Introduction to JavaScript Programming',
 237             'description': 'md5:269412fbb76d86954761599ad8e4cbc9'
 238         },
 239         'playlist_count': 19,
 240         'skip': 'Requires FrontendMasters account credentials'
 241     }
 242
 243     @classmethod
 244     def suitable(cls, url):
 245         return False if FrontEndMasterIE.suitable(url) else super(FrontEndMasterBaseIE, cls).suitable(url)
 246
 247     def _real_extract(self, url):
 248         mobj = re.match(self._VALID_URL, url)
 249         course_id = mobj.group('courseid')
 250         course_json_content = self._download_course(course_id=course_id,
 251                                                     url=url)
 252
 253         title = course_json_content.get('title')
 254         description = course_json_content.get('description')
 255         course_display_id = course_json_content.get('slug')
 256
 257         videos_data = course_json_content.get('lessonData').values()
 258         videos_data = sorted(videos_data, key=lambda video: video.get('index'))
 259
 260         entries = []
 261         for video in videos_data:
 262             video_slug = video.get('slug')
 263             clip_url = '%s/%s/%s' % (
 264                 self._VIDEO_BASE, course_display_id, video_slug)
 265             entries.append({
 266                 '_type': 'url_transparent',
 267                 'url': clip_url,
 268                 'ie_key': FrontEndMasterIE.ie_key()
 269             })
 270
 271         return self.playlist_result(entries, course_id, title, description)