X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fudemy.py;h=71bea5363ed77ddbf476bb92050e4d675c6f13a9;hb=c9c39c22c5740c1eedcc9ce7a10f5df199ea5c78;hp=59832b1ece75d480afdfa16c3c398a701f532a06;hpb=640bb54e73779f4a941eae1f17e4be049ca575db;p=youtube-dl diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 59832b1ec..71bea5363 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -1,26 +1,40 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_request, + compat_urlparse, ) from ..utils import ( + determine_ext, + extract_attributes, ExtractorError, float_or_none, int_or_none, sanitized_Request, + unescapeHTML, + urlencode_postdata, ) class UdemyIE(InfoExtractor): IE_NAME = 'udemy' - _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + www\.udemy\.com/ + (?: + [^#]+\#/lecture/| + lecture/view/?\?lectureId=| + [^/]+/learn/v4/t/lecture/ + ) + (?P\d+) + ''' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' - _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<' - _ALREADY_ENROLLED = '>You are already taking this course.<' _NETRC_MACHINE = 'udemy' _TESTS = [{ @@ -34,28 +48,42 @@ class UdemyIE(InfoExtractor): 'duration': 579.29, }, 'skip': 'Requires udemy account credentials', + }, { + # new URL schema + 'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906', + 'only_matching': True, }] - def _enroll_course(self, webpage, course_id): - enroll_url = self._search_regex( - r'href=(["\'])(?Phttps?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', - webpage, 'enroll url', group='url', - default='https://www.udemy.com/course/subscribe/?courseId=%s' % course_id) - webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') - if self._SUCCESSFULLY_ENROLLED in webpage: - self.to_screen('%s: Successfully enrolled in' % course_id) - elif self._ALREADY_ENROLLED in webpage: - self.to_screen('%s: Already enrolled in' % course_id) + def _enroll_course(self, base_url, webpage, course_id): + def combine_url(base_url, url): + return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url + + checkout_url = unescapeHTML(self._search_regex( + r'href=(["\'])(?P(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1', + webpage, 'checkout url', group='url', default=None)) + if checkout_url: + raise ExtractorError( + 'Course %s is not free. You have to pay for it before you can download. ' + 'Use this URL to confirm purchase: %s' + % (course_id, combine_url(base_url, checkout_url)), + expected=True) + + enroll_url = unescapeHTML(self._search_regex( + r'href=(["\'])(?P(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1', + webpage, 'enroll url', group='url', default=None)) + if enroll_url: + webpage = self._download_webpage( + combine_url(base_url, enroll_url), + course_id, 'Enrolling in the course') + if '>You have enrolled in' in webpage: + self.to_screen('%s: Successfully enrolled in the course' % course_id) def _download_lecture(self, course_id, lecture_id): return self._download_json( 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( - course_id, lecture_id, compat_urllib_parse.urlencode({ - 'video_only': '', - 'auto_play': '', - 'fields[lecture]': 'title,description,asset', + course_id, lecture_id, compat_urllib_parse_urlencode({ + 'fields[lecture]': 'title,description,view_html,asset', 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', - 'instructorPreviewMode': 'False', })), lecture_id, 'Downloading lecture JSON') @@ -118,7 +146,7 @@ class UdemyIE(InfoExtractor): }) request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Referer', self._ORIGIN_URL) request.add_header('Origin', self._ORIGIN_URL) @@ -139,15 +167,16 @@ class UdemyIE(InfoExtractor): webpage = self._download_webpage(url, lecture_id) course_id = self._search_regex( - r'data-course-id=["\'](\d+)', webpage, 'course id') + (r'data-course-id=["\'](\d+)', r'"id"\s*:\s*(\d+)'), + webpage, 'course id') try: lecture = self._download_lecture(course_id, lecture_id) except ExtractorError as e: # Error could possibly mean we are not enrolled in the course if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self._enroll_course(webpage, course_id) - lecture_id = self._download_lecture(course_id, lecture_id) + self._enroll_course(url, webpage, course_id) + lecture = self._download_lecture(course_id, lecture_id) else: raise @@ -171,41 +200,90 @@ class UdemyIE(InfoExtractor): video_id = asset['id'] thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') duration = float_or_none(asset.get('data', {}).get('duration')) - outputs = asset.get('data', {}).get('outputs', {}) formats = [] - for format_ in asset.get('download_urls', {}).get('Video', []): - video_url = format_.get('file') - if not video_url: - continue - format_id = format_.get('label') - f = { - 'url': format_['file'], - 'height': int_or_none(format_id), + + def extract_output_format(src): + return { + 'url': src['url'], + 'format_id': '%sp' % (src.get('height') or format_id), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + 'vbr': int_or_none(src.get('video_bitrate_in_kbps')), + 'vcodec': src.get('video_codec'), + 'fps': int_or_none(src.get('frame_rate')), + 'abr': int_or_none(src.get('audio_bitrate_in_kbps')), + 'acodec': src.get('audio_codec'), + 'asr': int_or_none(src.get('audio_sample_rate')), + 'tbr': int_or_none(src.get('total_bitrate_in_kbps')), + 'filesize': int_or_none(src.get('file_size_in_bytes')), } - if format_id: - # Some videos contain additional metadata (e.g. - # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) - output = outputs.get(format_id) - if isinstance(output, dict): - f.update({ - 'format_id': '%sp' % (output.get('label') or format_id), - 'width': int_or_none(output.get('width')), - 'height': int_or_none(output.get('height')), - 'vbr': int_or_none(output.get('video_bitrate_in_kbps')), - 'vcodec': output.get('video_codec'), - 'fps': int_or_none(output.get('frame_rate')), - 'abr': int_or_none(output.get('audio_bitrate_in_kbps')), - 'acodec': output.get('audio_codec'), - 'asr': int_or_none(output.get('audio_sample_rate')), - 'tbr': int_or_none(output.get('total_bitrate_in_kbps')), - 'filesize': int_or_none(output.get('file_size_in_bytes')), - }) + + outputs = asset.get('data', {}).get('outputs') + if not isinstance(outputs, dict): + outputs = {} + + def add_output_format_meta(f, key): + output = outputs.get(key) + if isinstance(output, dict): + output_format = extract_output_format(output) + output_format.update(f) + return output_format + return f + + download_urls = asset.get('download_urls') + if isinstance(download_urls, dict): + video = download_urls.get('Video') + if isinstance(video, list): + for format_ in video: + video_url = format_.get('file') + if not video_url: + continue + format_id = format_.get('label') + f = { + 'url': format_['file'], + 'format_id': '%sp' % format_id, + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. + # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + f = add_output_format_meta(f, format_id) + formats.append(f) + + view_html = lecture.get('view_html') + if view_html: + view_html_urls = set() + for source in re.findall(r']+>', view_html): + attributes = extract_attributes(source) + src = attributes.get('src') + if not src: + continue + res = attributes.get('data-res') + height = int_or_none(res) + if src in view_html_urls: + continue + view_html_urls.add(src) + if attributes.get('type') == 'application/x-mpegURL' or determine_ext(src) == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in m3u8_formats: + m = re.search(r'/hls_(?P\d{3,4})_(?P\d{2,})/', f['url']) + if m: + if not f.get('height'): + f['height'] = int(m.group('height')) + if not f.get('tbr'): + f['tbr'] = int(m.group('tbr')) + formats.extend(m3u8_formats) else: - f['format_id'] = '%sp' % format_id - formats.append(f) + formats.append(add_output_format_meta({ + 'url': src, + 'format_id': '%dp' % height if height else None, + 'height': height, + }, res)) - self._sort_formats(formats) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) return { 'id': video_id, @@ -238,16 +316,31 @@ class UdemyCourseIE(UdemyIE): course_id = response['id'] course_title = response.get('title') - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, course_id, 'Downloading course curriculum') - entries = [ - self.url_result( - 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), 'Udemy') - for asset in response if asset.get('assetType') or asset.get('asset_type') == 'Video' - ] + entries = [] + chapter, chapter_number = None, None + for asset in response: + asset_type = asset.get('assetType') or asset.get('asset_type') + if asset_type == 'Video': + asset_id = asset.get('id') + if asset_id: + entry = { + '_type': 'url_transparent', + 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), + 'ie_key': UdemyIE.ie_key(), + } + if chapter_number: + entry['chapter_number'] = chapter_number + if chapter: + entry['chapter'] = chapter + entries.append(entry) + elif asset.get('type') == 'chapter': + chapter_number = asset.get('index') or asset.get('object_index') + chapter = asset.get('title') return self.playlist_result(entries, course_id, course_title)