X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fudemy.py;h=be6f3be5ed4ee395a64daa3ee7e9e8cc5e6ba542;hb=6e6bc8dae577c29c072ffc5c25078b5668435435;hp=59832b1ece75d480afdfa16c3c398a701f532a06;hpb=31d9ea4a3e641acfee8852c9324bd414047fce9e;p=youtube-dl diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 59832b1ec..be6f3be5e 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -3,24 +3,34 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_request, + compat_urlparse, ) from ..utils import ( ExtractorError, float_or_none, int_or_none, sanitized_Request, + unescapeHTML, + urlencode_postdata, ) class UdemyIE(InfoExtractor): IE_NAME = 'udemy' - _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + www\.udemy\.com/ + (?: + [^#]+\#/lecture/| + lecture/view/?\?lectureId=| + [^/]+/learn/v4/t/lecture/ + ) + (?P<id>\d+) + ''' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' - _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<' - _ALREADY_ENROLLED = '>You are already taking this course.<' _NETRC_MACHINE = 'udemy' _TESTS = [{ @@ -34,23 +44,35 @@ class UdemyIE(InfoExtractor): 'duration': 579.29, }, 'skip': 'Requires udemy account credentials', + }, { + # new URL schema + 'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906', + 'only_matching': True, }] - def _enroll_course(self, webpage, course_id): - enroll_url = self._search_regex( - r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', - webpage, 'enroll url', group='url', - default='https://www.udemy.com/course/subscribe/?courseId=%s' % course_id) - webpage = 
self._download_webpage(enroll_url, course_id, 'Enrolling in the course') - if self._SUCCESSFULLY_ENROLLED in webpage: - self.to_screen('%s: Successfully enrolled in' % course_id) - elif self._ALREADY_ENROLLED in webpage: - self.to_screen('%s: Already enrolled in' % course_id) + def _enroll_course(self, base_url, webpage, course_id): + checkout_url = unescapeHTML(self._search_regex( + r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/payment/checkout/.+?)\1', + webpage, 'checkout url', group='url', default=None)) + if checkout_url: + raise ExtractorError( + 'Course %s is not free. You have to pay for it before you can download. ' + 'Use this URL to confirm purchase: %s' % (course_id, checkout_url), expected=True) + + enroll_url = unescapeHTML(self._search_regex( + r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1', + webpage, 'enroll url', group='url', default=None)) + if enroll_url: + if not enroll_url.startswith('http'): + enroll_url = compat_urlparse.urljoin(base_url, enroll_url) + webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') + if '>You have enrolled in' in webpage: + self.to_screen('%s: Successfully enrolled in the course' % course_id) def _download_lecture(self, course_id, lecture_id): return self._download_json( 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( - course_id, lecture_id, compat_urllib_parse.urlencode({ + course_id, lecture_id, compat_urllib_parse_urlencode({ 'video_only': '', 'auto_play': '', 'fields[lecture]': 'title,description,asset', @@ -118,7 +140,7 @@ class UdemyIE(InfoExtractor): }) request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Referer', self._ORIGIN_URL) request.add_header('Origin', self._ORIGIN_URL) @@ -139,15 +161,16 @@ class UdemyIE(InfoExtractor): webpage = self._download_webpage(url, lecture_id) course_id = 
self._search_regex( - r'data-course-id=["\'](\d+)', webpage, 'course id') + (r'data-course-id=["\'](\d+)', r'"id"\s*:\s*(\d+)'), + webpage, 'course id') try: lecture = self._download_lecture(course_id, lecture_id) except ExtractorError as e: # Error could possibly mean we are not enrolled in the course if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self._enroll_course(webpage, course_id) - lecture_id = self._download_lecture(course_id, lecture_id) + self._enroll_course(url, webpage, course_id) + lecture = self._download_lecture(course_id, lecture_id) else: raise @@ -171,39 +194,57 @@ class UdemyIE(InfoExtractor): video_id = asset['id'] thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') duration = float_or_none(asset.get('data', {}).get('duration')) - outputs = asset.get('data', {}).get('outputs', {}) formats = [] - for format_ in asset.get('download_urls', {}).get('Video', []): - video_url = format_.get('file') - if not video_url: - continue - format_id = format_.get('label') - f = { - 'url': format_['file'], - 'height': int_or_none(format_id), + + def extract_output_format(src): + return { + 'url': src['url'], + 'format_id': '%sp' % (src.get('label') or format_id), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + 'vbr': int_or_none(src.get('video_bitrate_in_kbps')), + 'vcodec': src.get('video_codec'), + 'fps': int_or_none(src.get('frame_rate')), + 'abr': int_or_none(src.get('audio_bitrate_in_kbps')), + 'acodec': src.get('audio_codec'), + 'asr': int_or_none(src.get('audio_sample_rate')), + 'tbr': int_or_none(src.get('total_bitrate_in_kbps')), + 'filesize': int_or_none(src.get('file_size_in_bytes')), } - if format_id: - # Some videos contain additional metadata (e.g. 
- # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) - output = outputs.get(format_id) - if isinstance(output, dict): - f.update({ - 'format_id': '%sp' % (output.get('label') or format_id), - 'width': int_or_none(output.get('width')), - 'height': int_or_none(output.get('height')), - 'vbr': int_or_none(output.get('video_bitrate_in_kbps')), - 'vcodec': output.get('video_codec'), - 'fps': int_or_none(output.get('frame_rate')), - 'abr': int_or_none(output.get('audio_bitrate_in_kbps')), - 'acodec': output.get('audio_codec'), - 'asr': int_or_none(output.get('audio_sample_rate')), - 'tbr': int_or_none(output.get('total_bitrate_in_kbps')), - 'filesize': int_or_none(output.get('file_size_in_bytes')), - }) - else: - f['format_id'] = '%sp' % format_id - formats.append(f) + + outputs = asset.get('data', {}).get('outputs') + if not isinstance(outputs, dict): + outputs = {} + + for format_id, output in outputs.items(): + if isinstance(output, dict) and output.get('url'): + formats.append(extract_output_format(output)) + + download_urls = asset.get('download_urls') + if isinstance(download_urls, dict): + video = download_urls.get('Video') + if isinstance(video, list): + for format_ in video: + video_url = format_.get('file') + if not video_url: + continue + format_id = format_.get('label') + f = { + 'url': format_['file'], + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. 
+ # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + output = outputs.get(format_id) + if isinstance(output, dict): + output_format = extract_output_format(output) + output_format.update(f) + f = output_format + else: + f['format_id'] = '%sp' % format_id + formats.append(f) self._sort_formats(formats) @@ -238,16 +279,31 @@ class UdemyCourseIE(UdemyIE): course_id = response['id'] course_title = response.get('title') - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, course_id, 'Downloading course curriculum') - entries = [ - self.url_result( - 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), 'Udemy') - for asset in response if asset.get('assetType') or asset.get('asset_type') == 'Video' - ] + entries = [] + chapter, chapter_number = None, None + for asset in response: + asset_type = asset.get('assetType') or asset.get('asset_type') + if asset_type == 'Video': + asset_id = asset.get('id') + if asset_id: + entry = { + '_type': 'url_transparent', + 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), + 'ie_key': UdemyIE.ie_key(), + } + if chapter_number: + entry['chapter_number'] = chapter_number + if chapter: + entry['chapter'] = chapter + entries.append(entry) + elif asset.get('type') == 'chapter': + chapter_number = asset.get('index') or asset.get('object_index') + chapter = asset.get('title') return self.playlist_result(entries, course_id, course_title)