X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fudemy.py;h=71bea5363ed77ddbf476bb92050e4d675c6f13a9;hb=c9c39c22c5740c1eedcc9ce7a10f5df199ea5c78;hp=6adfb2ceea8b1d15bd124a173e9a702a49e35735;hpb=15707c7e024f1f29e7abd8ddaa362196ef2d4af6;p=youtube-dl diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 6adfb2cee..71bea5363 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_HTTPError, @@ -8,11 +10,14 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + determine_ext, + extract_attributes, ExtractorError, float_or_none, int_or_none, sanitized_Request, unescapeHTML, + urlencode_postdata, ) @@ -50,21 +55,26 @@ class UdemyIE(InfoExtractor): }] def _enroll_course(self, base_url, webpage, course_id): + def combine_url(base_url, url): + return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url + checkout_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?Phttps?://(?:www\.)?udemy\.com/payment/checkout/.+?)\1', + r'href=(["\'])(?P(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1', webpage, 'checkout url', group='url', default=None)) if checkout_url: raise ExtractorError( 'Course %s is not free. You have to pay for it before you can download. ' - 'Use this URL to confirm purchase: %s' % (course_id, checkout_url), expected=True) + 'Use this URL to confirm purchase: %s' + % (course_id, combine_url(base_url, checkout_url)), + expected=True) enroll_url = unescapeHTML(self._search_regex( r'href=(["\'])(?P(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1', webpage, 'enroll url', group='url', default=None)) if enroll_url: - if not enroll_url.startswith('http'): - enroll_url = compat_urlparse.urljoin(base_url, enroll_url) - webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') + webpage = self._download_webpage( + combine_url(base_url, enroll_url), + course_id, 'Enrolling in the course') if '>You have enrolled in' in webpage: self.to_screen('%s: Successfully enrolled in the course' % course_id) @@ -72,11 +82,8 @@ class UdemyIE(InfoExtractor): return self._download_json( 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( course_id, lecture_id, compat_urllib_parse_urlencode({ - 'video_only': '', - 'auto_play': '', - 'fields[lecture]': 'title,description,asset', + 'fields[lecture]': 'title,description,view_html,asset', 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', - 'instructorPreviewMode': 'False', })), lecture_id, 'Downloading lecture JSON') @@ -139,7 +146,7 @@ class UdemyIE(InfoExtractor): }) request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse_urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Referer', self._ORIGIN_URL) request.add_header('Origin', self._ORIGIN_URL) @@ -199,7 +206,7 @@ class UdemyIE(InfoExtractor): def extract_output_format(src): return { 'url': src['url'], - 'format_id': '%sp' % (src.get('label') or format_id), + 'format_id': '%sp' % (src.get('height') or format_id), 'width': int_or_none(src.get('width')), 'height': int_or_none(src.get('height')), 'vbr': int_or_none(src.get('video_bitrate_in_kbps')), @@ -216,9 +223,13 @@ class UdemyIE(InfoExtractor): if not isinstance(outputs, dict): outputs = {} - for format_id, output in outputs.items(): - if isinstance(output, dict) and output.get('url'): - formats.append(extract_output_format(output)) + def add_output_format_meta(f, key): + output = outputs.get(key) + if isinstance(output, dict): + output_format = extract_output_format(output) + output_format.update(f) + return output_format + return f download_urls = asset.get('download_urls') if isinstance(download_urls, dict): @@ -231,21 +242,48 @@ class UdemyIE(InfoExtractor): format_id = format_.get('label') f = { 'url': format_['file'], + 'format_id': '%sp' % format_id, 'height': int_or_none(format_id), } if format_id: # Some videos contain additional metadata (e.g. # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) - output = outputs.get(format_id) - if isinstance(output, dict): - output_format = extract_output_format(output) - output_format.update(f) - f = output_format - else: - f['format_id'] = '%sp' % format_id + f = add_output_format_meta(f, format_id) formats.append(f) - self._sort_formats(formats) + view_html = lecture.get('view_html') + if view_html: + view_html_urls = set() + for source in re.findall(r']+>', view_html): + attributes = extract_attributes(source) + src = attributes.get('src') + if not src: + continue + res = attributes.get('data-res') + height = int_or_none(res) + if src in view_html_urls: + continue + view_html_urls.add(src) + if attributes.get('type') == 'application/x-mpegURL' or determine_ext(src) == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in m3u8_formats: + m = re.search(r'/hls_(?P\d{3,4})_(?P\d{2,})/', f['url']) + if m: + if not f.get('height'): + f['height'] = int(m.group('height')) + if not f.get('tbr'): + f['tbr'] = int(m.group('tbr')) + formats.extend(m3u8_formats) + else: + formats.append(add_output_format_meta({ + 'url': src, + 'format_id': '%dp' % height if height else None, + 'height': height, + }, res)) + + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) return { 'id': video_id,