[udemy] Extract outputs renditions (closes #16289, closes #16291, closes #16320,...
[youtube-dl] / youtube_dl / extractor / udemy.py
index 207c4a6a7ee8131c3e2e5d5823aefb336ad47c47..bf1134e3ffb5c80d56898da57abe0ce84741581d 100644 (file)
@@ -5,6 +5,7 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
+    compat_kwargs,
     compat_str,
     compat_urllib_request,
     compat_urlparse,
@@ -57,16 +58,20 @@ class UdemyIE(InfoExtractor):
         # no url in outputs format entry
         'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812',
         'only_matching': True,
+    }, {
+        # only outputs rendition
+        'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0',
+        'only_matching': True,
     }]
 
     def _extract_course_info(self, webpage, video_id):
+        """Return a (course_id, course_title) tuple extracted from webpage.
+
+        The course object is read from the JSON assigned to `course=` inside
+        an ng-init attribute. When the attribute does not match, the regex
+        default '{}' is parsed instead, and a falsy parse result is replaced
+        by an empty dict, so the returned title may be None.
+        """
         course = self._parse_json(
             unescapeHTML(self._search_regex(
-                r'ng-init=["\'].*\bcourse=({.+?});', webpage, 'course', default='{}')),
+                r'ng-init=["\'].*\bcourse=({.+?})[;"\']',
+                webpage, 'course', default='{}')),
             video_id, fatal=False) or {}
+        # Prefer the id from the parsed course JSON; otherwise look for a
+        # data-course-id attribute in the page markup.
         course_id = course.get('id') or self._search_regex(
-            (r'"id"\s*:\s*(\d+)', r'data-course-id=["\'](\d+)'),
-            webpage, 'course id')
+            r'data-course-id=["\'](\d+)', webpage, 'course id')
         return course_id, course.get('title')
 
     def _enroll_course(self, base_url, webpage, course_id):
@@ -114,6 +119,11 @@ class UdemyIE(InfoExtractor):
                 error_str += ' - %s' % error_data.get('formErrors')
             raise ExtractorError(error_str, expected=True)
 
+    def _download_webpage_handle(self, *args, **kwargs):
+        """Download a page, always sending a desktop Safari User-Agent.
+
+        Takes the same positional and keyword arguments as the parent
+        class implementation and forwards them, except that the 'headers'
+        keyword argument is replaced by a copy carrying a fixed
+        'User-Agent' entry (overriding any value the caller supplied).
+        Returns whatever the parent implementation returns.
+        """
+        # Work on a private copy: writing into the dict object received
+        # from the caller would leak the forced User-Agent into any other
+        # request that reuses that same dict (e.g. a shared default).
+        # `or {}` also tolerates an explicit headers=None.
+        headers = dict(kwargs.get('headers') or {})
+        headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4'
+        kwargs['headers'] = headers
+        return super(UdemyIE, self)._download_webpage_handle(
+            *args, **compat_kwargs(kwargs))
+
     def _download_json(self, url_or_request, *args, **kwargs):
         headers = {
             'X-Udemy-Snail-Case': 'true',
@@ -164,7 +174,7 @@ class UdemyIE(InfoExtractor):
         })
 
         response = self._download_webpage(
-            self._LOGIN_URL, None, 'Logging in as %s' % username,
+            self._LOGIN_URL, None, 'Logging in',
             data=urlencode_postdata(login_form),
             headers={
                 'Referer': self._ORIGIN_URL,
@@ -257,6 +267,11 @@ class UdemyIE(InfoExtractor):
                 video_url = source.get('file') or source.get('src')
                 if not video_url or not isinstance(video_url, compat_str):
                     continue
+                if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id='hls', fatal=False))
+                    continue
                 format_id = source.get('label')
                 f = {
                     'url': video_url,
@@ -346,6 +361,12 @@ class UdemyIE(InfoExtractor):
                     fatal=False)
                 extract_subtitles(text_tracks)
 
+        if not formats and outputs:
+            for format_id, output in outputs.items():
+                f = extract_output_format(output, format_id)
+                if f.get('url'):
+                    formats.append(f)
+
         self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
 
         return {