Add support for ocw.mit.edu video lectures
author: Anthony Weems <amlweems@gmail.com>
Tue, 25 Feb 2014 20:44:34 +0000 (14:44 -0600)
committer: Anthony Weems <amlweems@gmail.com>
Tue, 25 Feb 2014 20:44:34 +0000 (14:44 -0600)
youtube_dl/extractor/__init__.py
youtube_dl/extractor/mit.py

index 8eff3df41827aa1894663715ff68df553f4c64f5..ee081b1f35545a50bdc5915045e6c52569f0961d 100644 (file)
@@ -137,7 +137,7 @@ from .malemotion import MalemotionIE
 from .mdr import MDRIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .mdr import MDRIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
-from .mit import TechTVMITIE, MITIE
+from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mixcloud import MixcloudIE
 from .mpora import MporaIE
 from .mofosex import MofosexIE
 from .mixcloud import MixcloudIE
 from .mpora import MporaIE
 from .mofosex import MofosexIE
index 76b717fe5dbac08b8b103a1e44192a6fbf6d2a55..7c40cb8bd3e1fd52e74e09d07bb04ac1f584fd99 100644 (file)
@@ -3,6 +3,7 @@ import json
 
 from .common import InfoExtractor
 from ..utils import (
 
 from .common import InfoExtractor
 from ..utils import (
+    compat_urlparse,
     clean_html,
     get_element_by_id,
 )
     clean_html,
     get_element_by_id,
 )
@@ -81,3 +82,56 @@ class MITIE(TechTVMITIE):
         embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage,
             u'embed url')
         return self.url_result(embed_url, ie='TechTVMIT')
         embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage,
             u'embed url')
         return self.url_result(embed_url, ie='TechTVMIT')
+
+class OCWMITIE(InfoExtractor):
+    """Extractor for lecture pages under http://ocw.mit.edu/courses/.
+
+    The page embeds its video through a JavaScript helper call.  This
+    extractor reads the arguments of that call, passes the media URL on to
+    the Youtube extractor and attaches the captions file URL (resolved
+    against _BASE_URL) to the result under the 'subtitles' key.
+    """
+    IE_NAME = u'ocw.mit.edu'
+    _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
+    _BASE_URL = u'http://ocw.mit.edu/'
+
+    # JavaScript embed helpers looked for in the page, in order of preference,
+    # each paired with the position of its captions_file argument:
+    #   ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)
+    #   ocw_embed_media(container_id, media_url, provider, page_url, image_url, captions_file)
+    # In both signatures media_url is the second argument (index 1).
+    _EMBED_CALLS = (
+        (u'ocw_embed_chapter_media', 7),
+        (u'ocw_embed_media', 5),
+    )
+
+    _TESTS = [
+        {
+            u'url': u'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
+            u'md5': u'348bef727b573c0bd9ad8a7c08c89ebd',
+            u'info_dict': {
+                u'title': u'7. Discrete Random Variables III',
+                u'description': u'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
+                u'subtitles': u'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
+            }
+        },
+        {
+            u'url': u'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
+            u'md5': u'f4a434f08f15e581eb67cec0b57bcf6f',
+            u'info_dict': {
+                u'title': u'Lec 1 _ MIT 18.01 Single Variable Calculus, Fall 2007',
+                u'subtitles': u'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        """Return a Youtube url_result for the video embedded at *url*.
+
+        The returned dict additionally carries the captions file URL under
+        the 'subtitles' key.
+
+        Raises ExtractorError if neither embed call is found in the page, or
+        if the call found has too few arguments to contain a captions file.
+        """
+        # Imported locally: the module-level `from ..utils import (...)` list
+        # does not include ExtractorError, so the raise statements below
+        # would otherwise fail with NameError.
+        from ..utils import ExtractorError
+
+        webpage = self._download_webpage(url, self.IE_NAME)
+        # NOTE(review): title and description are looked up but not attached
+        # to the returned result; kept as-is pending confirmation of intent.
+        title = self._html_search_meta(u'WT.cg_s', webpage)
+        description = self._html_search_meta(u'Description', webpage)
+
+        # The helper names are plain identifiers, so they can be substituted
+        # into the pattern without escaping.
+        for func_name, captions_index in self._EMBED_CALLS:
+            embed_call = re.search(r'%s\((.+?)\)' % func_name, webpage)
+            if embed_call:
+                break
+        else:
+            raise ExtractorError('Unable to find embedded YouTube video.')
+
+        # Strip the quotes around the JavaScript string arguments, then split
+        # the argument list on commas.
+        metadata = re.sub(r'[\'"]', u'', embed_call.group(1))
+        metadata = re.split(r', ?', metadata)
+        if len(metadata) <= captions_index:
+            raise ExtractorError(
+                'Unexpected number of arguments in %s call.' % func_name)
+
+        yt = metadata[1]
+        subs = compat_urlparse.urljoin(self._BASE_URL, metadata[captions_index])
+
+        data = self.url_result(yt, 'Youtube')
+        data['subtitles'] = subs
+
+        return data