Move StanfordOC IE into its own file
[youtube-dl] / youtube_dl / extractor / stanfordoc.py
1 import re
2 import socket
3 import xml.etree.ElementTree
4
5 from .common import InfoExtractor
6 from ..utils import (
7     compat_http_client,
8     compat_str,
9     compat_urllib_error,
10     compat_urllib_request,
11
12     ExtractorError,
13     orderedSet,
14     unescapeHTML,
15 )
16
17
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Three kinds of URL are recognised by ``_VALID_URL``:

    * a video page (``course`` and ``video`` query parameters): returns a
      one-element list with the info dict built from the video's metadata XML;
    * a course page (``course`` only): returns the concatenated results of
      extracting every ``VideoPage.php`` link found on the page;
    * the site root / home page: returns the concatenated results of
      extracting every ``CoursePage.php`` link found on the home page.
    """

    # Dots in the host name are escaped so they only match a literal '.'.
    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    # Common prefix of every URL this extractor builds itself.
    _MAIN_FOLDER_URL = 'http://openclassroom.stanford.edu/MainFolder/'

    def _real_extract(self, url):
        """Dispatch *url* to the video, course or root page handler.

        Returns a list of info dicts.  Raises ExtractorError when *url* does
        not match ``_VALID_URL`` or when a required download/parse step fails.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        course = mobj.group('course')
        video = mobj.group('video')
        if course and video:  # A specific video
            return self._extract_video(course, video)
        if course:  # A course page
            return self._extract_course(url, course)
        return self._extract_root()  # Root page

    def _extract_video(self, course, video):
        """Return ``[info]`` for one video, read from its metadata XML."""
        info = {
            'id': course + '_' + video,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(info['id'])
        base_url = self._MAIN_FOLDER_URL + 'courses/' + course + '/videos/'
        xml_url = base_url + video + '.xml'
        try:
            response = compat_urllib_request.urlopen(xml_url)
            try:
                meta_xml = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            info['title'] = mdoc.findall('./title')[0].text
            video_file = mdoc.findall('./videoFile')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')
        if not video_file:
            # An empty <videoFile/> element has text None; report it as bad
            # metadata instead of failing on the string concatenation below.
            raise ExtractorError(u'Invalid metadata XML file')
        info['url'] = base_url + video_file
        # Everything after the last '.' of the URL is taken as the extension.
        info['ext'] = info['url'].rpartition('.')[2]
        return [info]

    def _extract_course(self, url, course):
        """Return the results for every video page linked from a course page."""
        info = {
            'id': course,
            'type': 'playlist',
            'uploader': None,
            'upload_date': None,
        }

        coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

        # NOTE: title/description are looked up but only the per-video results
        # are returned; the lookups are kept so their reporting is unchanged.
        info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
        info['description'] = self._html_search_regex('<description>([^<]+)</description>',
            coursepage, u'description', fatal=False)

        return self._extract_linked_pages(coursepage, r'<a href="(VideoPage\.php\?[^"]+)">')

    def _extract_root(self):
        """Return the results for every course page linked from the home page."""
        info = {
            'id': 'Stanford OpenClassroom',
            'type': 'playlist',
            'uploader': None,
            'upload_date': None,
        }

        root_url = self._MAIN_FOLDER_URL + 'HomePage.php'
        # Use the same download helper as the course page so the link regex
        # below runs over the helper's return value rather than the raw
        # bytes that urlopen().read() yields on Python 3.
        # NOTE(review): the helper is expected to report the download itself,
        # so the explicit report_download_webpage() call is dropped -- confirm.
        rootpage = self._download_webpage(root_url, info['id'],
                                          errnote='Unable to download course info page')

        info['title'] = info['id']

        return self._extract_linked_pages(rootpage, r'<a href="(CoursePage\.php\?[^"]+)">')

    def _extract_linked_pages(self, page, link_pattern):
        """Extract each distinct link matched by *link_pattern* in *page*.

        *link_pattern* must have one capture group holding the href relative
        to the MainFolder directory.  Links are de-duplicated with
        ``orderedSet``, HTML-unescaped, and passed to ``self.extract``; the
        per-link result lists are concatenated in link order.
        """
        results = []
        for link in orderedSet(re.findall(link_pattern, page)):
            results += self.extract(self._MAIN_FOLDER_URL + unescapeHTML(link))
        return results