Remove the calls to 'compat_urllib_request.urlopen' in a few extractors
[youtube-dl] / youtube_dl / extractor / stanfordoc.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5     compat_str,
6
7     ExtractorError,
8     orderedSet,
9     unescapeHTML,
10 )
11
12
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford OpenClassroom pages.

    `_VALID_URL` accepts three kinds of URL, and `_real_extract` handles
    each one differently:

    * a video page (both ``course`` and ``video`` query parameters):
      a single video is extracted from its metadata XML file;
    * a course page (only ``course``): every ``VideoPage.php`` link found
      on the page is extracted in turn;
    * the site root / home page: every ``CoursePage.php`` link found on
      the home page is extracted in turn.
    """

    IE_NAME = u'stanfordoc'
    IE_DESC = u'Stanford Open ClassRoom'
    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    _TEST = {
        u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
        u'file': u'PracticalUnix_intro-environment.mp4',
        u'md5': u'544a9468546059d4e80d76265b0443b8',
        u'info_dict': {
            u"title": u"Intro Environment"
        }
    }

    # Common prefix of every page and media URL built by this extractor.
    _MAIN_FOLDER_URL = 'http://openclassroom.stanford.edu/MainFolder/'

    # Relative links to follow from a course page / the home page.
    # Raw strings with an escaped dot: the patterns must match a literal
    # ".php?", and "\?" is not a valid escape in a non-raw string literal.
    _VIDEO_LINK_RE = r'<a href="(VideoPage\.php\?[^"]+)">'
    _COURSE_LINK_RE = r'<a href="(CoursePage\.php\?[^"]+)">'

    def _real_extract(self, url):
        """Extract the video(s) reachable from `url`.

        Returns a list of video info dicts: one entry for a video page,
        or the concatenated results of every linked page for a course
        page or the home page.

        Raises ExtractorError if `url` does not match `_VALID_URL` or if
        a video's metadata XML lacks the expected elements.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        course = mobj.group('course')
        video = mobj.group('video')
        if course and video:  # A specific video
            return self._extract_video(course, video)
        if course:  # A course page
            return self._extract_course(url, course)
        return self._extract_root()  # Root page

    def _extract_video(self, course, video):
        """Build the info dict for one video from its metadata XML file."""
        info = {
            'id': course + '_' + video,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(info['id'])
        base_url = self._MAIN_FOLDER_URL + 'courses/' + course + '/videos/'
        mdoc = self._download_xml(base_url + video + '.xml', info['id'])

        # Compare against None explicitly: an XML element without children
        # is falsy, so a plain truth test would reject valid elements.
        title_el = mdoc.find('./title')
        video_file_el = mdoc.find('./videoFile')
        if title_el is None or video_file_el is None or not video_file_el.text:
            # An empty <videoFile> would otherwise surface as a TypeError
            # when concatenated with the base URL below.
            raise ExtractorError(u'Invalid metadata XML file')

        info['title'] = title_el.text
        info['url'] = base_url + video_file_el.text
        # The extension is whatever follows the last dot of the media URL.
        info['ext'] = info['url'].rpartition('.')[2]
        return [info]

    def _extract_course(self, url, course):
        """Extract every video linked from the course page at `url`."""
        # NOTE(review): this playlist metadata is assembled but is not part
        # of the return value; only the per-video results are returned.
        info = {
            'id': course,
            'type': 'playlist',
            'uploader': None,
            'upload_date': None,
        }

        coursepage = self._download_webpage(
            url, info['id'],
            note='Downloading course info page',
            errnote='Unable to download course info page')

        info['title'] = self._html_search_regex(
            '<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
        info['description'] = self._html_search_regex(
            '<description>([^<]+)</description>',
            coursepage, u'description', fatal=False)

        return self._extract_linked_pages(self._VIDEO_LINK_RE, coursepage)

    def _extract_root(self):
        """Extract every course linked from the OpenClassroom home page."""
        # NOTE(review): as for course pages, this metadata is not returned.
        info = {
            'id': 'Stanford OpenClassroom',
            'type': 'playlist',
            'uploader': None,
            'upload_date': None,
        }

        root_url = self._MAIN_FOLDER_URL + 'HomePage.php'
        rootpage = self._download_webpage(
            root_url, info['id'],
            errnote=u'Unable to download course info page')

        info['title'] = info['id']

        return self._extract_linked_pages(self._COURSE_LINK_RE, rootpage)

    def _extract_linked_pages(self, link_re, webpage):
        """Run the extractor on every distinct link in `webpage`.

        `link_re` must capture, in its first group, a link relative to the
        site's MainFolder. Duplicates are dropped via orderedSet, and the
        results of `self.extract` for each link are concatenated in order.
        """
        results = []
        for link in orderedSet(re.findall(link_re, webpage)):
            # hrefs are HTML-escaped in the page source (e.g. "&amp;").
            results += self.extract(self._MAIN_FOLDER_URL + unescapeHTML(link))
        return results