_ Git - youtube-dl/blob - youtube_dl/extractor/stanfordoc.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     compat_str,
   6
   7     ExtractorError,
   8     orderedSet,
   9     unescapeHTML,
  10 )
  11
  12
  13 class StanfordOpenClassroomIE(InfoExtractor):
  14     IE_NAME = u'stanfordoc'
  15     IE_DESC = u'Stanford Open ClassRoom'
  16     _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
  17     _TEST = {
  18         u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
  19         u'file': u'PracticalUnix_intro-environment.mp4',
  20         u'md5': u'544a9468546059d4e80d76265b0443b8',
  21         u'info_dict': {
  22             u"title": u"Intro Environment"
  23         }
  24     }
  25
  26     def _real_extract(self, url):
  27         mobj = re.match(self._VALID_URL, url)
  28         if mobj is None:
  29             raise ExtractorError(u'Invalid URL: %s' % url)
  30
  31         if mobj.group('course') and mobj.group('video'): # A specific video
  32             course = mobj.group('course')
  33             video = mobj.group('video')
  34             info = {
  35                 'id': course + '_' + video,
  36                 'uploader': None,
  37                 'upload_date': None,
  38             }
  39
  40             self.report_extraction(info['id'])
  41             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
  42             xmlUrl = baseUrl + video + '.xml'
  43             mdoc = self._download_xml(xmlUrl, info['id'])
  44             try:
  45                 info['title'] = mdoc.findall('./title')[0].text
  46                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
  47             except IndexError:
  48                 raise ExtractorError(u'Invalid metadata XML file')
  49             info['ext'] = info['url'].rpartition('.')[2]
  50             return [info]
  51         elif mobj.group('course'): # A course page
  52             course = mobj.group('course')
  53             info = {
  54                 'id': course,
  55                 'type': 'playlist',
  56                 'uploader': None,
  57                 'upload_date': None,
  58             }
  59
  60             coursepage = self._download_webpage(url, info['id'],
  61                                         note='Downloading course info page',
  62                                         errnote='Unable to download course info page')
  63
  64             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
  65
  66             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
  67                 coursepage, u'description', fatal=False)
  68
  69             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
  70             info['list'] = [
  71                 {
  72                     'type': 'reference',
  73                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
  74                 }
  75                     for vpage in links]
  76             results = []
  77             for entry in info['list']:
  78                 assert entry['type'] == 'reference'
  79                 results += self.extract(entry['url'])
  80             return results
  81         else: # Root page
  82             info = {
  83                 'id': 'Stanford OpenClassroom',
  84                 'type': 'playlist',
  85                 'uploader': None,
  86                 'upload_date': None,
  87             }
  88
  89             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
  90             rootpage = self._download_webpage(rootURL, info['id'],
  91                 errnote=u'Unable to download course info page')
  92
  93             info['title'] = info['id']
  94
  95             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
  96             info['list'] = [
  97                 {
  98                     'type': 'reference',
  99                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
 100                 }
 101                     for cpage in links]
 102
 103             results = []
 104             for entry in info['list']:
 105                 assert entry['type'] == 'reference'
 106                 results += self.extract(entry['url'])
 107             return results