_ Git - youtube-dl/blob - youtube_dl/extractor/viidea.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..compat import (
   7     compat_urlparse,
   8     compat_str,
   9 )
  10 from ..utils import (
  11     parse_duration,
  12     js_to_json,
  13     parse_iso8601,
  14 )
  15
  16
  17 class ViideaIE(InfoExtractor):
  18     _VALID_URL = r'''(?x)http://(?:www\.)?(?:
  19             videolectures\.net|
  20             flexilearn\.viidea\.net|
  21             presentations\.ocwconsortium\.org|
  22             video\.travel-zoom\.si|
  23             video\.pomp-forum\.si|
  24             tv\.nil\.si|
  25             video\.hekovnik.com|
  26             video\.szko\.si|
  27             kpk\.viidea\.com|
  28             inside\.viidea\.net|
  29             video\.kiberpipa\.org|
  30             bvvideo\.si|
  31             kongres\.viidea\.net|
  32             edemokracija\.viidea\.com
  33         )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/*(?:[#?].*)?$'''
  34
  35     _TESTS = [{
  36         'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
  37         'info_dict': {
  38             'id': '20171_part1',
  39             'ext': 'mp4',
  40             'title': 'Automatics, robotics and biocybernetics',
  41             'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
  42             'upload_date': '20130627',
  43             'duration': 565,
  44             'thumbnail': 're:http://.*\.jpg',
  45         },
  46     }, {
  47         # video with invalid direct format links (HTTP 403)
  48         'url': 'http://videolectures.net/russir2010_filippova_nlp/',
  49         'info_dict': {
  50             'id': '14891_part1',
  51             'ext': 'flv',
  52             'title': 'NLP at Google',
  53             'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3',
  54             'duration': 5352,
  55             'thumbnail': 're:http://.*\.jpg',
  56         },
  57         'params': {
  58             # rtmp download
  59             'skip_download': True,
  60         },
  61     }, {
  62         'url': 'http://videolectures.net/deeplearning2015_montreal/',
  63         'info_dict': {
  64             'id': '23181',
  65             'title': 'Deep Learning Summer School, Montreal 2015',
  66             'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7',
  67             'timestamp': 1438560000,
  68         },
  69         'playlist_count': 30,
  70     }, {
  71         # multi part lecture
  72         'url': 'http://videolectures.net/mlss09uk_bishop_ibi/',
  73         'info_dict': {
  74             'id': '9737',
  75             'title': 'Introduction To Bayesian Inference',
  76             'timestamp': 1251622800,
  77         },
  78         'playlist': [{
  79             'info_dict': {
  80                 'id': '9737_part1',
  81                 'ext': 'wmv',
  82                 'title': 'Introduction To Bayesian Inference',
  83             },
  84         }, {
  85             'info_dict': {
  86                 'id': '9737_part2',
  87                 'ext': 'wmv',
  88                 'title': 'Introduction To Bayesian Inference',
  89             },
  90         }],
  91         'playlist_count': 2,
  92     }]
  93
  94     def _real_extract(self, url):
  95         lecture_slug, part = re.match(self._VALID_URL, url).groups()
  96
  97         webpage = self._download_webpage(url, lecture_slug)
  98
  99         cfg = self._parse_json(self._search_regex([r'cfg\s*:\s*({.+?}),[\da-zA-Z_]:\(?function', r'cfg\s*:\s*({[^}]+})'], webpage, 'cfg'), lecture_slug, js_to_json)
 100
 101         lecture_id = compat_str(cfg['obj_id'])
 102
 103         base_url = self._proto_relative_url(cfg['livepipe'], 'http:')
 104
 105         lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_id)['lecture'][0]
 106
 107         lecture_info = {
 108             'id': lecture_id,
 109             'display_id': lecture_slug,
 110             'title': lecture_data['title'],
 111             'timestamp': parse_iso8601(lecture_data.get('time')),
 112             'description': lecture_data.get('description_wiki'),
 113             'thumbnail': lecture_data.get('thumb'),
 114         }
 115
 116         entries = []
 117         parts = cfg.get('videos')
 118         if parts:
 119             if len(parts) == 1:
 120                 part = compat_str(parts[0])
 121             if part:
 122                 smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part)
 123                 smil = self._download_smil(smil_url, lecture_id)
 124                 info = self._parse_smil(smil, smil_url, lecture_id)
 125                 info['id'] = '%s_part%s' % (lecture_id, part)
 126                 switch = smil.find('.//switch')
 127                 if switch is not None:
 128                     info['duration'] = parse_duration(switch.attrib.get('dur'))
 129                 return info
 130             else:
 131                 for part in parts:
 132                     entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea'))
 133                 lecture_info['_type'] = 'multi_video'
 134         if not parts or lecture_data.get('type') == 'evt':
 135             # Probably a playlist
 136             playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id)
 137             entries = [
 138                 self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea')
 139                 for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)]
 140             lecture_info['_type'] = 'playlist'
 141
 142         lecture_info['entries'] = entries
 143         return lecture_info