[channel9] fix extraction (closes #11323)
[youtube-dl] / youtube_dl / extractor / channel9.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7     ExtractorError,
8     unescapeHTML,
9     int_or_none,
10     parse_iso8601,
11     clean_html,
12 )
13
14
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'

    _TESTS = [{
        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        'md5': '32083d4eaf1946db6d454313f44510ca',
        'info_dict': {
            'id': '6c413323-383a-49dc-88f9-a22800cab024',
            'ext': 'wmv',
            'title': 'Developer Kick-Off Session: Stuff We Love',
            'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
            'duration': 4576,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1377717420,
            'upload_date': '20130828',
            'session_code': 'KOS002',
            'session_room': 'Arena 1A',
            'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
        },
    }, {
        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
        'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
        'info_dict': {
            'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
            'ext': 'wmv',
            'title': 'Self-service BI with Power BI - nuclear testing',
            'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
            'duration': 1540,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1386381991,
            'upload_date': '20131207',
            'authors': ['Mike Wilmot'],
        },
    }, {
        # low quality mp4 is best
        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
        'info_dict': {
            'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
            'ext': 'mp4',
            'title': 'Ranges for the Standard Library',
            'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
            'duration': 5646,
            'thumbnail': r're:https?://.*\.jpg',
            'upload_date': '20150930',
            'timestamp': 1443640735,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
        'info_dict': {
            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
            'title': 'Channel 9',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
        'only_matching': True,
    }, {
        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
        'only_matching': True,
    }]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _extract_list(self, video_id, rss_url=None):
        """Download an RSS feed and yield each item link as a playlist entry.

        video_id -- content path used as the playlist id (and to build the
                    RSS URL when rss_url is not given)
        rss_url  -- explicit feed URL; defaults to _RSS_URL % video_id
        Returns a playlist info dict.
        """
        if not rss_url:
            rss_url = self._RSS_URL % video_id
        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
        entries = [
            self.url_result(session_url.text, 'Channel9')
            for session_url in rss.findall('./channel/item/link')]
        # A malformed feed may lack <channel><title>; fall back to no title
        # instead of raising AttributeError on None.
        title_node = rss.find('./channel/title')
        title_text = title_node.text if title_node is not None else None
        return self.playlist_result(entries, video_id, title_text)

    def _real_extract(self, url):
        content_path, rss = re.match(self._VALID_URL, url).groups()

        # /RSS URLs are always playlists; no need to inspect the page.
        if rss:
            return self._extract_list(content_path, url)

        webpage = self._download_webpage(
            url, content_path, 'Downloading web page')

        # Video pages embed their OData metadata in a data-episode attribute;
        # its absence means the URL points at a listing page instead.
        episode_data = self._search_regex(
            r"data-episode='([^']+)'", webpage, 'episode data', default=None)
        if episode_data:
            episode_data = self._parse_json(unescapeHTML(
                episode_data), content_path)
            content_id = episode_data['contentId']
            is_session = '/Sessions(' in episode_data['api']
            content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
            # Sessions carry speaker records, other content carries authors;
            # ask the OData endpoint to inline the relevant one.
            if is_session:
                content_url += '?$expand=Speakers'
            else:
                content_url += '?$expand=Authors'
            content_data = self._download_json(content_url, content_id)
            title = content_data['Title']

            # Worst-first order: youtube-dl treats the last format in the
            # list as the preferred one.
            formats = []
            qualities = [
                'VideoMP4Low',
                'VideoWMV',
                'VideoMP4Medium',
                'VideoMP4High',
                'VideoWMVHQ',
            ]
            for q in qualities:
                q_url = content_data.get(q)
                if not q_url:
                    continue
                formats.append({
                    'format_id': q,
                    'url': q_url,
                })
            slides = content_data.get('Slides')
            zip_file = content_data.get('ZipFile')

            if not formats and not slides and not zip_file:
                raise ExtractorError(
                    'None of recording, slides or zip are available for %s' % content_path)

            # `or []` rather than a .get default: the API may return an
            # explicit JSON null for these keys, which .get would pass through.
            subtitles = {}
            for caption in content_data.get('Captions') or []:
                caption_url = caption.get('Url')
                if not caption_url:
                    continue
                subtitles.setdefault(caption.get('Language', 'en'), []).append({
                    'url': caption_url,
                    'ext': 'vtt',
                })

            # Metadata shared by every downloadable artifact of this episode.
            common = {
                'id': content_id,
                'title': title,
                'description': clean_html(content_data.get('Description') or content_data.get('Body')),
                'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
                'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
                'timestamp': parse_iso8601(content_data.get('PublishedDate')),
                'avg_rating': int_or_none(content_data.get('Rating')),
                'rating_count': int_or_none(content_data.get('RatingCount')),
                'view_count': int_or_none(content_data.get('Views')),
                'comment_count': int_or_none(content_data.get('CommentCount')),
                'subtitles': subtitles,
            }
            if is_session:
                common.update({
                    'session_code': content_data.get('Code'),
                    'session_room': content_data.get('Room'),
                    'session_speakers': [
                        s['FullName'] for s in content_data.get('Speakers') or []
                        if s.get('FullName')],
                })
            else:
                common['authors'] = [
                    a['DisplayName'] for a in content_data.get('Authors') or []
                    if a.get('DisplayName')]

            # An episode can yield up to three entries: slides, a zip
            # archive, and the recording itself.
            contents = []

            if slides:
                d = common.copy()
                d.update({'title': title + '-Slides', 'url': slides})
                contents.append(d)

            if zip_file:
                d = common.copy()
                d.update({'title': title + '-Zip', 'url': zip_file})
                contents.append(d)

            if formats:
                d = common.copy()
                d.update({'title': title, 'formats': formats})
                contents.append(d)
            return self.playlist_result(contents)
        else:
            return self._extract_list(content_path)