1 from __future__ import unicode_literals
5 from .common import InfoExtractor
15 class Channel9IE(InfoExtractor):
17 Common extractor for channel9.msdn.com.
19 The type of the provided URL (video or playlist) is determined from the
20 meta Search.PageType value in the web page HTML rather than from the URL
21 itself, because the URL alone is not always enough to tell the two apart.
# URL pattern for this extractor: accepts http or https on channel9.msdn.com
# or s.ch9.ms (optionally prefixed with "www."), captures the path as the
# named group `contentpath` and an optional trailing "/RSS" segment as `rss`.
25 _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
# NOTE(review): the lines below look like fields of a list of test-case dicts
# (input URL, expected md5 and expected metadata values); the list and dict
# delimiters themselves are not visible in this view -- confirm against the
# full file.
28 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
29 'md5': '32083d4eaf1946db6d454313f44510ca',
31 'id': '6c413323-383a-49dc-88f9-a22800cab024',
33 'title': 'Developer Kick-Off Session: Stuff We Love',
34 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
36 'thumbnail': r're:https?://.*\.jpg',
37 'timestamp': 1377717420,
38 'upload_date': '20130828',
39 'session_code': 'KOS002',
40 'session_room': 'Arena 1A',
41 'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
44 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
45 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
47 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
49 'title': 'Self-service BI with Power BI - nuclear testing',
50 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
52 'thumbnail': r're:https?://.*\.jpg',
53 'timestamp': 1386381991,
54 'upload_date': '20131207',
55 'authors': ['Mike Wilmot'],
58 # low quality mp4 is best
59 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
61 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
63 'title': 'Ranges for the Standard Library',
64 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
66 'thumbnail': r're:https?://.*\.jpg',
67 'upload_date': '20150930',
68 'timestamp': 1443640735,
71 'skip_download': True,
# The remaining URLs all contain a "/RSS" path segment, i.e. they exercise
# the optional `rss` group of _VALID_URL.
74 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
76 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
79 'playlist_mincount': 100,
81 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
82 'only_matching': True,
84 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
85 'only_matching': True,
# Feed URL template; `_extract_list` substitutes its `video_id` argument
# for the "%s" placeholder.
88 _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
def _extract_list(self, video_id, rss_url=None):
    """Build a playlist result from a Channel 9 RSS feed.

    Args:
        video_id: Content path used as the playlist id and, when no
            explicit feed URL is given, substituted into ``_RSS_URL``.
        rss_url: Optional feed URL to download. When omitted (or empty),
            the URL is derived from ``video_id``.

    Returns:
        The value of ``self.playlist_result`` called with one
        ``url_result`` entry per ``./channel/item/link`` element, the
        ``video_id`` and the text of the feed's ``./channel/title``
        element (``None`` when the feed has no title element).
    """
    # Only fall back to the templated feed URL when the caller did not
    # supply one; otherwise an explicit URL (which may carry a query
    # string) would be silently discarded.
    if not rss_url:
        rss_url = self._RSS_URL % video_id
    rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
    entries = [
        self.url_result(session_url.text, 'Channel9')
        for session_url in rss.findall('./channel/item/link')]
    # `find` returns None for a missing element; do not crash on a feed
    # without a title, just leave the playlist untitled.
    title_element = rss.find('./channel/title')
    title_text = title_element.text if title_element is not None else None
    return self.playlist_result(entries, video_id, title_text)
# Entry point for a single URL: returns either a playlist built from an RSS
# feed (via `_extract_list`) or a playlist of the items found for one episode.
# NOTE(review): several statements of this method (branch headers and the
# definitions of `formats`, `q`, `subtitles`, `common`, `speakers`, `authors`,
# `contents` and `d`) are not visible in this view; comments below describe
# only the visible lines.
99 def _real_extract(self, url):
# Split the URL into the `contentpath` and optional `rss` groups of _VALID_URL.
100 content_path, rss = re.match(self._VALID_URL, url).groups()
# Hand the original URL to the feed extractor -- presumably only when the
# `rss` group matched; the guarding condition is not visible here.
103 return self._extract_list(content_path, url)
105 webpage = self._download_webpage(
106 url, content_path, 'Downloading web page')
# Look for a single-quoted `data-episode` attribute in the page; the result
# is None when it is absent (default=None).
108 episode_data = self._search_regex(
109 r"data-episode='([^']+)'", webpage, 'episode data', default=None)
# The attribute value is HTML-escaped JSON: unescape it, then parse it.
111 episode_data = self._parse_json(unescapeHTML(
112 episode_data), content_path)
113 content_id = episode_data['contentId']
# An 'api' path containing '/Sessions(' marks the episode as a session.
114 is_session = '/Sessions(' in episode_data['api']
115 content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
# Ask the API to expand related people -- presumably Speakers for sessions
# and Authors otherwise; the selecting branch is not visible here.
117 content_url += '?$expand=Speakers'
119 content_url += '?$expand=Authors'
120 content_data = self._download_json(content_url, content_id)
121 title = content_data['Title']
# `q` is defined in lines not visible here; the value looked up is used as a
# candidate media URL.
132 q_url = content_data.get(q)
139 slides = content_data.get('Slides')
140 zip_file = content_data.get('ZipFile')
# Fail only when there is nothing at all to offer: no formats, no slides
# and no zip file.
142 if not formats and not slides and not zip_file:
143 raise ExtractorError(
144 'None of recording, slides or zip are available for %s' % content_path)
# Group caption entries by their 'Language' value, defaulting to 'en'.
147 for caption in content_data.get('Captions', []):
148 caption_url = caption.get('Url')
151 subtitles.setdefault(caption.get('Language', 'en'), []).append({
# Metadata fields read from the API response; 'Description' falls back to
# 'Body' and 'Thumbnail' falls back to 'VideoPlayerPreviewImage'.
159 'description': clean_html(content_data.get('Description') or content_data.get('Body')),
160 'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
161 'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
162 'timestamp': parse_iso8601(content_data.get('PublishedDate')),
163 'avg_rating': int_or_none(content_data.get('Rating')),
164 'rating_count': int_or_none(content_data.get('RatingCount')),
165 'view_count': int_or_none(content_data.get('Views')),
166 'comment_count': int_or_none(content_data.get('CommentCount')),
167 'subtitles': subtitles,
# Collect speaker names from each entry's 'FullName'.
171 for s in content_data.get('Speakers', []):
172 speaker_name = s.get('FullName')
175 speakers.append(speaker_name)
178 'session_code': content_data.get('Code'),
179 'session_room': content_data.get('Room'),
180 'session_speakers': speakers,
# Collect author names from each entry's 'DisplayName'.
184 for a in content_data.get('Authors', []):
185 author_name = a.get('DisplayName')
188 authors.append(author_name)
189 common['authors'] = authors
# Slides, zip archive and the recording each get their own entry dict; the
# first two are distinguished by a '-Slides' / '-Zip' title suffix.
195 d.update({'title': title + '-Slides', 'url': slides})
200 d.update({'title': title + '-Zip', 'url': zip_file})
205 d.update({'title': title, 'formats': formats})
207 return self.playlist_result(contents)
# Feed-based extraction using only the content path -- presumably the
# fallback when no episode data was found; the condition is not visible here.
209 return self._extract_list(content_path)