1 from __future__ import unicode_literals
5 from .common import InfoExtractor
16 class Channel9IE(InfoExtractor):
# NOTE(review): this listing is elided — the embedded original line numbers
# (16, 18, 20, ...) are non-contiguous, so statements are missing between the
# visible lines (e.g. the `if not rss_url:` guard, `formats = []`,
# `subtitles = {}`, and several dict-literal braces). Comments below describe
# only what the visible code shows; anything depending on elided lines is
# hedged and marked for confirmation against the full file.
18 Common extractor for channel9.msdn.com.
20 The type of provided URL (video or playlist) is determined according to
21 meta Search.PageType from web page HTML rather than URL itself, as it is
22 not always possible to do.
# URL pattern: captures the content path plus an optional trailing /RSS
# marker (named group 'rss') that switches the extractor into playlist mode.
26 _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
# --- _TESTS: single-video case with session metadata (TechEd event talk) ---
29 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
30 'md5': '32083d4eaf1946db6d454313f44510ca',
32 'id': '6c413323-383a-49dc-88f9-a22800cab024',
34 'title': 'Developer Kick-Off Session: Stuff We Love',
35 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
37 'thumbnail': r're:https?://.*\.jpg',
38 'timestamp': 1377717420,
39 'upload_date': '20130828',
40 'session_code': 'KOS002',
41 'session_room': 'Arena 1A',
42 'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
# --- blog-post style video: carries 'authors' instead of session_* fields ---
45 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
46 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
48 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
50 'title': 'Self-service BI with Power BI - nuclear testing',
51 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
53 'thumbnail': r're:https?://.*\.jpg',
54 'timestamp': 1386381991,
55 'upload_date': '20131207',
56 'authors': ['Mike Wilmot'],
59 # low quality mp4 is best
60 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
62 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
64 'title': 'Ranges for the Standard Library',
65 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
67 'thumbnail': r're:https?://.*\.jpg',
68 'upload_date': '20150930',
69 'timestamp': 1443640735,
72 'skip_download': True,
# --- playlist case: RSS queue URL, only count is asserted ---
75 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
77 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
80 'playlist_mincount': 100,
# --- URL-shape-only tests (no download) ---
82 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
83 'only_matching': True,
85 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
86 'only_matching': True,
# Template used to derive a feed URL from a content path when the caller
# does not supply one.
89 _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
# Download an RSS feed and turn each <item><link> into a url_result entry,
# producing a playlist keyed by the content path. NOTE(review): the original
# has an `if not rss_url:` guard (elided line 92) before this assignment —
# as shown, the rss_url parameter would be unconditionally overwritten;
# confirm against the full file.
91 def _extract_list(self, video_id, rss_url=None):
93 rss_url = self._RSS_URL % video_id
94 rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
95 entries = [self.url_result(session_url.text, 'Channel9')
96 for session_url in rss.findall('./channel/item/link')]
97 title_text = rss.find('./channel/title').text
98 return self.playlist_result(entries, video_id, title_text)
100 def _real_extract(self, url):
101 content_path, rss = re.match(self._VALID_URL, url).groups()
# Presumably guarded by `if rss:` (elided) — an /RSS URL short-circuits
# straight to playlist extraction. TODO confirm.
104 return self._extract_list(content_path, url)
106 webpage = self._download_webpage(
107 url, content_path, 'Downloading web page')
# Single-video pages embed a JSON blob in a data-episode attribute;
# default=None lets us fall through to RSS handling when absent.
109 episode_data = self._search_regex(
110 r"data-episode='([^']+)'", webpage, 'episode data', default=None)
112 episode_data = self._parse_json(unescapeHTML(
113 episode_data), content_path)
114 content_id = episode_data['contentId']
# Session pages (talks) use a /Sessions(...) odata endpoint and expose
# Speakers; other content exposes Authors — the $expand choice below
# (lines 118/120, presumably behind an if/else on is_session) matches that.
115 is_session = '/Sessions(' in episode_data['api']
116 content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
118 content_url += '?$expand=Speakers'
120 content_url += '?$expand=Authors'
121 content_data = self._download_json(content_url, content_id)
122 title = content_data['Title']
# Quality ladder, worst-first; anything with '_Source.' in its URL is
# treated as better than every listed quality (see quality() below).
127 'wmv-low', 'mp4-low',
128 'wmv-mid', 'mp4-mid',
129 'wmv-high', 'mp4-high',
132 quality_key = qualities(QUALITIES)
134 def quality(quality_id, format_url):
135 return (len(QUALITIES) if '_Source.' in format_url
136 else quality_key(quality_id))
# Map the human-readable labels from the page's format <select> to the
# canonical quality ids above.
144 'Low Quality WMV': 'wmv-low',
145 'Low Quality MP4': 'mp4-low',
146 'Mid Quality WMV': 'wmv-mid',
147 'Mid Quality MP4': 'mp4-mid',
148 'High Quality WMV': 'wmv-high',
149 'High Quality MP4': 'mp4-high',
# Scrape <option> entries from the page's format selector; `urls` (init
# elided) dedupes against formats added elsewhere.
152 formats_select = self._search_regex(
153 r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage,
154 'formats select', default=None)
156 for mobj in re.finditer(
157 r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<',
159 format_url = mobj.group('url')
160 if format_url in urls:
163 format_id = mobj.group('format')
# Unknown labels fall through as their own quality id.
164 quality_id = SITE_QUALITIES.get(format_id, format_id)
167 'format_id': quality_id,
168 'quality': quality(quality_id, format_url),
# Audio-only mp3 entries must be marked video-less.
169 'vcodec': 'none' if quality_id == 'mp3' else None,
# Fallback: direct media URLs exposed as fields on the odata payload.
# NOTE(review): 'wmv-hq' is not in QUALITIES above, so it would rank as
# unknown via quality_key — confirm against the full file.
173 'VideoMP4Low': 'mp4-low',
174 'VideoWMV': 'wmv-mid',
175 'VideoMP4Medium': 'mp4-mid',
176 'VideoMP4High': 'mp4-high',
177 'VideoWMVHQ': 'wmv-hq',
180 for format_id, q in API_QUALITIES.items():
181 q_url = content_data.get(format_id)
# Skip absent fields and URLs already collected from the <select>.
182 if not q_url or q_url in urls:
188 'quality': quality(q, q_url),
191 self._sort_formats(formats)
# A page may offer slides and/or a zip download instead of (or besides)
# the recording; with none of the three there is nothing to extract.
193 slides = content_data.get('Slides')
194 zip_file = content_data.get('ZipFile')
196 if not formats and not slides and not zip_file:
197 raise ExtractorError(
198 'None of recording, slides or zip are available for %s' % content_path)
# Captions become subtitle tracks keyed by language (default 'en').
201 for caption in content_data.get('Captions', []):
202 caption_url = caption.get('Url')
205 subtitles.setdefault(caption.get('Language', 'en'), []).append({
# Metadata shared by every entry (video / slides / zip) built below.
213 'description': clean_html(content_data.get('Description') or content_data.get('Body')),
214 'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
215 'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
216 'timestamp': parse_iso8601(content_data.get('PublishedDate')),
217 'avg_rating': int_or_none(content_data.get('Rating')),
218 'rating_count': int_or_none(content_data.get('RatingCount')),
219 'view_count': int_or_none(content_data.get('Views')),
220 'comment_count': int_or_none(content_data.get('CommentCount')),
221 'subtitles': subtitles,
# Session content: collect non-empty speaker names and session fields.
225 for s in content_data.get('Speakers', []):
226 speaker_name = s.get('FullName')
229 speakers.append(speaker_name)
232 'session_code': content_data.get('Code'),
233 'session_room': content_data.get('Room'),
234 'session_speakers': speakers,
# Non-session content: collect author display names instead.
238 for a in content_data.get('Authors', []):
239 author_name = a.get('DisplayName')
242 authors.append(author_name)
243 common['authors'] = authors
# Build one entry per available artifact, suffixing the title so the
# slides/zip entries are distinguishable from the recording.
249 d.update({'title': title + '-Slides', 'url': slides})
254 d.update({'title': title + '-Zip', 'url': zip_file})
259 d.update({'title': title, 'formats': formats})
261 return self.playlist_result(contents)
# No data-episode blob on the page: treat the path as a listing and fall
# back to the derived RSS feed.
263 return self._extract_list(content_path)