1 from __future__ import unicode_literals
5 from .common import InfoExtractor
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined from the
    WT.entryid meta tag of the web page HTML rather than from the URL itself,
    as it is not always possible to tell the two apart from the URL alone.
    '''
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    # NOTE(review): the 'info_dict' / 'params' nesting below follows the usual
    # extractor test layout; confirm it against the test runner in use.
    # The thumbnail patterns are raw strings because '\.' is not a valid
    # escape sequence in a normal string literal.
    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'thumbnail': r're:http://.*\.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'thumbnail': r're:http://.*\.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
        {
            # low quality mp4 is best
            'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
            'info_dict': {
                'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
                'title': 'Ranges for the Standard Library',
                'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
                'thumbnail': r're:http://.*\.jpg',
            },
            'params': {
                'skip_download': True,
            },
        },
    ]

    # Template for the RSS feed of a listing page; filled with the content path.
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
def _formats_from_html(self, html):
    '''
    Collect the downloadable media formats advertised in *html*.

    Each download link (quality label, usage note and an optional file
    size popup) becomes one format dict. The resulting list is passed to
    self._sort_formats before it is returned.

    Returns a (possibly empty) list of format dicts.
    '''
    # The verbose flag has to be the very first thing in the pattern:
    # Python 3.11+ rejects global inline flags placed anywhere else,
    # including after leading whitespace.
    FORMAT_REGEX = r'''(?x)
        <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
        <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
        (?:<div\s+class="popup\s+rounded">\s*
        <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
        </div>)?                                                # File size part may be missing
    '''
    # Quality labels ordered from worst to best.
    # NOTE(review): the first row is reconstructed, it is not visible in the
    # reviewed excerpt -- confirm against the full file.
    quality = qualities((
        'MP3', 'MP4',
        'Low Quality WMV', 'Low Quality MP4',
        'Mid Quality WMV', 'Mid Quality MP4',
        'High Quality WMV', 'High Quality MP4'))

    # Iterate the match iterator directly; no intermediate list is needed.
    formats = [{
        'url': x.group('url'),
        'format_id': x.group('quality'),
        'format_note': x.group('note'),
        'format': '%s (%s)' % (x.group('quality'), x.group('note')),
        # The file size group is None when the popup is missing.
        'filesize_approx': parse_filesize(x.group('filesize')),
        'quality': quality(x.group('quality')),
        'vcodec': 'none' if x.group('note') == 'Audio only' else None,
    } for x in re.finditer(FORMAT_REGEX, html)]

    self._sort_formats(formats)

    return formats
def _extract_title(self, html):
    '''
    Return the title of the page, or None when none is found.

    The 'title' meta tag is preferred. Otherwise the OpenGraph title is
    used, with a trailing ' (Channel 9)' marker removed when present.
    '''
    meta_title = self._html_search_meta('title', html, 'title')
    if meta_title is not None:
        return meta_title

    og_title = self._og_search_title(html)
    TITLE_SUFFIX = ' (Channel 9)'
    if og_title is None or not og_title.endswith(TITLE_SUFFIX):
        return og_title
    return og_title[:-len(TITLE_SUFFIX)]
111 def _extract_description(self, html):
112 DESCRIPTION_REGEX = r'''(?sx)
113 <div\s+class="entry-content">\s*
114 <div\s+id="entry-body">\s*
115 (?P<description>.+?)\s*
119 m = re.search(DESCRIPTION_REGEX, html)
121 return m.group('description')
122 return self._html_search_meta('description', html, 'description')
124 def _extract_duration(self, html):
125 m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
126 return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
128 def _extract_slides(self, html):
129 m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
130 return m.group('slidesurl') if m is not None else None
132 def _extract_zip(self, html):
133 m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
134 return m.group('zipurl') if m is not None else None
136 def _extract_avg_rating(self, html):
137 m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
138 return float(m.group('avgrating')) if m is not None else 0
140 def _extract_rating_count(self, html):
141 m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
142 return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
144 def _extract_view_count(self, html):
145 m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
146 return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
148 def _extract_comment_count(self, html):
149 m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
150 return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
152 def _fix_count(self, count):
153 return int(str(count).replace(',', '')) if count is not None else None
155 def _extract_authors(self, html):
156 m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
159 return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
161 def _extract_session_code(self, html):
162 m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
163 return m.group('code') if m is not None else None
165 def _extract_session_day(self, html):
166 m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
167 return m.group('day').strip() if m is not None else None
169 def _extract_session_room(self, html):
170 m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
171 return m.group('room') if m is not None else None
173 def _extract_session_speakers(self, html):
174 return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
def _extract_content(self, html, content_path):
    '''
    Build one info dict per downloadable item found on the page.

    Slides, the zip archive and the recording (in that order) each yield
    their own entry; all entries share the same metadata and use
    *content_path* as id.

    Returns a list of info dicts, or None (after emitting a warning) when
    the page offers nothing to download.
    '''
    # Look for downloadable content
    formats = self._formats_from_html(html)
    slides = self._extract_slides(html)
    zip_ = self._extract_zip(html)

    # Nothing to download
    if not formats and slides is None and zip_ is None:
        self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
        return None

    # Extract meta (the extraction order is kept: title first, then the rest)
    title = self._extract_title(html)
    common = {
        '_type': 'video',
        'id': content_path,
        'description': self._extract_description(html),
        'thumbnail': self._og_search_thumbnail(html),
        'duration': self._extract_duration(html),
        'avg_rating': self._extract_avg_rating(html),
        'rating_count': self._extract_rating_count(html),
        'view_count': self._extract_view_count(html),
        'comment_count': self._extract_comment_count(html),
    }

    # Each entry is a shallow copy of the shared metadata plus its own
    # title and payload.
    entries = []
    if slides is not None:
        entries.append(dict(common, title=title + '-Slides', url=slides))
    if zip_ is not None:
        entries.append(dict(common, title=title + '-Zip', url=zip_))
    if formats:
        entries.append(dict(common, title=title, formats=formats))

    return entries
def _extract_entry_item(self, html, content_path):
    '''
    Return the single info dict of an 'Entry' page, with its authors added.

    Returns None when the page has no downloadable content.
    Raises ExtractorError when the page yields more than one entry.
    '''
    found = self._extract_content(html, content_path)
    if found is None:
        return None

    if len(found) > 1:
        raise ExtractorError('Got more than one entry')

    entry = found[0]
    entry['authors'] = self._extract_authors(html)
    return entry
def _extract_session(self, html, content_path):
    '''
    Return a playlist of the downloadable items of a 'Session' page.

    Every item is enriched with the session code, day, room and speakers.
    Returns None when the page has no downloadable content.
    '''
    items = self._extract_content(html, content_path)
    if items is None:
        return None

    session_code = self._extract_session_code(html)
    session_day = self._extract_session_day(html)
    session_room = self._extract_session_room(html)
    session_speakers = self._extract_session_speakers(html)

    for item in items:
        item.update({
            'session_code': session_code,
            'session_day': session_day,
            'session_room': session_room,
            'session_speakers': session_speakers,
        })

    return self.playlist_result(items)
def _extract_list(self, content_path):
    '''
    Return a playlist built from the RSS feed of *content_path*.

    Every ./channel/item/link element of the feed becomes a URL entry
    delegated to the 'Channel9' extractor. The playlist id is
    *content_path*; its title is the text of ./channel/title, or None
    when the feed has no such element.
    '''
    rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
    entries = [self.url_result(session_url.text, 'Channel9')
               for session_url in rss.findall('./channel/item/link')]
    # find() returns None for a missing element; do not crash on a feed
    # that carries no channel title.
    title_el = rss.find('./channel/title')
    title_text = title_el.text if title_el is not None else None
    return self.playlist_result(entries, content_path, title_text)
def _real_extract(self, url):
    '''
    Extract *url*, dispatching on the page type found in the web page.

    The page type is the part of the WT.entryid meta tag content that
    precedes the first colon:
      - 'Entry':   single item, handled by _extract_entry_item
      - 'Session': event session, handled by _extract_session
      - 'Event':   listing, handled by _extract_list
    A page without that meta tag is assumed to be a listing.

    Raises ExtractorError for any other page type.
    '''
    mobj = re.match(self._VALID_URL, url)
    content_path = mobj.group('contentpath')

    webpage = self._download_webpage(url, content_path, 'Downloading web page')

    # The dot in the meta name is escaped so that it only matches a
    # literal '.', not any character.
    page_type_m = re.search(r'<meta name="WT\.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
    if page_type_m is None:  # Assuming list
        return self._extract_list(content_path)

    page_type = page_type_m.group('pagetype')
    if page_type == 'Entry':  # Any 'item'-like page, may contain downloadable content
        return self._extract_entry_item(webpage, content_path)
    if page_type == 'Session':  # Event session page, may contain downloadable content
        return self._extract_session(webpage, content_path)
    if page_type == 'Event':
        return self._extract_list(content_path)

    raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)