1 from __future__ import unicode_literals
5 from .common import InfoExtractor
13 class Channel9IE(InfoExtractor):
15 Common extractor for channel9.msdn.com.
17 The type of provided URL (video or playlist) is determined according to
18 meta Search.PageType from web page HTML rather than URL itself, as it is
19 not always possible to do.
23 _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
27 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
28 'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
30 'id': 'Events/TechEd/Australia/2013/KOS002',
32 'title': 'Developer Kick-Off Session: Stuff We Love',
33 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
35 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
36 'session_code': 'KOS002',
37 'session_day': 'Day 1',
38 'session_room': 'Arena 1A',
39 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
43 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
44 'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
46 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
48 'title': 'Self-service BI with Power BI - nuclear testing',
49 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
51 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
52 'authors': ['Mike Wilmot'],
57 _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
    def _formats_from_html(self, html):
        """Build the list of downloadable formats from the page's download
        anchors: each entry carries the direct URL, a human quality label,
        a usage note and (when present) an approximate file size; the list
        is sorted before use.

        NOTE(review): this view elides the FORMAT_REGEX assignment head,
        the qualities() preference binding and the comprehension opener —
        visible code is kept byte-identical.
        """
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)? # File size part may be missing
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            # Audio-only downloads carry no video codec.
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in list(re.finditer(FORMAT_REGEX, html))]

        self._sort_formats(formats)
    def _extract_title(self, html):
        """Extract the page title, preferring the <meta> title and falling
        back to Open Graph, and strip the trailing ' (Channel 9)' suffix.

        NOTE(review): the guard between the two lookups and the final
        return are elided in this view; visible code kept byte-identical.
        """
        title = self._html_search_meta('title', html, 'title')
        # Fallback lookup via the og:title property.
        title = self._og_search_title(html)
        TITLE_SUFFIX = ' (Channel 9)'
        if title is not None and title.endswith(TITLE_SUFFIX):
            title = title[:-len(TITLE_SUFFIX)]
    def _extract_description(self, html):
        """Pull the entry body text out of the page markup, falling back to
        the description <meta> tag when the entry markup is absent.

        NOTE(review): the tail of DESCRIPTION_REGEX and the None-check
        guarding the group access are elided in this view; visible code
        kept byte-identical.
        """
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
        m = re.search(DESCRIPTION_REGEX, html)
        return m.group('description')
        # Fallback: the <meta name="description"> content.
        return self._html_search_meta('description', html, 'description')
109 def _extract_duration(self, html):
110 m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
111 return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
113 def _extract_slides(self, html):
114 m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
115 return m.group('slidesurl') if m is not None else None
117 def _extract_zip(self, html):
118 m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
119 return m.group('zipurl') if m is not None else None
121 def _extract_avg_rating(self, html):
122 m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
123 return float(m.group('avgrating')) if m is not None else 0
125 def _extract_rating_count(self, html):
126 m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
127 return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
129 def _extract_view_count(self, html):
130 m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
131 return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
133 def _extract_comment_count(self, html):
134 m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
135 return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
137 def _fix_count(self, count):
138 return int(str(count).replace(',', '')) if count is not None else None
    def _extract_authors(self, html):
        """Return the list of author names linked from the author block.

        NOTE(review): the guard handling a page without an author block is
        elided in this view; visible code kept byte-identical.
        """
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
146 def _extract_session_code(self, html):
147 m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
148 return m.group('code') if m is not None else None
150 def _extract_session_day(self, html):
151 m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
152 return m.group('day').strip() if m is not None else None
154 def _extract_session_room(self, html):
155 m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
156 return m.group('room') if m is not None else None
158 def _extract_session_speakers(self, html):
159 return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
    def _extract_content(self, html, content_path):
        """Collect every downloadable item on the page (recording, slides,
        zip) together with metadata shared by all of them.

        NOTE(review): several lines are elided in this view (the early exit
        after the warning, the common-metadata dict head, the per-item
        copies and appends, and the return); visible code kept
        byte-identical.
        """
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)
        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
        # Shared metadata extracted once for all items on the page.
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        # Slides and zip become separate entries with suffixed titles.
        if slides is not None:
            d.update({'title': title + '-Slides', 'url': slides})
            d.update({'title': title + '-Zip', 'url': zip_})
            d.update({'title': title, 'formats': formats})
    def _extract_entry_item(self, html, content_path):
        """Extract a single 'Entry' page: exactly one content item,
        augmented with its authors list.

        NOTE(review): the None-propagation guard, the single-result binding
        and the return are elided in this view; visible code kept
        byte-identical.
        """
        contents = self._extract_content(html, content_path)
        # An Entry page is expected to yield exactly one item.
        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result['authors'] = self._extract_authors(html)
    def _extract_session(self, html, content_path):
        """Extract an event 'Session' page: every content item annotated
        with the shared session metadata (code, day, room, speakers) and
        returned as a playlist.

        NOTE(review): the empty-contents guard and the session_meta dict
        opener are elided in this view; visible code kept byte-identical.
        """
        contents = self._extract_content(html, content_path)
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        # Propagate the shared session metadata onto every item.
        for content in contents:
            content.update(session_meta)
        return self.playlist_result(contents)
242 def _extract_list(self, content_path):
243 rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
244 entries = [self.url_result(session_url.text, 'Channel9')
245 for session_url in rss.findall('./channel/item/link')]
246 title_text = rss.find('./channel/title').text
247 return self.playlist_result(entries, content_path, title_text)
    def _real_extract(self, url):
        """Dispatch on the page's WT.entryid meta tag: 'Entry' and
        'Session' pages are extracted directly, 'Event' pages become
        playlists; pages without the tag are assumed to be lists.

        NOTE(review): the 'else:' introducing the final raise is elided in
        this view; visible code kept byte-identical.
        """
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')
        webpage = self._download_webpage(url, content_path, 'Downloading web page')
        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is not None:
            page_type = page_type_m.group('pagetype')
            if page_type == 'Entry':  # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
                # Unknown page type: fail loudly rather than guess.
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
        else:  # Assuming list
            return self._extract_list(content_path)