from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'file': 'Events_TechEd_Australia_2013_KOS002.mp4',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'file': 'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality (index is used as format preference)
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
        """Convert a human-readable size like '616.7 MB' back to an
        approximate byte count. Returns 0 when the input is missing or
        cannot be parsed."""
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        """Scrape the download links from the page and return a sorted list
        of youtube-dl format dicts, keeping only formats listed in
        _known_formats."""
        FORMAT_REGEX = r'''
            (?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        """Extract the title, preferring the meta tag and falling back to
        OpenGraph with the ' (Channel 9)' suffix stripped."""
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        """Extract the description from the entry body, falling back to the
        description meta tag."""
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        """Return the video duration in seconds, or None if not present."""
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        """Return the slides download URL, or None."""
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        """Return the zip download URL, or None."""
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        """Return the average rating as a float, 0 if absent."""
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        """Return the number of ratings, 0 if absent."""
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        """Return the view count, 0 if absent."""
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        """Return the comment count, 0 if absent."""
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
        """Strip thousands separators from a scraped count like '1,234'."""
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        """Return the list of author names, or None when no author block
        exists on the page."""
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        """Return the session code (e.g. 'KOS002'), or None."""
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        """Return the session day (e.g. 'Day 1'), or None."""
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        """Return the session room, or None."""
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        """Return the list of session speaker names (may be empty)."""
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        """Extract all downloadable content (recording, slides, zip) from an
        item/session page. Returns a list of info dicts sharing common
        metadata, or None when nothing is downloadable."""
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return None

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {'_type': 'video',
                  'id': content_path,
                  'description': description,
                  'thumbnail': thumbnail,
                  'duration': duration,
                  'avg_rating': avg_rating,
                  'rating_count': rating_count,
                  'view_count': view_count,
                  'comment_count': comment_count,
                  }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        """Extract an 'Entry.Item' page: common content plus authors."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        """Extract a 'Session' page: common content plus session metadata."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {'session_code': self._extract_session_code(html),
                        'session_day': self._extract_session_day(html),
                        'session_room': self._extract_session_room(html),
                        'session_speakers': self._extract_session_speakers(html),
                        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_list(self, content_path):
        """Extract a 'List' page via its RSS feed and return a playlist of
        url results for each item."""
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        # Dispatch on the page type advertised by the page itself (see class
        # docstring: the URL alone is not enough to tell video from playlist)
        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')

        if page_type == 'List':  # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item':  # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':  # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)