from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError
class Channel9IE(InfoExtractor):
    """
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    """
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'file': 'Events_TechEd_Australia_2013_KOS002.mp4',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'file': 'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality, worst first — the index into this list is used as the
    # format 'preference' in _formats_from_html.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
55 def _restore_bytes(self, formatted_size):
56 if not formatted_size:
58 m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
61 units = m.group('units')
63 exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
66 size = float(m.group('size'))
67 return int(size * (1024 ** exponent))
69 def _formats_from_html(self, html):
72 <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
73 <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
74 (?:<div\s+class="popup\s+rounded">\s*
75 <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
76 </div>)? # File size part may be missing
78 # Extract known formats
80 'url': x.group('url'),
81 'format_id': x.group('quality'),
82 'format_note': x.group('note'),
83 'format': '%s (%s)' % (x.group('quality'), x.group('note')),
84 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
85 'preference': self._known_formats.index(x.group('quality')),
86 'vcodec': 'none' if x.group('note') == 'Audio only' else None,
87 } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
89 self._sort_formats(formats)
93 def _extract_title(self, html):
94 title = self._html_search_meta('title', html, 'title')
96 title = self._og_search_title(html)
97 TITLE_SUFFIX = ' (Channel 9)'
98 if title is not None and title.endswith(TITLE_SUFFIX):
99 title = title[:-len(TITLE_SUFFIX)]
102 def _extract_description(self, html):
103 DESCRIPTION_REGEX = r'''(?sx)
104 <div\s+class="entry-content">\s*
105 <div\s+id="entry-body">\s*
106 (?P<description>.+?)\s*
110 m = re.search(DESCRIPTION_REGEX, html)
112 return m.group('description')
113 return self._html_search_meta('description', html, 'description')
115 def _extract_duration(self, html):
116 m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
117 return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
119 def _extract_slides(self, html):
120 m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
121 return m.group('slidesurl') if m is not None else None
123 def _extract_zip(self, html):
124 m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
125 return m.group('zipurl') if m is not None else None
127 def _extract_avg_rating(self, html):
128 m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
129 return float(m.group('avgrating')) if m is not None else 0
131 def _extract_rating_count(self, html):
132 m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
133 return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
135 def _extract_view_count(self, html):
136 m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
137 return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
139 def _extract_comment_count(self, html):
140 m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
141 return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
143 def _fix_count(self, count):
144 return int(str(count).replace(',', '')) if count is not None else None
146 def _extract_authors(self, html):
147 m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
150 return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
152 def _extract_session_code(self, html):
153 m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
154 return m.group('code') if m is not None else None
156 def _extract_session_day(self, html):
157 m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
158 return m.group('day') if m is not None else None
160 def _extract_session_room(self, html):
161 m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
162 return m.group('room') if m is not None else None
164 def _extract_session_speakers(self, html):
165 return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
167 def _extract_content(self, html, content_path):
168 # Look for downloadable content
169 formats = self._formats_from_html(html)
170 slides = self._extract_slides(html)
171 zip_ = self._extract_zip(html)
173 # Nothing to download
174 if len(formats) == 0 and slides is None and zip_ is None:
175 self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
179 title = self._extract_title(html)
180 description = self._extract_description(html)
181 thumbnail = self._og_search_thumbnail(html)
182 duration = self._extract_duration(html)
183 avg_rating = self._extract_avg_rating(html)
184 rating_count = self._extract_rating_count(html)
185 view_count = self._extract_view_count(html)
186 comment_count = self._extract_comment_count(html)
188 common = {'_type': 'video',
190 'description': description,
191 'thumbnail': thumbnail,
192 'duration': duration,
193 'avg_rating': avg_rating,
194 'rating_count': rating_count,
195 'view_count': view_count,
196 'comment_count': comment_count,
201 if slides is not None:
203 d.update({ 'title': title + '-Slides', 'url': slides })
208 d.update({ 'title': title + '-Zip', 'url': zip_ })
213 d.update({ 'title': title, 'formats': formats })
218 def _extract_entry_item(self, html, content_path):
219 contents = self._extract_content(html, content_path)
223 authors = self._extract_authors(html)
225 for content in contents:
226 content['authors'] = authors
230 def _extract_session(self, html, content_path):
231 contents = self._extract_content(html, content_path)
235 session_meta = {'session_code': self._extract_session_code(html),
236 'session_day': self._extract_session_day(html),
237 'session_room': self._extract_session_room(html),
238 'session_speakers': self._extract_session_speakers(html),
241 for content in contents:
242 content.update(session_meta)
246 def _extract_list(self, content_path):
247 rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
248 entries = [self.url_result(session_url.text, 'Channel9')
249 for session_url in rss.findall('./channel/item/link')]
250 title_text = rss.find('./channel/title').text
251 return self.playlist_result(entries, content_path, title_text)
253 def _real_extract(self, url):
254 mobj = re.match(self._VALID_URL, url)
255 content_path = mobj.group('contentpath')
257 webpage = self._download_webpage(url, content_path, 'Downloading web page')
259 page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
260 if page_type_m is None:
261 raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)
263 page_type = page_type_m.group('pagetype')
264 if page_type == 'List': # List page, may contain list of 'item'-like objects
265 return self._extract_list(content_path)
266 elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
267 return self._extract_entry_item(webpage, content_path)
268 elif page_type == 'Session': # Event session page, may contain downloadable content
269 return self._extract_session(webpage, content_path)
271 raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)