import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    format_bytes,
)


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of the provided URL (video or playlist) is determined from the
    Search.PageType meta tag in the page HTML rather than from the URL itself,
    as the URL alone does not always make this possible.
    '''
    IE_DESC = u'Channel 9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
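
    # Dispatching in _real_extract() keys off this meta tag, e.g.
    # <meta name="Search.PageType" content="Session"/> (the content value shown
    # here is only an illustration); the values handled below are 'List',
    # 'Entry.Item' and 'Session'.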

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen'],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:a6d5cfd9ee46d1851cf6e40ea61cfc10',
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [u'Mike Wilmot'],
            },
        },
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
    _EXTRACT_ENTRY_ITEMS_FROM_RSS = False

    # Formats we know how to handle, ordered roughly from lowest to highest
    # quality; this order is used to sort the extracted formats below.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
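
    # _restore_bytes() below turns a human-readable size such as u'37.75 MB'
    # back into an approximate byte count: 37.75 * 1024 ** 2 = 39583744
    # (the figure is only an illustration of the expected input format).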
    def _restore_bytes(self, formatted_size):
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?  # File size part may be missing
        '''
        # Extract known formats
        formats = [{'url': x.group('url'),
                    'format_id': x.group('quality'),
                    'format_note': x.group('note'),
                    'format': '%s (%s)' % (x.group('quality'), x.group('note')),
                    'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
                    } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
        # Sort according to known formats list
        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
        return formats
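
    # _formats_from_rss_item() walks the Media RSS markup of a feed item,
    # roughly of the shape (illustrative, not taken from a real feed):
    #   <media:group>
    #     <media:content url="http://..." type="video/mp4" fileSize="123456789"/>
    #   </media:group>
    # where the media namespace is http://search.yahoo.com/mrss/.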
    def _formats_from_rss_item(self, item):

        def process_formats(elem):
            formats = []
            for media_content in elem.findall('./{http://search.yahoo.com/mrss/}content'):
                url = media_content.attrib['url']
                # Ignore unrelated media
                if url.endswith('.ism/manifest'):
                    continue
                format_note = media_content.attrib['type']
                filesize = int(media_content.attrib['fileSize'])
                formats.append({'url': url,
                                'format_note': format_note,
                                'format': '%s %s' % (format_note, format_bytes(filesize)),
                                'filesize': filesize,
                                })
            return formats

        formats = []

        for media_group in item.findall('./{http://search.yahoo.com/mrss/}group'):
            formats.extend(process_formats(media_group))

        # Sometimes there are no media:groups in the item, but there is media:content
        # right in the item (usually when there is only one media source)
        formats.extend(process_formats(item))

        # Sort by file size, worst quality first
        formats.sort(key=lambda fmt: fmt['filesize'])

        return formats

    def _extract_title(self, html):
        title = self._html_search_meta(u'title', html, u'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = u' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta(u'description', html, u'description')
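
    # The duration is published as an hh:mm:ss attribute, e.g.
    # data-video_duration="01:16:27" (an illustrative value), which converts to
    # 1 * 3600 + 16 * 60 + 27 = 4587 seconds.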
    def _extract_duration(self, html):
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
            return None

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {'_type': 'video',
                  'id': content_path,
                  'description': description,
                  'thumbnail': thumbnail,
                  'duration': duration,
                  'avg_rating': avg_rating,
                  'rating_count': rating_count,
                  'view_count': view_count,
                  'comment_count': comment_count,
                  }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result
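
    # Note that _extract_content() may return up to three entries for a single
    # page (slides, zip archive and the recording itself), all sharing the
    # common metadata collected above; the callers below attach their own
    # page-specific fields to each entry.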

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {'session_code': self._extract_session_code(html),
                        'session_day': self._extract_session_day(html),
                        'session_room': self._extract_session_room(html),
                        'session_speakers': self._extract_session_speakers(html),
                        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_content_rss(self, rss):
        '''
        Extracts entry items directly from the RSS feed.
        This approach is faster than extracting from the web pages
        one by one, but has some drawbacks.

        Pros:
         - no need to download additional pages
         - provides more media links

        Cons:
         - less metadata is provided
         - links to media files carry no data suitable for use as format_id
         - RSS does not contain links to presentation materials (slides, zip)
        '''
        entries = []
        for item in rss.findall('./channel/item'):
            url = item.find('./link').text
            video_id = url.split('/')[-1]
            formats = self._formats_from_rss_item(item)

            if len(formats) == 0:
                self._downloader.report_warning(u'The recording for session %s is not yet available' % video_id)
                continue

            title = item.find('./title').text
            description = item.find('./description').text

            thumbnail = item.find('./{http://search.yahoo.com/mrss/}thumbnail').text

            duration_e = item.find('./{http://www.itunes.com/dtds/podcast-1.0.dtd}duration')
            duration = duration_e.text if duration_e is not None else 0

            speakers_e = item.find('./{http://purl.org/dc/elements/1.1/}creator')
            speakers = speakers_e.text.split(', ') if speakers_e is not None and speakers_e.text else []

            entries.append({'_type': 'video',
                            'id': video_id,
                            'formats': formats,
                            'title': title,
                            'description': description,
                            'thumbnail': thumbnail,
                            'duration': duration,
                            'session_speakers': speakers,
                            })

        return self.playlist_result(entries)

    def _extract_list(self, content_path):
        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
        if self._EXTRACT_ENTRY_ITEMS_FROM_RSS:
            return self._extract_content_rss(rss)
        else:
            entries = [self.url_result(session_url.text, 'Channel9')
                       for session_url in rss.findall('./channel/item/link')]
            title_text = rss.find('./channel/title').text
            return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, u'Downloading web page')

        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')

        if page_type == 'List':          # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item':  # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':     # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)