[channel9] Initial implementation (#1885)
[youtube-dl] youtube_dl/extractor/channel9.py
# encoding: utf-8

import re

from .common import InfoExtractor
from ..utils import (
    format_bytes,
    ExtractorError,
)

class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined from the
    Search.PageType meta tag in the web page HTML rather than from the URL
    itself, as the URL alone does not always tell them apart.
    '''
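    # The page type is read from markup of roughly this shape (a sketch,
    # matching the regex used in _real_extract below):
    #   <meta name="Search.PageType" content="Session"/>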
    IE_DESC = u'Channel 9'
    IE_NAME = u'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                u'duration': 4576,
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:a6d5cfd9ee46d1851cf6e40ea61cfc10',
                u'duration': 1540,
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [ u'Mike Wilmot' ],
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
    _EXTRACT_ENTRY_ITEMS_FROM_RSS = False

    # Sorted by quality (worst to best)
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
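        '''Convert a human-formatted size (e.g. u'1.5 MB' -> 1572864 bytes)
        back into an approximate byte count; returns 0 on unparseable input.'''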
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
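        # The regex targets download links shaped roughly like this sketch
        # (not verbatim site markup):
        #   <a href="...">High Quality MP4</a>
        #   <span class="usage">(...)</span>
        #   <div class="popup rounded"><h3>File size</h3>746.5 MB</div>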
        # Extract known formats
        formats = [{'url': x.group('url'),
                    'format_id': x.group('quality'),
                    'format_note': x.group('note'),
                    'format': '%s (%s)' % (x.group('quality'), x.group('note')),
                    'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
                    } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]
        # Sort according to known formats list
        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
        return formats

    def _formats_from_rss_item(self, item):
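        '''Build the formats list from media:content elements of an RSS item.

        Expected media layout inside <item> (a sketch inferred from the
        queries below, not a verbatim feed):
            <media:group>
                <media:content url="..." type="video/mp4" fileSize="..."/>
            </media:group>
        '''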

        def process_formats(elem):
            formats = []
            for media_content in elem.findall('./{http://search.yahoo.com/mrss/}content'):
                url = media_content.attrib['url']
                # Ignore unrelated media
                if url.endswith('.ism/manifest'):
                    continue
                format_note = media_content.attrib['type']
                filesize = int(media_content.attrib['fileSize'])
                formats.append({'url': url,
                                'format_note': format_note,
                                'format': '%s %s' % (format_note, format_bytes(filesize)),
                                'filesize': filesize,
                                })
            return formats

        formats = []

        for media_group in item.findall('./{http://search.yahoo.com/mrss/}group'):
            formats.extend(process_formats(media_group))

        # Sometimes there are no media:group elements in an item, but there is
        # a media:content right in the item (usually when there is only one
        # media source)
        formats.extend(process_formats(item))

        # Sort by file size
        formats.sort(key=lambda fmt: fmt['filesize'])
        return formats

    def _extract_title(self, html):
        title = self._html_search_meta(u'title', html, u'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = u' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta(u'description', html, u'description')

    def _extract_duration(self, html):
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
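        '''Strip thousands separators from a count (u'2,396' -> 2396);
        returns None for None input.'''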
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
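        '''Extract the metadata and downloadable content (recording, slides,
        zip) common to Entry.Item and Session pages; returns None when the
        page has nothing to download.'''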
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning(u'No recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {'_type': 'video',
                  'id': content_path,
                  'description': description,
                  'thumbnail': thumbnail,
                  'duration': duration,
                  'avg_rating': avg_rating,
                  'rating_count': rating_count,
                  'view_count': view_count,
                  'comment_count': comment_count,
                  }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({ 'title': title + '-Slides', 'url': slides })
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({ 'title': title + '-Zip', 'url': zip_ })
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({ 'title': title, 'formats': formats })
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {'session_code': self._extract_session_code(html),
                        'session_day': self._extract_session_day(html),
                        'session_room': self._extract_session_room(html),
                        'session_speakers': self._extract_session_speakers(html),
                        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_content_rss(self, rss):
        '''
        Extracts links to entry items directly from the RSS feed.
        This approach is faster than extracting from the web pages
        one by one, but has some drawbacks.
        Pros:
         - no need to download additional pages
         - provides more media links
         - accurate file size
        Cons:
         - less metadata provided
         - links to media files carry no data suitable for use as format_id
         - RSS does not contain links to presentation materials (slides, zip)
        '''
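        # Relevant feed structure (a sketch inferred from the queries below,
        # not a verbatim Channel 9 feed):
        #   <channel>
        #     <item>
        #       <link>...</link> <title>...</title> <description>...</description>
        #       <media:thumbnail>...</media:thumbnail>
        #       <itunes:duration>...</itunes:duration>
        #       <dc:creator>Speaker One, Speaker Two</dc:creator>
        #     </item>
        #   </channel>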
        entries = []
        for item in rss.findall('./channel/item'):
            url = item.find('./link').text
            video_id = url.split('/')[-1]
            formats = self._formats_from_rss_item(item)

            if len(formats) == 0:
                self._downloader.report_warning(u'The recording for session %s is not yet available' % video_id)
                continue

            title = item.find('./title').text
            description = item.find('./description').text

            thumbnail = item.find('./{http://search.yahoo.com/mrss/}thumbnail').text

            duration_e = item.find('./{http://www.itunes.com/dtds/podcast-1.0.dtd}duration')
            duration = duration_e.text if duration_e is not None else 0

            speakers_e = item.find('./{http://purl.org/dc/elements/1.1/}creator')
            speakers = speakers_e.text.split(', ') if speakers_e is not None and speakers_e.text else []

            entries.append({'_type': 'video',
                            'id': video_id,
                            'formats': formats,
                            'title': title,
                            'description': description,
                            'thumbnail': thumbnail,
                            'duration': duration,
                            'session_speakers': speakers,
                            })
        return entries

    def _extract_list(self, content_path):
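        '''Process a List page: either expand entries straight from the RSS
        feed or hand each item URL back to this extractor one by one,
        depending on _EXTRACT_ENTRY_ITEMS_FROM_RSS.'''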
        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
        if self._EXTRACT_ENTRY_ITEMS_FROM_RSS:
            return self._extract_content_rss(rss)
        else:
            entries = [self.url_result(session_url.text, 'Channel9')
                       for session_url in rss.findall('./channel/item/link')]
            title_text = rss.find('./channel/title').text
            return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, u'Downloading web page')

        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')
        if page_type == 'List':         # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':    # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)