[lynda] minor changes
[youtube-dl] / youtube_dl / extractor / channel9.py
1 # encoding: utf-8
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import ExtractorError
7
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of the provided URL (video or playlist) is determined from the
    Search.PageType meta tag of the web page HTML rather than from the URL
    itself, because the URL alone does not always reveal it.
    '''
    IE_DESC = u'Channel 9'
    IE_NAME = u'channel9'
    # The content path is matched non-greedily and the pattern is anchored at
    # the end so that an optional trailing slash is NOT captured; otherwise it
    # would leak into the id and produce an RSS URL like '.../path//RSS'.
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)/?$'

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                u'duration': 4576,
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen'],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                u'duration': 1540,
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [u'Mike Wilmot'],
            },
        }
    ]

    # Template for the RSS feed of a list page; filled with the content path.
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality; the index in this list is used as the format
    # preference, and formats not listed here are ignored.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    # Size units in increasing order; the index is the power of 1024.
    _SIZE_UNITS = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB']

    def _restore_bytes(self, formatted_size):
        '''
        Convert a human readable size such as u'123.4 MB' to a byte count.

        Returns 0 when the text is empty, does not look like
        "<number> <unit>" or uses an unknown unit. Units are matched
        case-insensitively and interpreted as powers of 1024.
        '''
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        try:
            exponent = self._SIZE_UNITS.index(m.group('units').upper())
        except ValueError:
            return 0
        return int(float(m.group('size')) * (1024 ** exponent))

    def _formats_from_html(self, html):
        '''
        Build the list of format dicts for the download links found in html.

        Only links whose caption is one of _known_formats are kept. The
        resulting list is passed through self._sort_formats before being
        returned; it is empty when no known format is present.
        '''
        # NOTE: the global (?x) flag must be the very first thing in the
        # pattern; Python 3.11+ rejects global flags placed anywhere else.
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        formats = []
        for m in re.finditer(FORMAT_REGEX, html):
            quality = m.group('quality')
            if quality not in self._known_formats:
                continue
            note = m.group('note')
            formats.append({
                'url': m.group('url'),
                'format_id': quality,
                'format_note': note,
                'format': u'%s (%s)' % (quality, note),
                'filesize': self._restore_bytes(m.group('filesize')),  # File size is approximate
                'preference': self._known_formats.index(quality),
                'vcodec': 'none' if note == 'Audio only' else None,
            })

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        '''
        Return the page title.

        The "title" meta tag is preferred; if it yields None the Open Graph
        title is used instead, with a trailing u' (Channel 9)' removed.
        '''
        title = self._html_search_meta(u'title', html, u'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = u' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        '''
        Return the raw markup of the entry body, or, when the entry body is
        not found, whatever the "description" meta tag lookup returns.
        '''
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta(u'description', html, u'description')

    def _extract_duration(self, html):
        '''
        Return the duration in seconds parsed from the HH:MM:SS value of the
        data-video_duration attribute, or None when it is absent.
        '''
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        if not m:
            return None
        hours = int(m.group('hours'))
        minutes = int(m.group('minutes'))
        seconds = int(m.group('seconds'))
        return (hours * 60 * 60) + (minutes * 60) + seconds

    def _extract_slides(self, html):
        '''Return the URL of the "Slides" link, or None when there is none.'''
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        '''Return the URL of the "Zip" link, or None when there is none.'''
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        '''Return the average rating as a float, or 0 when it is not shown.'''
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        '''Return the number of ratings, or 0 when it is not shown.'''
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        '''Return the number of views, or 0 when it is not shown.'''
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        '''Return the number of comments, or 0 when it is not shown.'''
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
        '''
        Convert a count with comma thousands separators (e.g. '1,234') to
        an int. None is passed through unchanged.
        '''
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        '''
        Return the list of author names linked from the first "author" list
        item, or None when the page has no such item.
        '''
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        '''Return the text of the "code" list item, or None if missing.'''
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        '''Return the link text of the "day" list item, or None if missing.'''
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        '''Return the text of the "room" list item, or None if missing.'''
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        '''Return the link texts of all /Events/Speakers/ links (may be empty).'''
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        '''
        Collect everything downloadable from an item or session page.

        Returns a list with up to three info dicts sharing the same metadata:
        one for the slides, one for the zip archive and one for the recording
        (carrying 'formats'). When none of them is available a warning is
        reported and None is returned.
        '''
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
            return None

        # Extract meta
        title = self._extract_title(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': self._extract_description(html),
            'thumbnail': self._og_search_thumbnail(html),
            'duration': self._extract_duration(html),
            'avg_rating': self._extract_avg_rating(html),
            'rating_count': self._extract_rating_count(html),
            'view_count': self._extract_view_count(html),
            'comment_count': self._extract_comment_count(html),
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        '''
        Extract the contents of an 'Entry.Item' page and attach the authors
        to every resulting info dict. Returns None if nothing is downloadable.
        '''
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        '''
        Extract the contents of a 'Session' page and attach the session code,
        day, room and speakers to every resulting info dict. Returns None if
        nothing is downloadable.
        '''
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_list(self, content_path):
        '''
        Download the RSS feed for content_path and return a playlist result
        whose entries point at every item link of the feed and whose title is
        the channel title.
        '''
        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        '''
        Download the page at url and dispatch on its Search.PageType meta tag.

        Raises ExtractorError (expected) when the meta tag is missing or holds
        a page type other than 'List', 'Entry.Item' or 'Session'.
        '''
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, u'Downloading web page')

        # The dot is escaped so that only the literal "Search.PageType" name
        # matches; whitespace before the tag close is tolerated.
        page_type_m = re.search(r'<meta name="Search\.PageType" content="(?P<pagetype>[^"]+)"\s*/>', webpage)
        if page_type_m is None:
            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')
        if page_type == 'List':         # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item':  # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':    # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)