# youtube_dl/extractor/channel9.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError

class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    Whether a URL points to a video or to a playlist is determined from the
    WT.entryid meta tag in the page HTML rather than from the URL itself,
    since the URL alone does not always make this distinction possible.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Known formats, sorted from lowest to highest quality; the list index is
    # used as the format preference.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

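    # Convert a human-formatted size like '37.8 MB' back into a byte count;
    # the page only exposes sizes in this rounded form, so the result is
    # approximate.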
    def _restore_bytes(self, formatted_size):
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

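    # Scrape the download links block; each known quality is emitted as a
    # format dict, with its index in _known_formats used as the preference.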
    def _formats_from_html(self, html):
        # Note: the (?x) flag must come first in the pattern; placing it after
        # leading whitespace is an error on modern Python versions.
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

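    # Prefer the <meta name="title"> value; fall back to og:title and strip
    # the ' (Channel 9)' suffix it carries.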
    def _extract_title(self, html):
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

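    # Prefer the full entry body markup; fall back to the meta description.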
    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

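    # The page embeds the duration as "length": "HH:MM:SS"; parse it into
    # seconds.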
    def _extract_duration(self, html):
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return (int(m.group('hours')) * 3600 + int(m.group('minutes')) * 60 + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

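    # Counts are rendered with thousands separators (e.g. '1,234'); strip the
    # commas before converting.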
    def _fix_count(self, count):
        return int(str(count).replace(',', '')) if count is not None else None

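    # Author names are the 'Niners' profile links inside the author list item.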
    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

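    # Build up to three entries for one content page: optional slides, an
    # optional zip archive, and the recording itself (if any formats were
    # found), all sharing common metadata.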
    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning(
                'None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

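    # 'Entry' pages ('item'-like posts) additionally carry an author list.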
    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

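    # Event session pages additionally carry session code/day/room/speakers.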
    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return contents

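    # Playlist-like pages are handled via their RSS feed: each item link is
    # delegated back to this extractor as a separate entry.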
    def _extract_list(self, content_path):
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

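    # Dispatch on the WT.entryid meta tag: 'Entry' and 'Session' pages may
    # contain downloadable content, while 'Event' pages (and pages without
    # the tag) are treated as lists.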
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is None:
            # No page type meta tag found; assume a list page
            return self._extract_list(content_path)

        page_type = page_type_m.group('pagetype')
        if page_type == 'Entry':  # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':  # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        elif page_type == 'Event':
            return self._extract_list(content_path)
        else:
            raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)