[channel9] Add low quality formats and modernize
[youtube-dl] / youtube_dl / extractor / channel9.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7     ExtractorError,
8     parse_filesize,
9     qualities,
10 )
11
12
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        }
    ]

    # RSS feed used to enumerate entries of a list/event page.
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _formats_from_html(self, html):
        """Scrape downloadable format links from a content page's HTML.

        Returns a sorted list of youtube-dl format dicts; empty if the page
        offers no downloads.
        """
        FORMAT_REGEX = r'''
            (?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Quality ranking, worst to best, keyed by the link label on the site.
        quality = qualities((
            'MP3', 'MP4',
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
        # No need to materialize the finditer iterator into a list first.
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            # parse_filesize tolerates None when the size block is absent.
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html)]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        """Return the content title, preferring the <meta> title over og:title."""
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            # og:title carries a ' (Channel 9)' suffix that the meta title lacks.
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        """Return the raw entry-body HTML, falling back to the meta description."""
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        """Return the duration in seconds parsed from the "length" JSON field, or None."""
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        """Return the slides download URL, or None if not offered."""
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        """Return the source-code zip download URL, or None if not offered."""
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        """Return the average rating as a float, or 0 if not shown."""
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        """Return the number of ratings, or 0 if not shown."""
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        # _fix_count already returns an int; no second int() conversion needed.
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        """Return the view count, or 0 if not shown."""
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        """Return the comment count, or 0 if not shown."""
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
        """Convert a thousands-separated count string like '1,234' to an int (None passes through)."""
        # count is a regex group, hence already a str; the str() wrapper was redundant.
        return int(count.replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        """Return the list of author names, or None if the author block is absent."""
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        """Return the session code (e.g. 'KOS002'), or None."""
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        """Return the session day label (e.g. 'Day 1'), or None."""
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day').strip() if m is not None else None

    def _extract_session_room(self, html):
        """Return the session room name, or None."""
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        """Return the (possibly empty) list of session speaker names."""
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        """Build video info dicts for everything downloadable on the page.

        Returns a list with up to three entries (slides, zip, recording), or
        None (after a warning) when nothing is downloadable.
        """
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        # Metadata shared by every entry produced from this page.
        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        """Extract a single 'Entry' page; such pages must yield exactly one item."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result = contents[0]
        result['authors'] = self._extract_authors(html)

        return result

    def _extract_session(self, html, content_path):
        """Extract an event 'Session' page as a playlist of its downloadables."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        # Session metadata applies to every entry (recording, slides, zip).
        for content in contents:
            content.update(session_meta)

        return self.playlist_result(contents)

    def _extract_list(self, content_path):
        """Extract a list/event page as a playlist via its RSS feed."""
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        # Page type is encoded in the WT.entryid meta tag, not in the URL.
        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is not None:
            page_type = page_type_m.group('pagetype')
            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)

        else:  # Assuming list
            return self._extract_list(content_path)