[channel9] Use unicode literals
[youtube-dl] / youtube_dl / extractor / channel9.py
# encoding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of the provided URL (video or playlist) is determined from the
    Search.PageType meta tag in the page HTML rather than from the URL itself,
    since the URL alone does not always make the distinction.
    '''
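    # The tag being dispatched on (shape per the regex in _real_extract
    # below), e.g.:
    #   <meta name="Search.PageType" content="Session"/>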
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'file': 'Events_TechEd_Australia_2013_KOS002.mp4',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'file': 'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

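    # Convert a human-readable size like '11.2 MB' back into a (necessarily
    # approximate) byte count, e.g. '11.2 MB' -> 11744051; returns 0 when the
    # string cannot be parsed.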
    def _restore_bytes(self, formatted_size):
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

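    # Scrape the download links block. The markup targeted by FORMAT_REGEX
    # looks roughly like this (shape reconstructed from the regex itself):
    #   <a href="...">MP3</a> <span class="usage">(Audio only)</span>
    #   <div class="popup rounded"><h3>File size</h3>11.2 MB</div>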
    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

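    # Prefer the 'title' meta tag; fall back to og:title, stripping the
    # ' (Channel 9)' suffix the site appends there.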
    def _extract_title(self, html):
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

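    # data-video_duration carries HH:MM:SS, e.g. "01:16:16" -> 4576 seconds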
    def _extract_duration(self, html):
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

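    # Strip thousands separators, e.g. '1,234' -> 1234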
    def _fix_count(self, count):
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

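    # Build one 'video' result per downloadable asset found on the page:
    # slides, source zip and the recording itself share the common metadata.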
    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return contents

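    # List pages are handled via their RSS feed: every item links back to an
    # entry/session page that this same extractor then processes.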
    def _extract_list(self, content_path):
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')
        if page_type == 'List':  # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item':  # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':  # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)
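

# A minimal smoke test, not part of the extractor itself: feed one of the
# _TESTS URLs above through YoutubeDL (run with
# `python -m youtube_dl.extractor.channel9`; assumes youtube-dl is importable
# and the URL is still live).
if __name__ == '__main__':
    from youtube_dl import YoutubeDL

    ydl = YoutubeDL({'quiet': True})
    info = ydl.extract_info(
        'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        download=False)
    # A session page may yield several entries (slides, zip, recording)
    for entry in info.get('entries', [info]):
        print(entry.get('title'))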