# youtube-dl: youtube_dl/extractor/channel9.py (PEP 8 applied)
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import ExtractorError
7
8
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality (ascending); index in this list is used as preference.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
        """Convert a human-readable size string (e.g. '12.5 MB') back to an
        approximate byte count.

        Returns 0 when the input is empty, unparsable, or uses an unknown unit.
        """
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        """Scrape the download-links markup and build a sorted formats list.

        Only qualities listed in _known_formats are kept, so the preference
        lookup below can never raise ValueError.
        """
        FORMAT_REGEX = r'''
            (?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats; iterate the match objects directly instead of
        # materializing them into a throwaway list first.
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        """Return the content title from the <meta> tag, falling back to the
        OpenGraph title with the ' (Channel 9)' suffix stripped."""
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        """Return the entry-body description, or the meta description if the
        entry-content markup is absent."""
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        """Return the duration in seconds parsed from the JSON "length" field
        (HH:MM:SS), or None if not found."""
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        """Return the slides download URL, or None if not present."""
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        """Return the zip download URL, or None if not present."""
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        """Return the average rating as a float, or 0 if not present."""
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        """Return the number of ratings, or 0 if not present."""
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        # _fix_count already returns an int; no extra int() wrapping needed.
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        """Return the view count, or 0 if not present."""
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        """Return the comment count, or 0 if not present."""
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
        """Parse a thousands-separated count like '1,234' into an int;
        pass None through unchanged."""
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        """Return the list of author names from the author block, or None if
        the block is missing."""
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        """Return the session code (e.g. 'KOS002'), or None if not present."""
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        """Return the session day label (e.g. 'Day 1'), or None."""
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        """Return the session room, or None if not present."""
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        """Return the (possibly empty) list of session speaker names."""
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        """Build the list of downloadable entries (slides, zip, recording)
        sharing common metadata, or None (with a warning) when the page offers
        nothing to download."""
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        """Extract an 'Entry' page: common content plus author metadata."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        """Extract a 'Session' page: common content plus session metadata."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_list(self, content_path):
        """Extract an event/list page as a playlist built from its RSS feed."""
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        # Dispatch on the page type embedded in the WT.entryid meta tag; pages
        # without it are assumed to be lists (see class docstring).
        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is not None:
            page_type = page_type_m.group('pagetype')
            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)

        else:  # Assuming list
            return self._extract_list(content_path)