from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    parse_filesize,
    qualities,
)


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of the provided URL (video or playlist) is determined from the
    WT.entryid meta tag in the page HTML rather than from the URL itself,
    since it is not always possible to tell from the URL alone.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
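    # <contentpath> captures everything after the host, e.g.
    # 'Events/TechEd/Australia/2013/KOS002' in the first test below.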

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': r're:http://.*\.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': r're:http://.*\.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
        {
            # low quality mp4 is best
            'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
            'info_dict': {
                'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
                'ext': 'mp4',
                'title': 'Ranges for the Standard Library',
                'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
                'duration': 5646,
                'thumbnail': r're:http://.*\.jpg',
            },
            'params': {
                'skip_download': True,
            },
        },
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
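        # Illustrative only: a hypothetical snippet of the markup the
        # pattern above is meant to match (reconstructed from the regex,
        # not copied from a real page):
        #
        #   <a href="http://example.com/high.mp4">High Quality MP4</a>
        #   <span class="usage">(iPad, PC)</span>
        #   <div class="popup rounded">
        #       <h3>File size</h3>
        #       324.7 MB
        #   </div>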
        quality = qualities((
            'MP3', 'MP4',
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
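        # qualities() returns a callable mapping each name above to its
        # position in the tuple (-1 if unknown), so later entries sort as
        # higher quality.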
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html)]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
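        # Worked example (matches the first test above):
        # "length": "01:16:16" -> 1 * 3600 + 16 * 60 + 16 = 4576 seconds.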
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
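        # Strips thousands separators, e.g. '3,918' -> 3918; None passes
        # through unchanged.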
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day').strip() if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
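        # Returns a list of info dicts, one per downloadable artifact
        # (slides, zip archive and/or the recording itself), all sharing
        # the metadata collected in `common` below; returns None when
        # there is nothing to download.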
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result = contents[0]
        result['authors'] = self._extract_authors(html)

        return result

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return self.playlist_result(contents)

    def _extract_list(self, content_path):
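        # Sketch of the RSS layout the XPath queries below assume
        # (hypothetical, not verbatim from the site):
        #
        #   <rss><channel>
        #     <title>Event title</title>
        #     <item><link>http://channel9.msdn.com/.../Session1</link></item>
        #   </channel></rss>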
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
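        # e.g. <meta name="WT.entryid" content="Entry:..."/>; only the part
        # before the first colon is captured as the page type.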
        if page_type_m is not None:
            page_type = page_type_m.group('pagetype')
            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
        else:  # Assuming list
            return self._extract_list(content_path)
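
# A minimal usage sketch (illustrative only), going through youtube-dl's
# public API rather than instantiating the extractor directly:
#
#   import youtube_dl
#   with youtube_dl.YoutubeDL() as ydl:
#       info = ydl.extract_info(
#           'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
#           download=False)
#       print(info.get('title'))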