[ign] improve extraction and extract uploader_id
[youtube-dl] / youtube_dl / extractor / channel9.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import ExtractorError
7
8
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    the WT.entryid meta tag of the web page HTML rather than the URL itself,
    as it is not always possible to tell the type from the URL.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    # The content path is matched lazily and the pattern is anchored at the
    # end, so that an optional trailing slash is really left out of the
    # captured path (a greedy ``.+`` would swallow it).
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)/?$'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        }
    ]

    # Template of the RSS feed URL; filled in with the content path.
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality; the position of a label in this list is used as the
    # 'preference' value of the corresponding format.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
57
58     def _restore_bytes(self, formatted_size):
59         if not formatted_size:
60             return 0
61         m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
62         if not m:
63             return 0
64         units = m.group('units')
65         try:
66             exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
67         except ValueError:
68             return 0
69         size = float(m.group('size'))
70         return int(size * (1024 ** exponent))
71
72     def _formats_from_html(self, html):
73         FORMAT_REGEX = r'''
74             (?x)
75             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
76             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
77             (?:<div\s+class="popup\s+rounded">\s*
78             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
79             </div>)?                                                # File size part may be missing
80         '''
81         # Extract known formats
82         formats = [{
83             'url': x.group('url'),
84             'format_id': x.group('quality'),
85             'format_note': x.group('note'),
86             'format': '%s (%s)' % (x.group('quality'), x.group('note')),
87             'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
88             'preference': self._known_formats.index(x.group('quality')),
89             'vcodec': 'none' if x.group('note') == 'Audio only' else None,
90         } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
91
92         self._sort_formats(formats)
93
94         return formats
95
96     def _extract_title(self, html):
97         title = self._html_search_meta('title', html, 'title')
98         if title is None:
99             title = self._og_search_title(html)
100             TITLE_SUFFIX = ' (Channel 9)'
101             if title is not None and title.endswith(TITLE_SUFFIX):
102                 title = title[:-len(TITLE_SUFFIX)]
103         return title
104
105     def _extract_description(self, html):
106         DESCRIPTION_REGEX = r'''(?sx)
107             <div\s+class="entry-content">\s*
108             <div\s+id="entry-body">\s*
109             (?P<description>.+?)\s*
110             </div>\s*
111             </div>
112         '''
113         m = re.search(DESCRIPTION_REGEX, html)
114         if m is not None:
115             return m.group('description')
116         return self._html_search_meta('description', html, 'description')
117
118     def _extract_duration(self, html):
119         m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
120         return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
121
122     def _extract_slides(self, html):
123         m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
124         return m.group('slidesurl') if m is not None else None
125
126     def _extract_zip(self, html):
127         m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
128         return m.group('zipurl') if m is not None else None
129
130     def _extract_avg_rating(self, html):
131         m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
132         return float(m.group('avgrating')) if m is not None else 0
133
134     def _extract_rating_count(self, html):
135         m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
136         return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
137
138     def _extract_view_count(self, html):
139         m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
140         return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
141
142     def _extract_comment_count(self, html):
143         m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
144         return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
145
146     def _fix_count(self, count):
147         return int(str(count).replace(',', '')) if count is not None else None
148
149     def _extract_authors(self, html):
150         m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
151         if m is None:
152             return None
153         return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
154
155     def _extract_session_code(self, html):
156         m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
157         return m.group('code') if m is not None else None
158
159     def _extract_session_day(self, html):
160         m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
161         return m.group('day') if m is not None else None
162
163     def _extract_session_room(self, html):
164         m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
165         return m.group('room') if m is not None else None
166
167     def _extract_session_speakers(self, html):
168         return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
169
170     def _extract_content(self, html, content_path):
171         # Look for downloadable content
172         formats = self._formats_from_html(html)
173         slides = self._extract_slides(html)
174         zip_ = self._extract_zip(html)
175
176         # Nothing to download
177         if len(formats) == 0 and slides is None and zip_ is None:
178             self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
179             return
180
181         # Extract meta
182         title = self._extract_title(html)
183         description = self._extract_description(html)
184         thumbnail = self._og_search_thumbnail(html)
185         duration = self._extract_duration(html)
186         avg_rating = self._extract_avg_rating(html)
187         rating_count = self._extract_rating_count(html)
188         view_count = self._extract_view_count(html)
189         comment_count = self._extract_comment_count(html)
190
191         common = {
192             '_type': 'video',
193             'id': content_path,
194             'description': description,
195             'thumbnail': thumbnail,
196             'duration': duration,
197             'avg_rating': avg_rating,
198             'rating_count': rating_count,
199             'view_count': view_count,
200             'comment_count': comment_count,
201         }
202
203         result = []
204
205         if slides is not None:
206             d = common.copy()
207             d.update({'title': title + '-Slides', 'url': slides})
208             result.append(d)
209
210         if zip_ is not None:
211             d = common.copy()
212             d.update({'title': title + '-Zip', 'url': zip_})
213             result.append(d)
214
215         if len(formats) > 0:
216             d = common.copy()
217             d.update({'title': title, 'formats': formats})
218             result.append(d)
219
220         return result
221
222     def _extract_entry_item(self, html, content_path):
223         contents = self._extract_content(html, content_path)
224         if contents is None:
225             return contents
226
227         authors = self._extract_authors(html)
228
229         for content in contents:
230             content['authors'] = authors
231
232         return contents
233
234     def _extract_session(self, html, content_path):
235         contents = self._extract_content(html, content_path)
236         if contents is None:
237             return contents
238
239         session_meta = {
240             'session_code': self._extract_session_code(html),
241             'session_day': self._extract_session_day(html),
242             'session_room': self._extract_session_room(html),
243             'session_speakers': self._extract_session_speakers(html),
244         }
245
246         for content in contents:
247             content.update(session_meta)
248
249         return self.playlist_result(contents)
250
251     def _extract_list(self, content_path):
252         rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
253         entries = [self.url_result(session_url.text, 'Channel9')
254                    for session_url in rss.findall('./channel/item/link')]
255         title_text = rss.find('./channel/title').text
256         return self.playlist_result(entries, content_path, title_text)
257
258     def _real_extract(self, url):
259         mobj = re.match(self._VALID_URL, url)
260         content_path = mobj.group('contentpath')
261
262         webpage = self._download_webpage(url, content_path, 'Downloading web page')
263
264         page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
265         if page_type_m is not None:
266             page_type = page_type_m.group('pagetype')
267             if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
268                 return self._extract_entry_item(webpage, content_path)
269             elif page_type == 'Session':  # Event session page, may contain downloadable content
270                 return self._extract_session(webpage, content_path)
271             elif page_type == 'Event':
272                 return self._extract_list(content_path)
273             else:
274                 raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
275
276         else:  # Assuming list
277             return self._extract_list(content_path)