Fix "invalid escape sequences" error on Python 3.6
[youtube-dl] youtube_dl/extractor/channel9.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    parse_filesize,
    qualities,
)


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of the provided URL (video or playlist) is determined from the
    WT.entryid meta tag in the page HTML rather than from the URL itself,
    since the URL alone does not always make the distinction possible.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'

    _TESTS = [{
        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
        'info_dict': {
            'id': 'Events/TechEd/Australia/2013/KOS002',
            'ext': 'mp4',
            'title': 'Developer Kick-Off Session: Stuff We Love',
            'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
            'duration': 4576,
            'thumbnail': r're:http://.*\.jpg',
            'session_code': 'KOS002',
            'session_day': 'Day 1',
            'session_room': 'Arena 1A',
            'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
                                 'Mads Kristensen'],
        },
    }, {
        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
        'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
        'info_dict': {
            'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'ext': 'mp4',
            'title': 'Self-service BI with Power BI - nuclear testing',
            'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
            'duration': 1540,
            'thumbnail': r're:http://.*\.jpg',
            'authors': ['Mike Wilmot'],
        },
    }, {
        # low quality mp4 is best
        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
        'info_dict': {
            'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
            'ext': 'mp4',
            'title': 'Ranges for the Standard Library',
            'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
            'duration': 5646,
            'thumbnail': r're:http://.*\.jpg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
        'info_dict': {
            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
            'title': 'Channel 9',
        },
        'playlist_count': 2,
    }, {
        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
        'only_matching': True,
    }, {
        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
        'only_matching': True,
    }]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

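    # Scrape download links straight from the page HTML: each format is an
    # <a> tag followed by a usage note and an optional file-size popup.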
    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # qualities() yields a preference function: ids listed later rank higher
        quality = qualities((
            'MP3', 'MP4',
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html)]

        self._sort_formats(formats)

        return formats

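    # Prefer the <meta name="title"> tag; fall back to og:title and strip
    # the ' (Channel 9)' suffix it carries.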
    def _extract_title(self, html):
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

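    # The page embeds the duration as a "length" value in HH:MM:SS form;
    # convert it to seconds.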
    def _extract_duration(self, html):
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

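    # Slides and Zip archives are exposed as plain anchor links on the page.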
    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
        # Counts are rendered with thousands separators, e.g. '1,234'
        return int(str(count).replace(',', '')) if count is not None else None

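    # Authors are linked from their Niners profiles inside the
    # <li class="author"> element.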
    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day').strip() if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

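    # Build the list of entries for a content page: an optional slides link,
    # an optional zip link and the recording itself (when formats are found).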
    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

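    # An 'Entry' page carries at most one downloadable item plus its authors.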
    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result = contents[0]
        result['authors'] = self._extract_authors(html)

        return result

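    # A 'Session' page may yield several entries (slides, zip, recording),
    # all sharing the same session metadata.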
    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return self.playlist_result(contents)

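    # Playlists are resolved through the RSS feed: every <item> link becomes
    # a separate Channel9 entry.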
    def _extract_list(self, video_id, rss_url=None):
        if not rss_url:
            rss_url = self._RSS_URL % video_id
        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, video_id, title_text)

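    # Dispatch on the WT.entryid meta tag: 'Entry' and 'Session' pages may
    # carry downloadable content, 'Event' pages (and pages without the tag)
    # are treated as lists.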
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')
        rss = mobj.group('rss')

        if rss:
            return self._extract_list(content_path, url)

        webpage = self._download_webpage(
            url, content_path, 'Downloading web page')

        page_type = self._search_regex(
            r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
            webpage, 'page type', default=None, group='pagetype')
        if page_type:
            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
        else:  # Assuming list
            return self._extract_list(content_path)