[channel9] Extract more formats
[youtube-dl] youtube_dl/extractor/channel9.py
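Collect formats from both the on-page format <select> and the OData API's per-quality URL fields, deduplicating by URL and ranking everything with a shared quality table; original source files ('_Source.' URLs) rank above all transcoded variants.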
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    unescapeHTML,
    int_or_none,
    parse_iso8601,
    clean_html,
    qualities,
)


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    Whether a URL points at a video or a playlist is determined from the
    page HTML (presence of embedded episode data) rather than from the URL
    itself, since the URL alone does not always make this clear.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
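    # <contentpath> identifies the content; an optional trailing /RSS
    # switches extraction to playlist mode.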
    _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'

    _TESTS = [{
        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        'md5': '32083d4eaf1946db6d454313f44510ca',
        'info_dict': {
            'id': '6c413323-383a-49dc-88f9-a22800cab024',
            'ext': 'wmv',
            'title': 'Developer Kick-Off Session: Stuff We Love',
            'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
            'duration': 4576,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1377717420,
            'upload_date': '20130828',
            'session_code': 'KOS002',
            'session_room': 'Arena 1A',
            'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
        },
    }, {
        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
        'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
        'info_dict': {
            'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
            'ext': 'wmv',
            'title': 'Self-service BI with Power BI - nuclear testing',
            'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
            'duration': 1540,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1386381991,
            'upload_date': '20131207',
            'authors': ['Mike Wilmot'],
        },
    }, {
        # low quality mp4 is best
        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
        'info_dict': {
            'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
            'ext': 'mp4',
            'title': 'Ranges for the Standard Library',
            'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
            'duration': 5646,
            'thumbnail': r're:https?://.*\.jpg',
            'upload_date': '20150930',
            'timestamp': 1443640735,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
        'info_dict': {
            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
            'title': 'Channel 9',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
        'only_matching': True,
    }, {
        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
        'only_matching': True,
    }]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _extract_list(self, video_id, rss_url=None):
        if not rss_url:
            rss_url = self._RSS_URL % video_id
        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
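        # Each <item><link> in the feed points at an individual video page;
        # delegate every one of them back to this extractor.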
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, video_id, title_text)

    def _real_extract(self, url):
        content_path, rss = re.match(self._VALID_URL, url).groups()

        if rss:
            return self._extract_list(content_path, url)

        webpage = self._download_webpage(
            url, content_path, 'Downloading web page')

        episode_data = self._search_regex(
            r"data-episode='([^']+)'", webpage, 'episode data', default=None)
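        # The player embeds its metadata as HTML-escaped JSON in a
        # data-episode attribute; without it we fall back to RSS extraction.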
        if episode_data:
            episode_data = self._parse_json(unescapeHTML(
                episode_data), content_path)
            content_id = episode_data['contentId']
            is_session = '/Sessions(' in episode_data['api']
            content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
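            # OData $expand inlines the related Speakers/Authors entities
            # into the response, saving a second request.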
            if is_session:
                content_url += '?$expand=Speakers'
            else:
                content_url += '?$expand=Authors'
            content_data = self._download_json(content_url, content_id)
            title = content_data['Title']

            QUALITIES = (
                'mp3',
                'wmv', 'mp4',
                'wmv-low', 'mp4-low',
                'wmv-mid', 'mp4-mid',
                'wmv-high', 'mp4-high',
            )
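            # qualities() ranks later entries higher; quality() below ranks
            # original source files ('_Source.' URLs) above them all.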

            quality_key = qualities(QUALITIES)

            def quality(quality_id, format_url):
                return (len(QUALITIES) if '_Source.' in format_url
                        else quality_key(quality_id))

            formats = []
            urls = set()
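            # Map the human-readable labels used by the page's format
            # <select> control to the canonical quality ids above.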

            SITE_QUALITIES = {
                'MP3': 'mp3',
                'MP4': 'mp4',
                'Low Quality WMV': 'wmv-low',
                'Low Quality MP4': 'mp4-low',
                'Mid Quality WMV': 'wmv-mid',
                'Mid Quality MP4': 'mp4-mid',
                'High Quality WMV': 'wmv-high',
                'High Quality MP4': 'mp4-high',
            }

            formats_select = self._search_regex(
                r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage,
                'formats select', default=None)
            if formats_select:
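                # Each <option> carries the direct download URL in its value
                # attribute and the quality label as its text; dedupe by URL.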
                for mobj in re.finditer(
                        r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<',
                        formats_select):
                    format_url = mobj.group('url')
                    if format_url in urls:
                        continue
                    urls.add(format_url)
                    format_id = mobj.group('format')
                    quality_id = SITE_QUALITIES.get(format_id, format_id)
                    formats.append({
                        'url': format_url,
                        'format_id': quality_id,
                        'quality': quality(quality_id, format_url),
                        'vcodec': 'none' if quality_id == 'mp3' else None,
                    })
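            # The OData response may also expose direct URLs in dedicated
            # per-quality fields; add any not already collected from the page.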

            API_QUALITIES = {
                'VideoMP4Low': 'mp4-low',
                'VideoWMV': 'wmv-mid',
                'VideoMP4Medium': 'mp4-mid',
                'VideoMP4High': 'mp4-high',
                'VideoWMVHQ': 'wmv-high',  # must match a QUALITIES entry to sort correctly
            }

            for format_id, q in API_QUALITIES.items():
                q_url = content_data.get(format_id)
                if not q_url or q_url in urls:
                    continue
                urls.add(q_url)
                formats.append({
                    'url': q_url,
                    'format_id': q,
                    'quality': quality(q, q_url),
                })

            self._sort_formats(formats)

            slides = content_data.get('Slides')
            zip_file = content_data.get('ZipFile')
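            # Slides and zip archives are downloadable content in their own
            # right, so only fail when nothing at all is available.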

            if not formats and not slides and not zip_file:
                raise ExtractorError(
                    'None of the recording, slides or zip file is available for %s' % content_path)

            subtitles = {}
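            # Captions come as a list of objects carrying Url and Language;
            # group them by language code.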
            for caption in content_data.get('Captions', []):
                caption_url = caption.get('Url')
                if not caption_url:
                    continue
                subtitles.setdefault(caption.get('Language', 'en'), []).append({
                    'url': caption_url,
                    'ext': 'vtt',
                })

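            # Metadata shared by every entry (recording, slides or zip).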
            common = {
                'id': content_id,
                'title': title,
                'description': clean_html(content_data.get('Description') or content_data.get('Body')),
                'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
                'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
                'timestamp': parse_iso8601(content_data.get('PublishedDate')),
                'avg_rating': int_or_none(content_data.get('Rating')),
                'rating_count': int_or_none(content_data.get('RatingCount')),
                'view_count': int_or_none(content_data.get('Views')),
                'comment_count': int_or_none(content_data.get('CommentCount')),
                'subtitles': subtitles,
            }
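            # Sessions expose Code, Room and Speakers; other content types
            # carry Authors instead.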
            if is_session:
                speakers = []
                for s in content_data.get('Speakers', []):
                    speaker_name = s.get('FullName')
                    if not speaker_name:
                        continue
                    speakers.append(speaker_name)

                common.update({
                    'session_code': content_data.get('Code'),
                    'session_room': content_data.get('Room'),
                    'session_speakers': speakers,
                })
            else:
                authors = []
                for a in content_data.get('Authors', []):
                    author_name = a.get('DisplayName')
                    if not author_name:
                        continue
                    authors.append(author_name)
                common['authors'] = authors

            contents = []
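            # The recording, slides and zip archive each become a separate
            # entry of the resulting playlist.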

            if slides:
                d = common.copy()
                d.update({'title': title + '-Slides', 'url': slides})
                contents.append(d)

            if zip_file:
                d = common.copy()
                d.update({'title': title + '-Zip', 'url': zip_file})
                contents.append(d)

            if formats:
                d = common.copy()
                d.update({'title': title, 'formats': formats})
                contents.append(d)
            return self.playlist_result(contents)
        else:
            return self._extract_list(content_path)