[channel9] fix extraction (closes #11323)
[youtube-dl] / youtube_dl / extractor / channel9.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7     ExtractorError,
8     unescapeHTML,
9     int_or_none,
10     parse_iso8601,
11     clean_html,
12 )
13
14
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'

    _TESTS = [{
        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        'md5': '32083d4eaf1946db6d454313f44510ca',
        'info_dict': {
            'id': '6c413323-383a-49dc-88f9-a22800cab024',
            'ext': 'wmv',
            'title': 'Developer Kick-Off Session: Stuff We Love',
            'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
            'duration': 4576,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1377717420,
            'upload_date': '20130828',
            'session_code': 'KOS002',
            'session_room': 'Arena 1A',
            'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
        },
    }, {
        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
        'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
        'info_dict': {
            'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
            'ext': 'wmv',
            'title': 'Self-service BI with Power BI - nuclear testing',
            'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
            'duration': 1540,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1386381991,
            'upload_date': '20131207',
            'authors': ['Mike Wilmot'],
        },
    }, {
        # low quality mp4 is best
        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
        'info_dict': {
            'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
            'ext': 'mp4',
            'title': 'Ranges for the Standard Library',
            'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
            'duration': 5646,
            'thumbnail': r're:https?://.*\.jpg',
            'upload_date': '20150930',
            'timestamp': 1443640735,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
        'info_dict': {
            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
            'title': 'Channel 9',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
        'only_matching': True,
    }, {
        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
        'only_matching': True,
    }]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _extract_list(self, video_id, rss_url=None):
        """Download an RSS feed and yield each item link as a playlist entry.

        video_id -- content path used as the playlist id (and to build the
                    RSS URL when rss_url is not given)
        rss_url  -- explicit feed URL; defaults to _RSS_URL % video_id
        Returns a playlist info dict.
        """
        if not rss_url:
            rss_url = self._RSS_URL % video_id
        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
        entries = [
            self.url_result(session_url.text, 'Channel9')
            for session_url in rss.findall('./channel/item/link')]
        # A malformed feed may lack <channel><title>; fall back to no title
        # instead of raising AttributeError on None.
        title_node = rss.find('./channel/title')
        title_text = title_node.text if title_node is not None else None
        return self.playlist_result(entries, video_id, title_text)

    def _real_extract(self, url):
        content_path, rss = re.match(self._VALID_URL, url).groups()

        # /RSS URLs are always playlists; no need to inspect the page.
        if rss:
            return self._extract_list(content_path, url)

        webpage = self._download_webpage(
            url, content_path, 'Downloading web page')

        # Video pages embed their OData metadata in a data-episode attribute;
        # its absence means the URL points at a listing page instead.
        episode_data = self._search_regex(
            r"data-episode='([^']+)'", webpage, 'episode data', default=None)
        if episode_data:
            episode_data = self._parse_json(unescapeHTML(
                episode_data), content_path)
            content_id = episode_data['contentId']
            is_session = '/Sessions(' in episode_data['api']
            content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
            # Sessions carry speaker records, other content carries authors;
            # ask the OData endpoint to inline the relevant one.
            if is_session:
                content_url += '?$expand=Speakers'
            else:
                content_url += '?$expand=Authors'
            content_data = self._download_json(content_url, content_id)
            title = content_data['Title']

            # Worst-first order: youtube-dl treats the last format in the
            # list as the preferred one.
            formats = []
            qualities = [
                'VideoMP4Low',
                'VideoWMV',
                'VideoMP4Medium',
                'VideoMP4High',
                'VideoWMVHQ',
            ]
            for q in qualities:
                q_url = content_data.get(q)
                if not q_url:
                    continue
                formats.append({
                    'format_id': q,
                    'url': q_url,
                })
            slides = content_data.get('Slides')
            zip_file = content_data.get('ZipFile')

            if not formats and not slides and not zip_file:
                raise ExtractorError(
                    'None of recording, slides or zip are available for %s' % content_path)

            # `or []` rather than a .get default: the API may return an
            # explicit JSON null for these keys, which .get would pass through.
            subtitles = {}
            for caption in content_data.get('Captions') or []:
                caption_url = caption.get('Url')
                if not caption_url:
                    continue
                subtitles.setdefault(caption.get('Language', 'en'), []).append({
                    'url': caption_url,
                    'ext': 'vtt',
                })

            # Metadata shared by every downloadable artifact of this episode.
            common = {
                'id': content_id,
                'title': title,
                'description': clean_html(content_data.get('Description') or content_data.get('Body')),
                'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
                'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
                'timestamp': parse_iso8601(content_data.get('PublishedDate')),
                'avg_rating': int_or_none(content_data.get('Rating')),
                'rating_count': int_or_none(content_data.get('RatingCount')),
                'view_count': int_or_none(content_data.get('Views')),
                'comment_count': int_or_none(content_data.get('CommentCount')),
                'subtitles': subtitles,
            }
            if is_session:
                common.update({
                    'session_code': content_data.get('Code'),
                    'session_room': content_data.get('Room'),
                    'session_speakers': [
                        s['FullName'] for s in content_data.get('Speakers') or []
                        if s.get('FullName')],
                })
            else:
                common['authors'] = [
                    a['DisplayName'] for a in content_data.get('Authors') or []
                    if a.get('DisplayName')]

            # An episode can yield up to three entries: slides, a zip
            # archive, and the recording itself.
            contents = []

            if slides:
                d = common.copy()
                d.update({'title': title + '-Slides', 'url': slides})
                contents.append(d)

            if zip_file:
                d = common.copy()
                d.update({'title': title + '-Zip', 'url': zip_file})
                contents.append(d)

            if formats:
                d = common.copy()
                d.update({'title': title, 'formats': formats})
                contents.append(d)
            return self.playlist_result(contents)
        else:
            return self._extract_list(content_path)