# youtube_dl/extractor/channel9.py
# encoding: utf-8

import re

from .common import InfoExtractor
from ..utils import ExtractorError

class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    Whether the provided URL points to a video or a playlist is determined
    from the Search.PageType meta tag in the page HTML rather than from the
    URL itself, as the URL alone is not always sufficient to tell.
    '''
    IE_DESC = u'Channel 9'
    IE_NAME = u'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                u'duration': 4576,
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen'],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                u'duration': 1540,
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [u'Mike Wilmot'],
            },
        },
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality, worst first
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

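    # Convert a human-readable size such as '11.2 MB' back into a byte
    # count. The result is approximate, since the page only shows a rounded
    # figure. Returns 0 when the string cannot be parsed.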
    def _restore_bytes(self, formatted_size):
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

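    # Scrape the download links from the page and keep only the formats
    # listed in _known_formats, sorted from worst to best quality.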
    def _formats_from_html(self, html):
        # The inline (?x) flag must come first in the pattern for verbose
        # mode to apply to the whole expression
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats
        formats = [{'url': x.group('url'),
                    'format_id': x.group('quality'),
                    'format_note': x.group('note'),
                    'format': '%s (%s)' % (x.group('quality'), x.group('note')),
                    'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
                    } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]
        # Sort according to known formats list
        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
        return formats

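    # Prefer the title from the <meta name="title"> tag; fall back to the
    # Open Graph title with the ' (Channel 9)' suffix removed.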
    def _extract_title(self, html):
        title = self._html_search_meta(u'title', html, u'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = u' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

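    # Take the raw entry body from the page when present, otherwise fall
    # back to the description meta tag.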
    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta(u'description', html, u'description')

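    # The duration is exposed as an HH:MM:SS attribute; convert it to
    # seconds, or return None when the attribute is missing.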
    def _extract_duration(self, html):
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

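    # Slides and Zip archives are offered as plain download links on the
    # page, next to the recording.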
    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

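    # The helpers below scrape rating and activity counters from the page
    # sidebar; each returns 0 when the corresponding markup is absent.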
    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

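    # Counter values may contain thousands separators (e.g. '1,234');
    # strip the commas before converting to int.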
    def _fix_count(self, count):
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

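    # Core extraction: collect everything downloadable on the page
    # (recording formats, slides, zip archive) together with the common
    # metadata, and return a list of info dicts, or None when there is
    # nothing to download.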
    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {'_type': 'video',
                  'id': content_path,
                  'description': description,
                  'thumbnail': thumbnail,
                  'duration': duration,
                  'avg_rating': avg_rating,
                  'rating_count': rating_count,
                  'view_count': view_count,
                  'comment_count': comment_count,
                  }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

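    # An 'Entry.Item' page is a single post; it adds an author list on top
    # of the common content.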
    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

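    # A 'Session' page belongs to an event; enrich the common content with
    # session code, day, room and speakers.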
    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {'session_code': self._extract_session_code(html),
                        'session_day': self._extract_session_day(html),
                        'session_room': self._extract_session_room(html),
                        'session_speakers': self._extract_session_speakers(html),
                        }

        for content in contents:
            content.update(session_meta)

        return contents

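    # A 'List' page has a matching RSS feed; build a playlist from the
    # item links and let each entry be handled by this extractor again.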
    def _extract_list(self, content_path):
        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

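    # Dispatch on the Search.PageType meta tag, as described in the class
    # docstring.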
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, u'Downloading web page')

        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')
        if page_type == 'List':         # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':    # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)