[channel9] Use unicode literals
[youtube-dl] / youtube_dl / extractor / channel9.py
# encoding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of the provided URL (video or playlist) is determined from the
    Search.PageType meta tag in the page HTML rather than from the URL itself,
    since the URL alone does not always make the distinction.
    '''
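    # The tag being dispatched on (shape per the regex in _real_extract
    # below), e.g.:
    #   <meta name="Search.PageType" content="Session"/>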
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'file': 'Events_TechEd_Australia_2013_KOS002.mp4',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'file': 'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

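    # Convert a human-readable size like '11.2 MB' back into a (necessarily
    # approximate) byte count, e.g. '11.2 MB' -> 11744051; returns 0 when the
    # string cannot be parsed.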
    def _restore_bytes(self, formatted_size):
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

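    # Scrape the download links block. The markup targeted by FORMAT_REGEX
    # looks roughly like this (shape reconstructed from the regex itself):
    #   <a href="...">MP3</a> <span class="usage">(Audio only)</span>
    #   <div class="popup rounded"><h3>File size</h3>11.2 MB</div>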
    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

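    # Prefer the 'title' meta tag; fall back to og:title, stripping the
    # ' (Channel 9)' suffix the site appends there.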
    def _extract_title(self, html):
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

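    # data-video_duration carries HH:MM:SS, e.g. "01:16:16" -> 4576 seconds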
    def _extract_duration(self, html):
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

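    # Strip thousands separators, e.g. '1,234' -> 1234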
    def _fix_count(self, count):
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

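    # Build one 'video' result per downloadable asset found on the page:
    # slides, source zip and the recording itself share the common metadata.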
    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return contents

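    # List pages are handled via their RSS feed: every item links back to an
    # entry/session page that this same extractor then processes.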
    def _extract_list(self, content_path):
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')
        if page_type == 'List':  # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item':  # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':  # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)
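

# A minimal smoke test, not part of the extractor itself: feed one of the
# _TESTS URLs above through YoutubeDL (run with
# `python -m youtube_dl.extractor.channel9`; assumes youtube-dl is importable
# and the URL is still live).
if __name__ == '__main__':
    from youtube_dl import YoutubeDL

    ydl = YoutubeDL({'quiet': True})
    info = ydl.extract_info(
        'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        download=False)
    # A session page may yield several entries (slides, zip, recording)
    for entry in info.get('entries', [info]):
        print(entry.get('title'))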