# youtube-dl: youtube_dl/extractor/channel9.py (PEP 8 applied)
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import ExtractorError
7
8
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality (ascending); index in this list is used as preference.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
        """Convert a human-readable size string (e.g. '12.5 MB') back to an
        approximate byte count.

        Returns 0 when the input is empty, unparsable, or uses an unknown unit.
        """
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        """Scrape the download-links markup and build a sorted formats list.

        Only qualities listed in _known_formats are kept, so the preference
        lookup below can never raise ValueError.
        """
        FORMAT_REGEX = r'''
            (?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats; iterate the match objects directly instead of
        # materializing them into a throwaway list first.
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        """Return the content title from the <meta> tag, falling back to the
        OpenGraph title with the ' (Channel 9)' suffix stripped."""
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        """Return the entry-body description, or the meta description if the
        entry-content markup is absent."""
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        """Return the duration in seconds parsed from the JSON "length" field
        (HH:MM:SS), or None if not found."""
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        """Return the slides download URL, or None if not present."""
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        """Return the zip download URL, or None if not present."""
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        """Return the average rating as a float, or 0 if not present."""
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        """Return the number of ratings, or 0 if not present."""
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        # _fix_count already returns an int; no extra int() wrapping needed.
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        """Return the view count, or 0 if not present."""
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        """Return the comment count, or 0 if not present."""
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
        """Parse a thousands-separated count like '1,234' into an int;
        pass None through unchanged."""
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        """Return the list of author names from the author block, or None if
        the block is missing."""
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        """Return the session code (e.g. 'KOS002'), or None if not present."""
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        """Return the session day label (e.g. 'Day 1'), or None."""
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        """Return the session room, or None if not present."""
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        """Return the (possibly empty) list of session speaker names."""
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        """Build the list of downloadable entries (slides, zip, recording)
        sharing common metadata, or None (with a warning) when the page offers
        nothing to download."""
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        """Extract an 'Entry' page: common content plus author metadata."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        """Extract a 'Session' page: common content plus session metadata."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_list(self, content_path):
        """Extract an event/list page as a playlist built from its RSS feed."""
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        # Dispatch on the page type embedded in the WT.entryid meta tag; pages
        # without it are assumed to be lists (see class docstring).
        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is not None:
            page_type = page_type_m.group('pagetype')
            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)

        else:  # Assuming list
            return self._extract_list(content_path)