[channel9] Add low quality formats and modernize
[youtube-dl] / youtube_dl / extractor / channel9.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7     ExtractorError,
8     parse_filesize,
9     qualities,
10 )
11
12
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        }
    ]

    # RSS feed used to enumerate entries of a list/event page.
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _formats_from_html(self, html):
        """Scrape downloadable format links from a content page's HTML.

        Returns a sorted list of youtube-dl format dicts; empty if the page
        offers no downloads.
        """
        FORMAT_REGEX = r'''
            (?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Quality ranking, worst to best, keyed by the link label on the site.
        quality = qualities((
            'MP3', 'MP4',
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
        # No need to materialize the finditer iterator into a list first.
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            # parse_filesize tolerates None when the size block is absent.
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html)]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        """Return the content title, preferring the <meta> title over og:title."""
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            # og:title carries a ' (Channel 9)' suffix that the meta title lacks.
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        """Return the raw entry-body HTML, falling back to the meta description."""
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        """Return the duration in seconds parsed from the "length" JSON field, or None."""
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        """Return the slides download URL, or None if not offered."""
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        """Return the source-code zip download URL, or None if not offered."""
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        """Return the average rating as a float, or 0 if not shown."""
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        """Return the number of ratings, or 0 if not shown."""
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        # _fix_count already returns an int; no second int() conversion needed.
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        """Return the view count, or 0 if not shown."""
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        """Return the comment count, or 0 if not shown."""
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
        """Convert a thousands-separated count string like '1,234' to an int (None passes through)."""
        # count is a regex group, hence already a str; the str() wrapper was redundant.
        return int(count.replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        """Return the list of author names, or None if the author block is absent."""
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        """Return the session code (e.g. 'KOS002'), or None."""
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        """Return the session day label (e.g. 'Day 1'), or None."""
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day').strip() if m is not None else None

    def _extract_session_room(self, html):
        """Return the session room name, or None."""
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        """Return the (possibly empty) list of session speaker names."""
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        """Build video info dicts for everything downloadable on the page.

        Returns a list with up to three entries (slides, zip, recording), or
        None (after a warning) when nothing is downloadable.
        """
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        # Metadata shared by every entry produced from this page.
        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        """Extract a single 'Entry' page; such pages must yield exactly one item."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result = contents[0]
        result['authors'] = self._extract_authors(html)

        return result

    def _extract_session(self, html, content_path):
        """Extract an event 'Session' page as a playlist of its downloadables."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        # Session metadata applies to every entry (recording, slides, zip).
        for content in contents:
            content.update(session_meta)

        return self.playlist_result(contents)

    def _extract_list(self, content_path):
        """Extract a list/event page as a playlist via its RSS feed."""
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        # Page type is encoded in the WT.entryid meta tag, not in the URL.
        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is not None:
            page_type = page_type_m.group('pagetype')
            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)

        else:  # Assuming list
            return self._extract_list(content_path)