[channel9] Initial implementation (#1885)
[youtube-dl] youtube_dl/extractor/channel9.py
# encoding: utf-8

import re

from .common import InfoExtractor
from ..utils import (
    format_bytes,
    ExtractorError,
)

class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined from the
    Search.PageType meta tag in the web page HTML rather than from the URL
    itself, as the URL alone does not always tell them apart.
    '''
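    # The page type is read from markup of roughly this shape (a sketch,
    # matching the regex used in _real_extract below):
    #   <meta name="Search.PageType" content="Session"/>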
    IE_DESC = u'Channel 9'
    IE_NAME = u'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                u'duration': 4576,
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:a6d5cfd9ee46d1851cf6e40ea61cfc10',
                u'duration': 1540,
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [ u'Mike Wilmot' ],
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
    _EXTRACT_ENTRY_ITEMS_FROM_RSS = False

    # Sorted by quality (worst to best)
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
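        '''Convert a human-formatted size (e.g. u'1.5 MB' -> 1572864 bytes)
        back into an approximate byte count; returns 0 on unparseable input.'''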
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
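        # The regex targets download links shaped roughly like this sketch
        # (not verbatim site markup):
        #   <a href="...">High Quality MP4</a>
        #   <span class="usage">(...)</span>
        #   <div class="popup rounded"><h3>File size</h3>746.5 MB</div>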
        # Extract known formats
        formats = [{'url': x.group('url'),
                    'format_id': x.group('quality'),
                    'format_note': x.group('note'),
                    'format': '%s (%s)' % (x.group('quality'), x.group('note')),
                    'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
                    } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]
        # Sort according to known formats list
        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
        return formats

    def _formats_from_rss_item(self, item):
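        '''Build the formats list from media:content elements of an RSS item.

        Expected media layout inside <item> (a sketch inferred from the
        queries below, not a verbatim feed):
            <media:group>
                <media:content url="..." type="video/mp4" fileSize="..."/>
            </media:group>
        '''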

        def process_formats(elem):
            formats = []
            for media_content in elem.findall('./{http://search.yahoo.com/mrss/}content'):
                url = media_content.attrib['url']
                # Ignore unrelated media
                if url.endswith('.ism/manifest'):
                    continue
                format_note = media_content.attrib['type']
                filesize = int(media_content.attrib['fileSize'])
                formats.append({'url': url,
                                'format_note': format_note,
                                'format': '%s %s' % (format_note, format_bytes(filesize)),
                                'filesize': filesize,
                                })
            return formats

        formats = []

        for media_group in item.findall('./{http://search.yahoo.com/mrss/}group'):
            formats.extend(process_formats(media_group))

        # Sometimes there are no media:group elements in an item, but there is
        # a media:content right in the item (usually when there is only one
        # media source)
        formats.extend(process_formats(item))

        # Sort by file size
        formats.sort(key=lambda fmt: fmt['filesize'])
        return formats

    def _extract_title(self, html):
        title = self._html_search_meta(u'title', html, u'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = u' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta(u'description', html, u'description')

    def _extract_duration(self, html):
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
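        '''Strip thousands separators from a count (u'2,396' -> 2396);
        returns None for None input.'''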
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
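        '''Extract the metadata and downloadable content (recording, slides,
        zip) common to Entry.Item and Session pages; returns None when the
        page has nothing to download.'''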
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning(u'No recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {'_type': 'video',
                  'id': content_path,
                  'description': description,
                  'thumbnail': thumbnail,
                  'duration': duration,
                  'avg_rating': avg_rating,
                  'rating_count': rating_count,
                  'view_count': view_count,
                  'comment_count': comment_count,
                  }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({ 'title': title + '-Slides', 'url': slides })
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({ 'title': title + '-Zip', 'url': zip_ })
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({ 'title': title, 'formats': formats })
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {'session_code': self._extract_session_code(html),
                        'session_day': self._extract_session_day(html),
                        'session_room': self._extract_session_room(html),
                        'session_speakers': self._extract_session_speakers(html),
                        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_content_rss(self, rss):
        '''
        Extracts links to entry items directly from the RSS feed.
        This approach is faster than extracting from the web pages
        one by one, but has some drawbacks.
        Pros:
         - no need to download additional pages
         - provides more media links
         - accurate file size
        Cons:
         - less metadata provided
         - links to media files carry no data suitable for use as format_id
         - RSS does not contain links to presentation materials (slides, zip)
        '''
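        # Relevant feed structure (a sketch inferred from the queries below,
        # not a verbatim Channel 9 feed):
        #   <channel>
        #     <item>
        #       <link>...</link> <title>...</title> <description>...</description>
        #       <media:thumbnail>...</media:thumbnail>
        #       <itunes:duration>...</itunes:duration>
        #       <dc:creator>Speaker One, Speaker Two</dc:creator>
        #     </item>
        #   </channel>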
        entries = []
        for item in rss.findall('./channel/item'):
            url = item.find('./link').text
            video_id = url.split('/')[-1]
            formats = self._formats_from_rss_item(item)

            if len(formats) == 0:
                self._downloader.report_warning(u'The recording for session %s is not yet available' % video_id)
                continue

            title = item.find('./title').text
            description = item.find('./description').text

            thumbnail = item.find('./{http://search.yahoo.com/mrss/}thumbnail').text

            duration_e = item.find('./{http://www.itunes.com/dtds/podcast-1.0.dtd}duration')
            duration = duration_e.text if duration_e is not None else 0

            speakers_e = item.find('./{http://purl.org/dc/elements/1.1/}creator')
            speakers = speakers_e.text.split(', ') if speakers_e is not None and speakers_e.text else []

            entries.append({'_type': 'video',
                            'id': video_id,
                            'formats': formats,
                            'title': title,
                            'description': description,
                            'thumbnail': thumbnail,
                            'duration': duration,
                            'session_speakers': speakers,
                            })
        return entries

    def _extract_list(self, content_path):
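        '''Process a List page: either expand entries straight from the RSS
        feed or hand each item URL back to this extractor one by one,
        depending on _EXTRACT_ENTRY_ITEMS_FROM_RSS.'''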
        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
        if self._EXTRACT_ENTRY_ITEMS_FROM_RSS:
            return self._extract_content_rss(rss)
        else:
            entries = [self.url_result(session_url.text, 'Channel9')
                       for session_url in rss.findall('./channel/item/link')]
            title_text = rss.find('./channel/title').text
            return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, u'Downloading web page')

        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')
        if page_type == 'List':         # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':    # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)