[channel9] Extract more formats
[youtube-dl] youtube_dl/extractor/channel9.py
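Collect formats from both the on-page format <select> and the OData API's per-quality URL fields, deduplicating by URL and ranking everything with a shared quality table; original source files ('_Source.' URLs) rank above all transcoded variants.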
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    unescapeHTML,
    int_or_none,
    parse_iso8601,
    clean_html,
    qualities,
)


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    Whether a URL points at a video or a playlist is determined from the
    page HTML (presence of embedded episode data) rather than from the URL
    itself, since the URL alone does not always make this clear.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
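    # <contentpath> identifies the content; an optional trailing /RSS
    # switches extraction to playlist mode.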
    _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'

    _TESTS = [{
        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        'md5': '32083d4eaf1946db6d454313f44510ca',
        'info_dict': {
            'id': '6c413323-383a-49dc-88f9-a22800cab024',
            'ext': 'wmv',
            'title': 'Developer Kick-Off Session: Stuff We Love',
            'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
            'duration': 4576,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1377717420,
            'upload_date': '20130828',
            'session_code': 'KOS002',
            'session_room': 'Arena 1A',
            'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
        },
    }, {
        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
        'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
        'info_dict': {
            'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
            'ext': 'wmv',
            'title': 'Self-service BI with Power BI - nuclear testing',
            'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
            'duration': 1540,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1386381991,
            'upload_date': '20131207',
            'authors': ['Mike Wilmot'],
        },
    }, {
        # low quality mp4 is best
        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
        'info_dict': {
            'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
            'ext': 'mp4',
            'title': 'Ranges for the Standard Library',
            'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
            'duration': 5646,
            'thumbnail': r're:https?://.*\.jpg',
            'upload_date': '20150930',
            'timestamp': 1443640735,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
        'info_dict': {
            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
            'title': 'Channel 9',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
        'only_matching': True,
    }, {
        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
        'only_matching': True,
    }]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _extract_list(self, video_id, rss_url=None):
        if not rss_url:
            rss_url = self._RSS_URL % video_id
        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
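        # Each <item><link> in the feed points at an individual video page;
        # delegate every one of them back to this extractor.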
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, video_id, title_text)

    def _real_extract(self, url):
        content_path, rss = re.match(self._VALID_URL, url).groups()

        if rss:
            return self._extract_list(content_path, url)

        webpage = self._download_webpage(
            url, content_path, 'Downloading web page')

        episode_data = self._search_regex(
            r"data-episode='([^']+)'", webpage, 'episode data', default=None)
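        # The player embeds its metadata as HTML-escaped JSON in a
        # data-episode attribute; without it we fall back to RSS extraction.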
        if episode_data:
            episode_data = self._parse_json(unescapeHTML(
                episode_data), content_path)
            content_id = episode_data['contentId']
            is_session = '/Sessions(' in episode_data['api']
            content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
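            # OData $expand inlines the related Speakers/Authors entities
            # into the response, saving a second request.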
            if is_session:
                content_url += '?$expand=Speakers'
            else:
                content_url += '?$expand=Authors'
            content_data = self._download_json(content_url, content_id)
            title = content_data['Title']

            QUALITIES = (
                'mp3',
                'wmv', 'mp4',
                'wmv-low', 'mp4-low',
                'wmv-mid', 'mp4-mid',
                'wmv-high', 'mp4-high',
            )
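            # qualities() ranks later entries higher; quality() below ranks
            # original source files ('_Source.' URLs) above them all.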

            quality_key = qualities(QUALITIES)

            def quality(quality_id, format_url):
                return (len(QUALITIES) if '_Source.' in format_url
                        else quality_key(quality_id))

            formats = []
            urls = set()
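            # Map the human-readable labels used by the page's format
            # <select> control to the canonical quality ids above.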

            SITE_QUALITIES = {
                'MP3': 'mp3',
                'MP4': 'mp4',
                'Low Quality WMV': 'wmv-low',
                'Low Quality MP4': 'mp4-low',
                'Mid Quality WMV': 'wmv-mid',
                'Mid Quality MP4': 'mp4-mid',
                'High Quality WMV': 'wmv-high',
                'High Quality MP4': 'mp4-high',
            }

            formats_select = self._search_regex(
                r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage,
                'formats select', default=None)
            if formats_select:
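                # Each <option> carries the direct download URL in its value
                # attribute and the quality label as its text; dedupe by URL.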
                for mobj in re.finditer(
                        r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<',
                        formats_select):
                    format_url = mobj.group('url')
                    if format_url in urls:
                        continue
                    urls.add(format_url)
                    format_id = mobj.group('format')
                    quality_id = SITE_QUALITIES.get(format_id, format_id)
                    formats.append({
                        'url': format_url,
                        'format_id': quality_id,
                        'quality': quality(quality_id, format_url),
                        'vcodec': 'none' if quality_id == 'mp3' else None,
                    })
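            # The OData response may also expose direct URLs in dedicated
            # per-quality fields; add any not already collected from the page.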

            API_QUALITIES = {
                'VideoMP4Low': 'mp4-low',
                'VideoWMV': 'wmv-mid',
                'VideoMP4Medium': 'mp4-mid',
                'VideoMP4High': 'mp4-high',
                'VideoWMVHQ': 'wmv-high',  # must match a QUALITIES entry to sort correctly
            }

            for format_id, q in API_QUALITIES.items():
                q_url = content_data.get(format_id)
                if not q_url or q_url in urls:
                    continue
                urls.add(q_url)
                formats.append({
                    'url': q_url,
                    'format_id': q,
                    'quality': quality(q, q_url),
                })

            self._sort_formats(formats)

            slides = content_data.get('Slides')
            zip_file = content_data.get('ZipFile')
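            # Slides and zip archives are downloadable content in their own
            # right, so only fail when nothing at all is available.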

            if not formats and not slides and not zip_file:
                raise ExtractorError(
                    'None of the recording, slides or zip file is available for %s' % content_path)

            subtitles = {}
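            # Captions come as a list of objects carrying Url and Language;
            # group them by language code.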
            for caption in content_data.get('Captions', []):
                caption_url = caption.get('Url')
                if not caption_url:
                    continue
                subtitles.setdefault(caption.get('Language', 'en'), []).append({
                    'url': caption_url,
                    'ext': 'vtt',
                })

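            # Metadata shared by every entry (recording, slides or zip).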
            common = {
                'id': content_id,
                'title': title,
                'description': clean_html(content_data.get('Description') or content_data.get('Body')),
                'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
                'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
                'timestamp': parse_iso8601(content_data.get('PublishedDate')),
                'avg_rating': int_or_none(content_data.get('Rating')),
                'rating_count': int_or_none(content_data.get('RatingCount')),
                'view_count': int_or_none(content_data.get('Views')),
                'comment_count': int_or_none(content_data.get('CommentCount')),
                'subtitles': subtitles,
            }
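            # Sessions expose Code, Room and Speakers; other content types
            # carry Authors instead.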
            if is_session:
                speakers = []
                for s in content_data.get('Speakers', []):
                    speaker_name = s.get('FullName')
                    if not speaker_name:
                        continue
                    speakers.append(speaker_name)

                common.update({
                    'session_code': content_data.get('Code'),
                    'session_room': content_data.get('Room'),
                    'session_speakers': speakers,
                })
            else:
                authors = []
                for a in content_data.get('Authors', []):
                    author_name = a.get('DisplayName')
                    if not author_name:
                        continue
                    authors.append(author_name)
                common['authors'] = authors

            contents = []
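            # The recording, slides and zip archive each become a separate
            # entry of the resulting playlist.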

            if slides:
                d = common.copy()
                d.update({'title': title + '-Slides', 'url': slides})
                contents.append(d)

            if zip_file:
                d = common.copy()
                d.update({'title': title + '-Zip', 'url': zip_file})
                contents.append(d)

            if formats:
                d = common.copy()
                d.update({'title': title, 'formats': formats})
                contents.append(d)
            return self.playlist_result(contents)
        else:
            return self._extract_list(content_path)