[channel9] fix extraction (closes #11323)

[youtube-dl] / youtube_dl / extractor / channel9.py
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py

index 574881b70de67b9521b5e813f0cafa6da59d1068..b1cb585309380eb4127658d539ec73355a2f2f1c 100644 (file)
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -1,9 +1,16 @@
-# encoding: utf-8
+from __future__ import unicode_literals
  
  import re
  
  from .common import InfoExtractor
  
  import re
  
  from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    unescapeHTML,
+    int_or_none,
+    parse_iso8601,
+    clean_html,
+)
+
  
  class Channel9IE(InfoExtractor):
      '''
  
  class Channel9IE(InfoExtractor):
      '''
@@ -11,261 +18,192 @@ class Channel9IE(InfoExtractor):
  
      The type of provided URL (video or playlist) is determined according to
      meta Search.PageType from web page HTML rather than URL itself, as it is
  
      The type of provided URL (video or playlist) is determined according to
      meta Search.PageType from web page HTML rather than URL itself, as it is
-    not always possible to do.    
+    not always possible to do.
      '''
      '''
-    IE_DESC = u'Channel 9'
-    IE_NAME = u'channel9'
-    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
-
-    _TESTS = [
-        {
-            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
-            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
-            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
-            u'info_dict': {
-                u'title': u'Developer Kick-Off Session: Stuff We Love',
-                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
-                u'duration': 4576,
-                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
-                u'session_code': u'KOS002',
-                u'session_day': u'Day 1',
-                u'session_room': u'Arena 1A',
-                u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
-            },
+    IE_DESC = 'Channel 9'
+    IE_NAME = 'channel9'
+    _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
+
+    _TESTS = [{
+        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
+        'md5': '32083d4eaf1946db6d454313f44510ca',
+        'info_dict': {
+            'id': '6c413323-383a-49dc-88f9-a22800cab024',
+            'ext': 'wmv',
+            'title': 'Developer Kick-Off Session: Stuff We Love',
+            'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
+            'duration': 4576,
+            'thumbnail': r're:https?://.*\.jpg',
+            'timestamp': 1377717420,
+            'upload_date': '20130828',
+            'session_code': 'KOS002',
+            'session_room': 'Arena 1A',
+            'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
+        },
+    }, {
+        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
+        'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
+        'info_dict': {
+            'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
+            'ext': 'wmv',
+            'title': 'Self-service BI with Power BI - nuclear testing',
+            'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
+            'duration': 1540,
+            'thumbnail': r're:https?://.*\.jpg',
+            'timestamp': 1386381991,
+            'upload_date': '20131207',
+            'authors': ['Mike Wilmot'],
+        },
+    }, {
+        # low quality mp4 is best
+        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+        'info_dict': {
+            'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
+            'ext': 'mp4',
+            'title': 'Ranges for the Standard Library',
+            'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
+            'duration': 5646,
+            'thumbnail': r're:https?://.*\.jpg',
+            'upload_date': '20150930',
+            'timestamp': 1443640735,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
+        'info_dict': {
+            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
+            'title': 'Channel 9',
          },
          },
-        {
-            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
-            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
-            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
-            u'info_dict': {
-                u'title': u'Self-service BI with Power BI - nuclear testing',
-                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
-                u'duration': 1540,
-                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
-                u'authors': [ u'Mike Wilmot' ],
-            },
-        }
-    ]
+        'playlist_mincount': 100,
+    }, {
+        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
+        'only_matching': True,
+    }, {
+        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
+        'only_matching': True,
+    }]
  
      _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
  
  
      _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
  
-    # Sorted by quality
-    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
-
-    def _restore_bytes(self, formatted_size):
-        if not formatted_size:
-            return 0
-        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
-        if not m:
-            return 0
-        units = m.group('units')
-        try:
-            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
-        except ValueError:
-            return 0
-        size = float(m.group('size'))
-        return int(size * (1024 ** exponent))
-
-    def _formats_from_html(self, html):
-        FORMAT_REGEX = r'''
-            (?x)
-            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
-            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
-            (?:<div\s+class="popup\s+rounded">\s*
-            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
-            </div>)?                                                # File size part may be missing
-        '''
-        # Extract known formats
-        formats = [{
-            'url': x.group('url'),
-            'format_id': x.group('quality'),
-            'format_note': x.group('note'),
-            'format': u'%s (%s)' % (x.group('quality'), x.group('note')),
-            'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
-            'preference': self._known_formats.index(x.group('quality')),
-            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
-        } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
-
-        self._sort_formats(formats)
-
-        return formats
-
-    def _extract_title(self, html):
-        title = self._html_search_meta(u'title', html, u'title')
-        if title is None:           
-            title = self._og_search_title(html)
-            TITLE_SUFFIX = u' (Channel 9)'
-            if title is not None and title.endswith(TITLE_SUFFIX):
-                title = title[:-len(TITLE_SUFFIX)]
-        return title
-
-    def _extract_description(self, html):
-        DESCRIPTION_REGEX = r'''(?sx)
-            <div\s+class="entry-content">\s*
-            <div\s+id="entry-body">\s*
-            (?P<description>.+?)\s*
-            </div>\s*
-            </div>
-        '''
-        m = re.search(DESCRIPTION_REGEX, html)
-        if m is not None:
-            return m.group('description')
-        return self._html_search_meta(u'description', html, u'description')
-
-    def _extract_duration(self, html):
-        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
-        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
-
-    def _extract_slides(self, html):
-        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
-        return m.group('slidesurl') if m is not None else None
-
-    def _extract_zip(self, html):
-        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
-        return m.group('zipurl') if m is not None else None
-
-    def _extract_avg_rating(self, html):
-        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
-        return float(m.group('avgrating')) if m is not None else 0
-
-    def _extract_rating_count(self, html):
-        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
-        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
-
-    def _extract_view_count(self, html):
-        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
-        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
-
-    def _extract_comment_count(self, html):
-        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
-        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
-
-    def _fix_count(self, count):
-        return int(str(count).replace(',', '')) if count is not None else None
-
-    def _extract_authors(self, html):
-        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
-        if m is None:
-            return None
-        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
-
-    def _extract_session_code(self, html):
-        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
-        return m.group('code') if m is not None else None
-
-    def _extract_session_day(self, html):
-        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
-        return m.group('day') if m is not None else None
-
-    def _extract_session_room(self, html):
-        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
-        return m.group('room') if m is not None else None
-
-    def _extract_session_speakers(self, html):
-        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
-
-    def _extract_content(self, html, content_path):
-        # Look for downloadable content        
-        formats = self._formats_from_html(html)
-        slides = self._extract_slides(html)
-        zip_ = self._extract_zip(html)
-
-        # Nothing to download
-        if len(formats) == 0 and slides is None and zip_ is None:
-            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
-            return
-
-        # Extract meta
-        title = self._extract_title(html)
-        description = self._extract_description(html)
-        thumbnail = self._og_search_thumbnail(html)
-        duration = self._extract_duration(html)
-        avg_rating = self._extract_avg_rating(html)
-        rating_count = self._extract_rating_count(html)
-        view_count = self._extract_view_count(html)
-        comment_count = self._extract_comment_count(html)
-
-        common = {'_type': 'video',
-                  'id': content_path,
-                  'description': description,
-                  'thumbnail': thumbnail,
-                  'duration': duration,
-                  'avg_rating': avg_rating,
-                  'rating_count': rating_count,
-                  'view_count': view_count,
-                  'comment_count': comment_count,
-                }
-
-        result = []
-
-        if slides is not None:
-            d = common.copy()
-            d.update({ 'title': title + '-Slides', 'url': slides })
-            result.append(d)
-
-        if zip_ is not None:
-            d = common.copy()
-            d.update({ 'title': title + '-Zip', 'url': zip_ })
-            result.append(d)
-
-        if len(formats) > 0:
-            d = common.copy()
-            d.update({ 'title': title, 'formats': formats })
-            result.append(d)
-
-        return result
-
-    def _extract_entry_item(self, html, content_path):
-        contents = self._extract_content(html, content_path)
-        if contents is None:
-            return contents
-
-        authors = self._extract_authors(html)
-
-        for content in contents:
-            content['authors'] = authors
-
-        return contents
-
-    def _extract_session(self, html, content_path):
-        contents = self._extract_content(html, content_path)
-        if contents is None:
-            return contents
-
-        session_meta = {'session_code': self._extract_session_code(html),
-                        'session_day': self._extract_session_day(html),
-                        'session_room': self._extract_session_room(html),
-                        'session_speakers': self._extract_session_speakers(html),
-                        }
-
-        for content in contents:
-            content.update(session_meta)
-
-        return contents
-
-    def _extract_list(self, content_path):
-        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
+    def _extract_list(self, video_id, rss_url=None):
+        if not rss_url:
+            rss_url = self._RSS_URL % video_id
+        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
          entries = [self.url_result(session_url.text, 'Channel9')
                     for session_url in rss.findall('./channel/item/link')]
          title_text = rss.find('./channel/title').text
          entries = [self.url_result(session_url.text, 'Channel9')
                     for session_url in rss.findall('./channel/item/link')]
          title_text = rss.find('./channel/title').text
-        return self.playlist_result(entries, content_path, title_text)
+        return self.playlist_result(entries, video_id, title_text)
  
      def _real_extract(self, url):
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        content_path = mobj.group('contentpath')
-
-        webpage = self._download_webpage(url, content_path, u'Downloading web page')
-
-        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
-        if page_type_m is None:
-            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)
-
-        page_type = page_type_m.group('pagetype')
-        if page_type == 'List':         # List page, may contain list of 'item'-like objects
-            return self._extract_list(content_path)
-        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
-            return self._extract_entry_item(webpage, content_path)
-        elif page_type == 'Session':    # Event session page, may contain downloadable content
-            return self._extract_session(webpage, content_path)
+        # _VALID_URL defines exactly two groups: the content path and an
+        # optional trailing "/RSS" marker.
+        content_path, rss = re.match(self._VALID_URL, url).groups()
+
+        if rss:
+            # The URL itself is passed on as the feed URL.
+            return self._extract_list(content_path, url)
+
+        webpage = self._download_webpage(
+            url, content_path, 'Downloading web page')
+
+        episode_data = self._search_regex(
+            r"data-episode='([^']+)'", webpage, 'episode data', default=None)
+        if episode_data:
+            # The attribute value is HTML-unescaped first, then parsed as JSON.
+            episode_data = self._parse_json(unescapeHTML(
+                episode_data), content_path)
+            content_id = episode_data['contentId']
+            is_session = '/Sessions(' in episode_data['api']
+            content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
+            # Sessions are expanded with their speakers, everything else with
+            # its authors; both are read back further down.
+            content_url += '?$expand=Speakers' if is_session else '?$expand=Authors'
+            content_data = self._download_json(content_url, content_id)
+            title = content_data['Title']
+
+            # Formats are emitted in exactly this order; keys that are missing
+            # or empty in the response are skipped.
+            qualities = [
+                'VideoMP4Low',
+                'VideoWMV',
+                'VideoMP4Medium',
+                'VideoMP4High',
+                'VideoWMVHQ',
+            ]
+            formats = [{
+                'format_id': q,
+                'url': content_data[q],
+            } for q in qualities if content_data.get(q)]
+            slides = content_data.get('Slides')
+            zip_file = content_data.get('ZipFile')
+
+            if not formats and not slides and not zip_file:
+                raise ExtractorError(
+                    'None of recording, slides or zip are available for %s' % content_path)
+
+            subtitles = {}
+            # "or []" also covers a key that is present with a null value;
+            # the default argument of dict.get only covers a missing key.
+            for caption in content_data.get('Captions') or []:
+                caption_url = caption.get('Url')
+                if not caption_url:
+                    continue
+                subtitles.setdefault(caption.get('Language', 'en'), []).append({
+                    'url': caption_url,
+                    'ext': 'vtt',
+                })
+
+            # Metadata shared by every entry built for this page (each entry
+            # below starts from a copy of this dict).
+            common = {
+                'id': content_id,
+                'title': title,
+                'description': clean_html(content_data.get('Description') or content_data.get('Body')),
+                'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
+                'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
+                'timestamp': parse_iso8601(content_data.get('PublishedDate')),
+                'avg_rating': int_or_none(content_data.get('Rating')),
+                'rating_count': int_or_none(content_data.get('RatingCount')),
+                'view_count': int_or_none(content_data.get('Views')),
+                'comment_count': int_or_none(content_data.get('CommentCount')),
+                'subtitles': subtitles,
+            }
+            if is_session:
+                # Speakers without a name are dropped.
+                speakers = [
+                    s['FullName']
+                    for s in content_data.get('Speakers') or []
+                    if s.get('FullName')]
+
+                common.update({
+                    'session_code': content_data.get('Code'),
+                    'session_room': content_data.get('Room'),
+                    'session_speakers': speakers,
+                })
+            else:
+                # Authors without a display name are dropped.
+                common['authors'] = [
+                    a['DisplayName']
+                    for a in content_data.get('Authors') or []
+                    if a.get('DisplayName')]
+
+            # One entry per available artifact: slides, zip archive, recording.
+            contents = []
+
+            if slides:
+                d = common.copy()
+                d.update({'title': title + '-Slides', 'url': slides})
+                contents.append(d)
+
+            if zip_file:
+                d = common.copy()
+                d.update({'title': title + '-Zip', 'url': zip_file})
+                contents.append(d)
+
+            if formats:
+                d = common.copy()
+                d.update({'title': title, 'formats': formats})
+                contents.append(d)
+            return self.playlist_result(contents)
          else:
          else:
-            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)
-\ No newline at end of file
+            return self._extract_list(content_path)