Merge pull request #7045 from remitamine/ign

[youtube-dl] / youtube_dl / extractor / channel9.py
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index 016c4497a343f427da54907675ee4d0477b74700..c74553dcfa7c689b7fc8d69147625b1169e1e178 100644
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -1,74 +1,75 @@
-# encoding: utf-8
+from __future__ import unicode_literals
  
  import re
  
  from .common import InfoExtractor
  from ..utils import (
-    format_bytes,
      ExtractorError,
+    parse_filesize,
+    qualities,
  )
  
+
  class Channel9IE(InfoExtractor):
      '''
      Common extractor for channel9.msdn.com.
  
      The type of provided URL (video or playlist) is determined according to
      meta Search.PageType from web page HTML rather than URL itself, as it is
-    not always possible to do.    
+    not always possible to do.
      '''
-    IE_DESC = u'Channel 9'
-    IE_NAME = u'channel9'
-    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
+    IE_DESC = 'Channel 9'
+    IE_NAME = 'channel9'
+    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
  
      _TESTS = [
          {
-            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
-            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
-            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
-            u'info_dict': {
-                u'title': u'Developer Kick-Off Session: Stuff We Love',
-                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
-                u'duration': 4576,
-                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
-                u'session_code': u'KOS002',
-                u'session_day': u'Day 1',
-                u'session_room': u'Arena 1A',
-                u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
+            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
+            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
+            'info_dict': {
+                'id': 'Events/TechEd/Australia/2013/KOS002',
+                'ext': 'mp4',
+                'title': 'Developer Kick-Off Session: Stuff We Love',
+                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
+                'duration': 4576,
+                'thumbnail': 're:http://.*\.jpg',
+                'session_code': 'KOS002',
+                'session_day': 'Day 1',
+                'session_room': 'Arena 1A',
+                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
+            },
+        },
+        {
+            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
+            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
+            'info_dict': {
+                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
+                'ext': 'mp4',
+                'title': 'Self-service BI with Power BI - nuclear testing',
+                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
+                'duration': 1540,
+                'thumbnail': 're:http://.*\.jpg',
+                'authors': ['Mike Wilmot'],
              },
          },
          {
-            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
-            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
-            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
-            u'info_dict': {
-                u'title': u'Self-service BI with Power BI - nuclear testing',
-                u'description': u'md5:a6d5cfd9ee46d1851cf6e40ea61cfc10',
-                u'duration': 1540,
-                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
-                u'authors': [ u'Mike Wilmot' ],
+            # low quality mp4 is best
+            'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+            'info_dict': {
+                'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+                'ext': 'mp4',
+                'title': 'Ranges for the Standard Library',
+                'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
+                'duration': 5646,
+                'thumbnail': 're:http://.*\.jpg',
+            },
+            'params': {
+                'skip_download': True,
              },
          }
      ]
  
      _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
-    _EXTRACT_ENTRY_ITEMS_FROM_RSS = False
-
-    # Sorted by quality
-    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
-
-    def _restore_bytes(self, formatted_size):
-        if not formatted_size:
-            return 0
-        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
-        if not m:
-            return 0
-        units = m.group('units')
-        try:
-            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
-        except ValueError:
-            return 0
-        size = float(m.group('size'))
-        return int(size * (1024 ** exponent))
  
      def _formats_from_html(self, html):
          FORMAT_REGEX = r'''
@@ -79,53 +80,30 @@ class Channel9IE(InfoExtractor):
              <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
              </div>)?                                                # File size part may be missing
          '''
-        # Extract known formats
-        formats = [{'url': x.group('url'),
-                 'format_id': x.group('quality'),
-                 'format_note': x.group('note'),
-                 'format': '%s (%s)' % (x.group('quality'), x.group('note')), 
-                 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
-                 } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
-        # Sort according to known formats list
-        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
-        return formats
+        quality = qualities((
+            'MP3', 'MP4',
+            'Low Quality WMV', 'Low Quality MP4',
+            'Mid Quality WMV', 'Mid Quality MP4',
+            'High Quality WMV', 'High Quality MP4'))
+        formats = [{
+            'url': x.group('url'),
+            'format_id': x.group('quality'),
+            'format_note': x.group('note'),
+            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
+            'filesize_approx': parse_filesize(x.group('filesize')),
+            'quality': quality(x.group('quality')),
+            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
+        } for x in list(re.finditer(FORMAT_REGEX, html))]
+
+        self._sort_formats(formats)
  
-    def _formats_from_rss_item(self, item):
-
-        def process_formats(elem):
-            formats = []
-            for media_content in elem.findall('./{http://search.yahoo.com/mrss/}content'):
-                url = media_content.attrib['url']
-                # Ignore unrelated media
-                if url.endswith('.ism/manifest'):
-                    continue
-                format_note = media_content.attrib['type']
-                filesize = int(media_content.attrib['fileSize'])
-                formats.append({'url': url,
-                                'format_note': format_note,
-                                'format': '%s %s' % (format_note, format_bytes(filesize)),
-                                'filesize': filesize,
-                                })
-            return formats
-
-        formats = []
-
-        for media_group in item.findall('./{http://search.yahoo.com/mrss/}group'):
-            formats.extend(process_formats(media_group))
-
-        # Sometimes there are no media:groups in item, but there is media:content
-        # right in item (usually when there is the only media source)
-        formats.extend(process_formats(item))        
-
-        # Sort by file size
-        formats.sort(key=lambda fmt: fmt['filesize'])
          return formats
  
      def _extract_title(self, html):
-        title = self._html_search_meta(u'title', html, u'title')
-        if title is None:           
+        title = self._html_search_meta('title', html, 'title')
+        if title is None:
              title = self._og_search_title(html)
-            TITLE_SUFFIX = u' (Channel 9)'
+            TITLE_SUFFIX = ' (Channel 9)'
              if title is not None and title.endswith(TITLE_SUFFIX):
                  title = title[:-len(TITLE_SUFFIX)]
          return title
@@ -141,10 +119,10 @@ class Channel9IE(InfoExtractor):
          m = re.search(DESCRIPTION_REGEX, html)
          if m is not None:
              return m.group('description')
-        return self._html_search_meta(u'description', html, u'description')
+        return self._html_search_meta('description', html, 'description')
  
      def _extract_duration(self, html):
-        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
+        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
          return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
  
      def _extract_slides(self, html):
@@ -186,7 +164,7 @@ class Channel9IE(InfoExtractor):
  
      def _extract_session_day(self, html):
          m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
-        return m.group('day') if m is not None else None
+        return m.group('day').strip() if m is not None else None
  
      def _extract_session_room(self, html):
          m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
@@ -196,14 +174,14 @@ class Channel9IE(InfoExtractor):
          return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
  
      def _extract_content(self, html, content_path):
-        # Look for downloadable content        
+        # Look for downloadable content
          formats = self._formats_from_html(html)
          slides = self._extract_slides(html)
          zip_ = self._extract_zip(html)
  
          # Nothing to download
          if len(formats) == 0 and slides is None and zip_ is None:
-            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
+            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
              return
  
          # Extract meta
@@ -216,32 +194,33 @@ class Channel9IE(InfoExtractor):
          view_count = self._extract_view_count(html)
          comment_count = self._extract_comment_count(html)
  
-        common = {'_type': 'video',
-                  'id': content_path,
-                  'description': description,
-                  'thumbnail': thumbnail,
-                  'duration': duration,
-                  'avg_rating': avg_rating,
-                  'rating_count': rating_count,
-                  'view_count': view_count,
-                  'comment_count': comment_count,
-                }
+        common = {
+            '_type': 'video',
+            'id': content_path,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'avg_rating': avg_rating,
+            'rating_count': rating_count,
+            'view_count': view_count,
+            'comment_count': comment_count,
+        }
  
          result = []
  
          if slides is not None:
              d = common.copy()
-            d.update({ 'title': title + '-Slides', 'url': slides })
+            d.update({'title': title + '-Slides', 'url': slides})
              result.append(d)
  
          if zip_ is not None:
              d = common.copy()
-            d.update({ 'title': title + '-Zip', 'url': zip_ })
+            d.update({'title': title + '-Zip', 'url': zip_})
              result.append(d)
  
          if len(formats) > 0:
              d = common.copy()
-            d.update({ 'title': title, 'formats': formats })
+            d.update({'title': title, 'formats': formats})
              result.append(d)
  
          return result
@@ -251,101 +230,54 @@ class Channel9IE(InfoExtractor):
          if contents is None:
              return contents
  
-        authors = self._extract_authors(html)
-
-        for content in contents:
-            content['authors'] = authors
+        if len(contents) > 1:
+            raise ExtractorError('Got more than one entry')
+        result = contents[0]
+        result['authors'] = self._extract_authors(html)
  
-        return contents
+        return result
  
      def _extract_session(self, html, content_path):
          contents = self._extract_content(html, content_path)
          if contents is None:
              return contents
  
-        session_meta = {'session_code': self._extract_session_code(html),
-                        'session_day': self._extract_session_day(html),
-                        'session_room': self._extract_session_room(html),
-                        'session_speakers': self._extract_session_speakers(html),
-                        }
+        session_meta = {
+            'session_code': self._extract_session_code(html),
+            'session_day': self._extract_session_day(html),
+            'session_room': self._extract_session_room(html),
+            'session_speakers': self._extract_session_speakers(html),
+        }
  
          for content in contents:
              content.update(session_meta)
  
-        return contents
-
-    def _extract_content_rss(self, rss):
-        '''
-        Extracts links to entry items right out of RSS feed.
-        This approach is faster than extracting from web pages
-        one by one, but suffers from some problems.
-        Pros:
-         - no need to download additional pages
-         - provides more media links
-         - accurate file size
-        Cons:
-         - fewer meta data provided
-         - links to media files have no appropriate data that may be used as format_id
-         - RSS does not contain links to presentation materials (slides, zip)
-        '''
-        entries = []
-        for item in rss.findall('./channel/item'):
-            url = item.find('./link').text
-            video_id = url.split('/')[-1]
-            formats = self._formats_from_rss_item(item)
-
-            if len(formats) == 0:
-                self._downloader.report_warning(u'The recording for session %s is not yet available' % video_id)
-                continue
-
-            title = item.find('./title').text
-            description = item.find('./description').text
-
-            thumbnail = item.find('./{http://search.yahoo.com/mrss/}thumbnail').text
-
-            duration_e = item.find('./{http://www.itunes.com/dtds/podcast-1.0.dtd}duration')
-            duration = duration_e.text if duration_e is not None else 0
-
-            speakers_e = item.find('./{http://purl.org/dc/elements/1.1/}creator')
-            speakers = speakers_e.text.split(', ') if speakers_e is not None and speakers_e.text else []
-
-            entries.append({'_type': 'video',
-                            'id': video_id,
-                            'formats': formats,
-                            'title': title,
-                            'description': description,
-                            'thumbnail': thumbnail,
-                            'duration': duration,
-                            'session_speakers': speakers,                            
-                            })
-        return entries
+        return self.playlist_result(contents)
  
      def _extract_list(self, content_path):
-        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
-        if self._EXTRACT_ENTRY_ITEMS_FROM_RSS:   
-            return self._extract_content_rss(rss)
-        else:
-            entries = [self.url_result(session_url.text, 'Channel9')
-                       for session_url in rss.findall('./channel/item/link')]
-            title_text = rss.find('./channel/title').text
-            return self.playlist_result(entries, content_path, title_text)
+        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
+        entries = [self.url_result(session_url.text, 'Channel9')
+                   for session_url in rss.findall('./channel/item/link')]
+        title_text = rss.find('./channel/title').text
+        return self.playlist_result(entries, content_path, title_text)
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          content_path = mobj.group('contentpath')
  
-        webpage = self._download_webpage(url, content_path, u'Downloading web page')
-
-        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
-        if page_type_m is None:
-            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)
-
-        page_type = page_type_m.group('pagetype')
-        if page_type == 'List':         # List page, may contain list of 'item'-like objects
+        webpage = self._download_webpage(url, content_path, 'Downloading web page')
+
+        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
+        if page_type_m is not None:
+            page_type = page_type_m.group('pagetype')
+            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
+                return self._extract_entry_item(webpage, content_path)
+            elif page_type == 'Session':  # Event session page, may contain downloadable content
+                return self._extract_session(webpage, content_path)
+            elif page_type == 'Event':
+                return self._extract_list(content_path)
+            else:
+                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
+
+        else:  # Assuming list
              return self._extract_list(content_path)
-        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
-            return self._extract_entry_item(webpage, content_path)
-        elif page_type == 'Session':    # Event session page, may contain downloadable content
-            return self._extract_session(webpage, content_path)
-        else:
-            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)
-\ No newline at end of file