[GenericIE] Outsource embedded blip.tv player video id extraction to BlipTVIE and...
[youtube-dl] / youtube_dl / extractor / channel9.py
index 016c4497a343f427da54907675ee4d0477b74700..574881b70de67b9521b5e813f0cafa6da59d1068 100644 (file)
@@ -3,10 +3,7 @@
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    format_bytes,
-    ExtractorError,
-)
+from ..utils import ExtractorError
 
 class Channel9IE(InfoExtractor):
     '''
@@ -42,7 +39,7 @@ class Channel9IE(InfoExtractor):
             u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
             u'info_dict': {
                 u'title': u'Self-service BI with Power BI - nuclear testing',
-                u'description': u'md5:a6d5cfd9ee46d1851cf6e40ea61cfc10',
+                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                 u'duration': 1540,
                 u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                 u'authors': [ u'Mike Wilmot' ],
@@ -51,7 +48,6 @@ class Channel9IE(InfoExtractor):
     ]
 
     _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
-    _EXTRACT_ENTRY_ITEMS_FROM_RSS = False
 
     # Sorted by quality
     _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
@@ -80,45 +76,18 @@ class Channel9IE(InfoExtractor):
             </div>)?                                                # File size part may be missing
         '''
         # Extract known formats
-        formats = [{'url': x.group('url'),
-                 'format_id': x.group('quality'),
-                 'format_note': x.group('note'),
-                 'format': '%s (%s)' % (x.group('quality'), x.group('note')), 
-                 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
-                 } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
-        # Sort according to known formats list
-        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
-        return formats
+        formats = [{
+            'url': x.group('url'),
+            'format_id': x.group('quality'),
+            'format_note': x.group('note'),
+            'format': u'%s (%s)' % (x.group('quality'), x.group('note')),
+            'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
+            'preference': self._known_formats.index(x.group('quality')),
+            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
+        } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
+
+        self._sort_formats(formats)
 
-    def _formats_from_rss_item(self, item):
-
-        def process_formats(elem):
-            formats = []
-            for media_content in elem.findall('./{http://search.yahoo.com/mrss/}content'):
-                url = media_content.attrib['url']
-                # Ignore unrelated media
-                if url.endswith('.ism/manifest'):
-                    continue
-                format_note = media_content.attrib['type']
-                filesize = int(media_content.attrib['fileSize'])
-                formats.append({'url': url,
-                                'format_note': format_note,
-                                'format': '%s %s' % (format_note, format_bytes(filesize)),
-                                'filesize': filesize,
-                                })
-            return formats
-
-        formats = []
-
-        for media_group in item.findall('./{http://search.yahoo.com/mrss/}group'):
-            formats.extend(process_formats(media_group))
-
-        # Sometimes there are no media:groups in item, but there is media:content
-        # right in item (usually when there is the only media source)
-        formats.extend(process_formats(item))        
-
-        # Sort by file size
-        formats.sort(key=lambda fmt: fmt['filesize'])
         return formats
 
     def _extract_title(self, html):
@@ -274,61 +243,12 @@ class Channel9IE(InfoExtractor):
 
         return contents
 
-    def _extract_content_rss(self, rss):
-        '''
-        Extracts links to entry items right out of RSS feed.
-        This approach is faster than extracting from web pages
-        one by one, but suffers from some problems.
-        Pros:
-         - no need to download additional pages
-         - provides more media links
-         - accurate file size
-        Cons:
-         - fewer meta data provided
-         - links to media files have no appropriate data that may be used as format_id
-         - RSS does not contain links to presentation materials (slides, zip)
-        '''
-        entries = []
-        for item in rss.findall('./channel/item'):
-            url = item.find('./link').text
-            video_id = url.split('/')[-1]
-            formats = self._formats_from_rss_item(item)
-
-            if len(formats) == 0:
-                self._downloader.report_warning(u'The recording for session %s is not yet available' % video_id)
-                continue
-
-            title = item.find('./title').text
-            description = item.find('./description').text
-
-            thumbnail = item.find('./{http://search.yahoo.com/mrss/}thumbnail').text
-
-            duration_e = item.find('./{http://www.itunes.com/dtds/podcast-1.0.dtd}duration')
-            duration = duration_e.text if duration_e is not None else 0
-
-            speakers_e = item.find('./{http://purl.org/dc/elements/1.1/}creator')
-            speakers = speakers_e.text.split(', ') if speakers_e is not None and speakers_e.text else []
-
-            entries.append({'_type': 'video',
-                            'id': video_id,
-                            'formats': formats,
-                            'title': title,
-                            'description': description,
-                            'thumbnail': thumbnail,
-                            'duration': duration,
-                            'session_speakers': speakers,                            
-                            })
-        return entries
-
     def _extract_list(self, content_path):
         rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
-        if self._EXTRACT_ENTRY_ITEMS_FROM_RSS:   
-            return self._extract_content_rss(rss)
-        else:
-            entries = [self.url_result(session_url.text, 'Channel9')
-                       for session_url in rss.findall('./channel/item/link')]
-            title_text = rss.find('./channel/title').text
-            return self.playlist_result(entries, content_path, title_text)
+        entries = [self.url_result(session_url.text, 'Channel9')
+                   for session_url in rss.findall('./channel/item/link')]
+        title_text = rss.find('./channel/title').text
+        return self.playlist_result(entries, content_path, title_text)
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)