X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fchannel9.py;h=574881b70de67b9521b5e813f0cafa6da59d1068;hb=19dab5e6cc9e0a7a726af8bb67ca30801c2107b0;hp=016c4497a343f427da54907675ee4d0477b74700;hpb=df5374743615fe54178b3942b115e7168ce0cd97;p=youtube-dl diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 016c4497a..574881b70 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -3,10 +3,7 @@ import re from .common import InfoExtractor -from ..utils import ( - format_bytes, - ExtractorError, -) +from ..utils import ExtractorError class Channel9IE(InfoExtractor): ''' @@ -42,7 +39,7 @@ class Channel9IE(InfoExtractor): u'md5': u'b43ee4529d111bc37ba7ee4f34813e68', u'info_dict': { u'title': u'Self-service BI with Power BI - nuclear testing', - u'description': u'md5:a6d5cfd9ee46d1851cf6e40ea61cfc10', + u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', u'duration': 1540, u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', u'authors': [ u'Mike Wilmot' ], @@ -51,7 +48,6 @@ class Channel9IE(InfoExtractor): ] _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - _EXTRACT_ENTRY_ITEMS_FROM_RSS = False # Sorted by quality _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] @@ -80,45 +76,18 @@ class Channel9IE(InfoExtractor): )? # File size part may be missing ''' # Extract known formats - formats = [{'url': x.group('url'), - 'format_id': x.group('quality'), - 'format_note': x.group('note'), - 'format': '%s (%s)' % (x.group('quality'), x.group('note')), - 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate - } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] - # Sort according to known formats list - formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) - return formats + formats = [{ + 'url': x.group('url'), + 'format_id': x.group('quality'), + 'format_note': x.group('note'), + 'format': u'%s (%s)' % (x.group('quality'), x.group('note')), + 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate + 'preference': self._known_formats.index(x.group('quality')), + 'vcodec': 'none' if x.group('note') == 'Audio only' else None, + } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] + + self._sort_formats(formats) - def _formats_from_rss_item(self, item): - - def process_formats(elem): - formats = [] - for media_content in elem.findall('./{http://search.yahoo.com/mrss/}content'): - url = media_content.attrib['url'] - # Ignore unrelated media - if url.endswith('.ism/manifest'): - continue - format_note = media_content.attrib['type'] - filesize = int(media_content.attrib['fileSize']) - formats.append({'url': url, - 'format_note': format_note, - 'format': '%s %s' % (format_note, format_bytes(filesize)), - 'filesize': filesize, - }) - return formats - - formats = [] - - for media_group in item.findall('./{http://search.yahoo.com/mrss/}group'): - formats.extend(process_formats(media_group)) - - # Sometimes there are no media:groups in item, but there is media:content - # right in item (usually when there is the only media source) - formats.extend(process_formats(item)) - - # Sort by file size - formats.sort(key=lambda fmt: fmt['filesize']) return formats def _extract_title(self, html): @@ -274,61 +243,12 @@ class Channel9IE(InfoExtractor): return contents - def _extract_content_rss(self, rss): - ''' - Extracts links to entry items right out of RSS feed. - This approach is faster than extracting from web pages - one by one, but suffers from some problems. - Pros: - - no need to download additional pages - - provides more media links - - accurate file size - Cons: - - fewer meta data provided - - links to media files have no appropriate data that may be used as format_id - - RSS does not contain links to presentation materials (slides, zip) - ''' - entries = [] - for item in rss.findall('./channel/item'): - url = item.find('./link').text - video_id = url.split('/')[-1] - formats = self._formats_from_rss_item(item) - - if len(formats) == 0: - self._downloader.report_warning(u'The recording for session %s is not yet available' % video_id) - continue - - title = item.find('./title').text - description = item.find('./description').text - - thumbnail = item.find('./{http://search.yahoo.com/mrss/}thumbnail').text - - duration_e = item.find('./{http://www.itunes.com/dtds/podcast-1.0.dtd}duration') - duration = duration_e.text if duration_e is not None else 0 - - speakers_e = item.find('./{http://purl.org/dc/elements/1.1/}creator') - speakers = speakers_e.text.split(', ') if speakers_e is not None and speakers_e.text else [] - - entries.append({'_type': 'video', - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'session_speakers': speakers, - }) - return entries - def _extract_list(self, content_path): rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS') - if self._EXTRACT_ENTRY_ITEMS_FROM_RSS: - return self._extract_content_rss(rss) - else: - entries = [self.url_result(session_url.text, 'Channel9') - for session_url in rss.findall('./channel/item/link')] - title_text = rss.find('./channel/title').text - return self.playlist_result(entries, content_path, title_text) + entries = [self.url_result(session_url.text, 'Channel9') + for session_url in rss.findall('./channel/item/link')] + title_text = rss.find('./channel/title').text + return self.playlist_result(entries, content_path, title_text) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url)