From: Kacper Michajłow Date: Wed, 9 Mar 2016 19:55:27 +0000 (+0100) Subject: [cda] Add new extractor for cda.pl X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=8b0d7a66ef5451556bb8ae5b085c7bef4c992f8b;p=youtube-dl [cda] Add new extractor for cda.pl Fixes #8760 --- diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b3bc38916..5f5eca42b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -108,6 +108,7 @@ from .cbsnews import ( ) from .cbssports import CBSSportsIE from .ccc import CCCIE +from .cda import CDAIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chaturbate import ChaturbateIE diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py new file mode 100755 index 000000000..4c53b8dda --- /dev/null +++ b/youtube_dl/extractor/cda.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + ExtractorError, + parse_duration +) + + +class CDAIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|ebd)\.)?cda\.pl/(?:video|[0-9]+x[0-9]+)/(?P[0-9a-z]+)' + _TESTS = [ + { + 'url': 'http://www.cda.pl/video/5749950c', + 'md5': '6f844bf51b15f31fae165365707ae970', + 'info_dict': { + 'id': '5749950c', + 'ext': 'mp4', + 'height': 720, + 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', + 'duration': 39 + } + }, + { + 'url': 'http://www.cda.pl/video/57413289', + 'md5': 'a88828770a8310fc00be6c95faf7f4d5', + 'info_dict': { + 'id': '57413289', + 'ext': 'mp4', + 'title': 'Lądowanie na lotnisku na Maderze', + 'duration': 137 + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage('http://ebd.cda.pl/0x0/' + video_id, video_id) + + if 'Ten film jest dostępny dla użytkowników premium' in webpage: + raise ExtractorError('This video is only available for premium users.', expected=True) + + title = self._html_search_regex(r'(.+?)', webpage, 'title', fatal=False) + + def _get_format(page, version=''): + unpacked = decode_packed_codes(page) + duration = self._search_regex(r"duration:\\'(.+?)\\'", unpacked, 'duration', fatal=False) + format_id = None + height = None + + m = re.search(r'(?P[0-9]+)p<\/a>', page) + if m: + format_id = m.group('format_id') + height = int(m.group('height')) + + url = self._search_regex(r"url:\\'(.+?)\\'", unpacked, version + ' url', fatal=False) + if url is None: + return None + + return { + 'format_id': format_id, + 'height': height, + 'url': url + }, parse_duration(duration) + + formats = [] + + format_desc, duration = _get_format(webpage) or (None, None) + if format_desc is not None: + formats.append(format_desc) + + pattern = re.compile(r'([0-9]+p)<\/a>') + for version in re.findall(pattern, webpage): + webpage = self._download_webpage(version[0], video_id, 'Downloading %s version information' % version[1], fatal=False) + if not webpage: + # Manually report warning because empty page is returned when invalid version is requested. + self.report_warning('Unable to download %s version information' % version[1]) + continue + + format_desc, duration_ = _get_format(webpage, version[1]) or (None, None) + duration = duration or duration_ + if format_desc is not None: + formats.append(format_desc) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'duration': duration + }