X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fceskatelevize.py;h=345ac5e859d7e6b66af789f99beabdf9f1c36cc7;hb=ecd1936695e73ba850d0618828b4a40d7d16c091;hp=97feb6704075831fb8b5ef95a547428ecc57ec3f;hpb=cf372f0778e82cdc181a6173909589e640ac29fb;p=youtube-dl diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 97feb6704..345ac5e85 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -3,55 +3,51 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor -from ..utils import ( +from .subtitles import SubtitlesInfoExtractor +from ..compat import ( compat_urllib_request, compat_urllib_parse, compat_urllib_parse_urlparse, +) +from ..utils import ( ExtractorError, + float_or_none, + HEADRequest, ) -class CeskaTelevizeIE(InfoExtractor): +class CeskaTelevizeIE(SubtitlesInfoExtractor): _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P[^?#]+)' _TESTS = [ { - 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka', - 'info_dict': { - 'id': '213512120230004', - 'ext': 'flv', - 'title': 'První republika: Španělská chřipka', - 'duration': 3107.4, - }, - 'params': { - 'skip_download': True, # requires rtmpdump - }, - 'skip': 'Works only from Czech Republic.', - }, - { - 'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt', + 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', 'info_dict': { - 'id': '20138143440', - 'ext': 'flv', - 'title': 'Tsatsiki, maminka a policajt', - 'duration': 6754.1, + 'id': '214411058091220', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace', + 'description': 'Věda a současná civilizace. Interaktivní pořad - prostor pro vaše otázky a komentáře', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 3350, }, 'params': { - 'skip_download': True, # requires rtmpdump + # m3u8 download + 'skip_download': True, }, - 'skip': 'Works only from Czech Republic.', }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', 'info_dict': { 'id': '14716', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'První republika: Zpěvačka z Dupárny Bobina', - 'duration': 90, + 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 88.4, }, 'params': { - 'skip_download': True, # requires rtmpdump + # m3u8 download + 'skip_download': True, }, }, ] @@ -78,8 +74,9 @@ class CeskaTelevizeIE(InfoExtractor): 'requestSource': 'iVysilani', } - req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url', - data=compat_urllib_parse.urlencode(data)) + req = compat_urllib_request.Request( + 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + data=compat_urllib_parse.urlencode(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') req.add_header('x-addr', '127.0.0.1') @@ -88,39 +85,81 @@ class CeskaTelevizeIE(InfoExtractor): playlistpage = self._download_json(req, video_id) - req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url'])) + playlist_url = playlistpage['url'] + if playlist_url == 'error_region': + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + + req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url)) req.add_header('Referer', url) - playlist = self._download_xml(req, video_id) + playlist = self._download_json(req, video_id) + item = playlist['playlist'][0] formats = [] - for i in playlist.find('smilRoot/body'): - if 'AD' not in i.attrib['id']: - base_url = i.attrib['base'] - parsedurl = compat_urllib_parse_urlparse(base_url) - duration = i.attrib['duration'] - - for video in i.findall('video'): - if video.attrib['label'] != 'AD': - format_id = video.attrib['label'] - play_path = video.attrib['src'] - vbr = int(video.attrib['system-bitrate']) - - formats.append({ - 'format_id': format_id, - 'url': base_url, - 'vbr': vbr, - 'play_path': play_path, - 'app': parsedurl.path[1:] + '?' + parsedurl.query, - 'rtmp_live': True, - 'ext': 'flv', - }) - + for format_id, stream_url in item['streamUrls'].items(): + formats.extend(self._extract_m3u8_formats(stream_url, video_id, 'mp4')) self._sort_formats(formats) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + # subtitles + subtitles = self.extract_subtitles(video_id, webpage) + + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, webpage) + return + + subtitles = self._fix_subtitles(self.extract_subtitles(video_id, webpage)) + return { 'id': episode_id, - 'title': self._html_search_regex(r'(.+?) — iVysílání — Česká televize', webpage, 'title'), - 'duration': float(duration), + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } + + def _fix_subtitles(self, subtitles): + """ Convert milisecond-based subtitles to SRT """ + if subtitles is None: + return subtitles # subtitles not requested + + def _msectotimecode(msec): + """ Helper utility to convert miliseconds to timecode """ + components = [] + for divider in [1000, 60, 60, 100]: + components.append(msec % divider) + msec //= divider + return "{3:02}:{2:02}:{1:02},{0:03}".format(*components) + + def _fix_subtitle(subtitle): + for line in subtitle.splitlines(): + m = re.match(r"^ *([0-9]+); *([0-9]+) +([0-9]+) *$", line) + if m: + yield m.group(1) + start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) + yield "{} --> {}".format(start, stop) + else: + yield line + + fixed_subtitles = {} + for k, v in subtitles.items(): + fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v)) + return fixed_subtitles + + def _get_available_subtitles(self, video_id, webpage): + video_id = video_id.partition('-')[0] + url = 'http://imgct.ceskatelevize.cz/cache/data/ivysilani/' \ + 'subtitles/{}/{}/sub.txt'.format(video_id[:3], video_id) + req = HEADRequest(url) + sub = self._request_webpage( + req, video_id, + note="Checking subtitles", + errnote="No subtitles found", + fatal=False) + return {'cs': url} if sub else {}