X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fceskatelevize.py;h=345ac5e859d7e6b66af789f99beabdf9f1c36cc7;hb=ecd1936695e73ba850d0618828b4a40d7d16c091;hp=59f2a8e451953b45d84957369a2d8b0a2d029ae4;hpb=392017874c646cd884817fb5c16d101b67f20760;p=youtube-dl diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 59f2a8e45..345ac5e85 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -2,45 +2,55 @@ from __future__ import unicode_literals import re -import json -from .common import InfoExtractor -from ..utils import ( +from .subtitles import SubtitlesInfoExtractor +from ..compat import ( compat_urllib_request, compat_urllib_parse, compat_urllib_parse_urlparse, +) +from ..utils import ( ExtractorError, + float_or_none, + HEADRequest, ) -class CeskaTelevizeIE(InfoExtractor): +class CeskaTelevizeIE(SubtitlesInfoExtractor): _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P[^?#]+)' - _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka', - 'info_dict': { - 'id': '213512120230004', - 'ext': 'flv', - 'title': 'První republika: Španělská chřipka', - 'duration': 3107.4, - }, - 'params': { - 'skip_download': True, # requires rtmpdump + _TESTS = [ + { + 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', + 'info_dict': { + 'id': '214411058091220', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace', + 'description': 'Věda a současná civilizace. Interaktivní pořad - prostor pro vaše otázky a komentáře', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 3350, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, - 'skip': 'Works only from Czech Republic.', - }, { - 'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt', - 'info_dict': { - 'id': '20138143440', - 'ext': 'flv', - 'title': 'Tsatsiki, maminka a policajt', - 'duration': 6754.1, + { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + 'info_dict': { + 'id': '14716', + 'ext': 'mp4', + 'title': 'První republika: Zpěvačka z Dupárny Bobina', + 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 88.4, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, - 'params': { - 'skip_download': True, # requires rtmpdump - }, - 'skip': 'Works only from Czech Republic.', - }] + ] def _real_extract(self, url): url = url.replace('/porady/', '/ivysilani/').replace('/video/', '') @@ -50,9 +60,9 @@ class CeskaTelevizeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - if '

Chyba konfigurace prohlížeče.

' not in webpage: - msg = self._html_search_regex(r'

(.+?)

', webpage, 'error-message') - raise ExtractorError(msg.replace('
', ' ')) + NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' + if '%s

' % NOT_AVAILABLE_STRING in webpage: + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') @@ -64,49 +74,92 @@ class CeskaTelevizeIE(InfoExtractor): 'requestSource': 'iVysilani', } - req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url', - data=compat_urllib_parse.urlencode(data)) + req = compat_urllib_request.Request( + 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + data=compat_urllib_parse.urlencode(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') req.add_header('x-addr', '127.0.0.1') req.add_header('X-Requested-With', 'XMLHttpRequest') req.add_header('Referer', url) - playlistpage = self._download_webpage(req, video_id) + playlistpage = self._download_json(req, video_id) + + playlist_url = playlistpage['url'] + if playlist_url == 'error_region': + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - req = compat_urllib_request.Request(compat_urllib_parse.unquote(json.loads(playlistpage)['url'])) + req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url)) req.add_header('Referer', url) - playlist = self._download_xml(req, video_id) - - formats = [] - for i in playlist.find('smilRoot/body'): - if 'AD' not in i.attrib['id']: - base_url = i.attrib['base'] - parsedurl = compat_urllib_parse_urlparse(base_url) - duration = i.attrib['duration'] - - for video in i.findall('video'): - if video.attrib['label'] != 'AD': - format_id = video.attrib['label'] - play_path = video.attrib['src'] - vbr = int(video.attrib['system-bitrate']) - - formats.append({ - 'format_id': format_id, - 'url': base_url, - 'vbr': vbr, - 'play_path': play_path, - 'app': parsedurl.path[1:] + '?' + parsedurl.query, - 'rtmp_live': True, - 'ext': 'flv', - }) + playlist = self._download_json(req, video_id) + item = playlist['playlist'][0] + formats = [] + for format_id, stream_url in item['streamUrls'].items(): + formats.extend(self._extract_m3u8_formats(stream_url, video_id, 'mp4')) self._sort_formats(formats) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + # subtitles + subtitles = self.extract_subtitles(video_id, webpage) + + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, webpage) + return + + subtitles = self._fix_subtitles(self.extract_subtitles(video_id, webpage)) + return { 'id': episode_id, - 'title': self._html_search_regex(r'(.+?) — iVysílání — Česká televize', webpage, 'title'), - 'duration': float(duration), + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } + + def _fix_subtitles(self, subtitles): + """ Convert milisecond-based subtitles to SRT """ + if subtitles is None: + return subtitles # subtitles not requested + + def _msectotimecode(msec): + """ Helper utility to convert miliseconds to timecode """ + components = [] + for divider in [1000, 60, 60, 100]: + components.append(msec % divider) + msec //= divider + return "{3:02}:{2:02}:{1:02},{0:03}".format(*components) + + def _fix_subtitle(subtitle): + for line in subtitle.splitlines(): + m = re.match(r"^ *([0-9]+); *([0-9]+) +([0-9]+) *$", line) + if m: + yield m.group(1) + start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) + yield "{} --> {}".format(start, stop) + else: + yield line + + fixed_subtitles = {} + for k, v in subtitles.items(): + fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v)) + return fixed_subtitles + + def _get_available_subtitles(self, video_id, webpage): + video_id = video_id.partition('-')[0] + url = 'http://imgct.ceskatelevize.cz/cache/data/ivysilani/' \ + 'subtitles/{}/{}/sub.txt'.format(video_id[:3], video_id) + req = HEADRequest(url) + sub = self._request_webpage( + req, video_id, + note="Checking subtitles", + errnote="No subtitles found", + fatal=False) + return {'cs': url} if sub else {}