X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fceskatelevize.py;h=65f6be62313dfc623cf1f9aa7adc52282872aade;hb=233c1c0e76d64c9e13dc8968bfd8a014c49e66a8;hp=59f2a8e451953b45d84957369a2d8b0a2d029ae4;hpb=392017874c646cd884817fb5c16d101b67f20760;p=youtube-dl diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 59f2a8e45..65f6be623 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -2,45 +2,54 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_urllib_request, compat_urllib_parse, compat_urllib_parse_urlparse, +) +from ..utils import ( ExtractorError, + float_or_none, ) class CeskaTelevizeIE(InfoExtractor): _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P[^?#]+)' - _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka', - 'info_dict': { - 'id': '213512120230004', - 'ext': 'flv', - 'title': 'První republika: Španělská chřipka', - 'duration': 3107.4, - }, - 'params': { - 'skip_download': True, # requires rtmpdump + _TESTS = [ + { + 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', + 'info_dict': { + 'id': '214411058091220', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace', + 'description': 'Věda a současná civilizace. Interaktivní pořad - prostor pro vaše otázky a komentáře', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 3350, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, - 'skip': 'Works only from Czech Republic.', - }, { - 'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt', - 'info_dict': { - 'id': '20138143440', - 'ext': 'flv', - 'title': 'Tsatsiki, maminka a policajt', - 'duration': 6754.1, + { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + 'info_dict': { + 'id': '14716', + 'ext': 'mp4', + 'title': 'První republika: Zpěvačka z Dupárny Bobina', + 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 88.4, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, - 'params': { - 'skip_download': True, # requires rtmpdump - }, - 'skip': 'Works only from Czech Republic.', - }] + ] def _real_extract(self, url): url = url.replace('/porady/', '/ivysilani/').replace('/video/', '') @@ -50,9 +59,9 @@ class CeskaTelevizeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - if '

Chyba konfigurace prohlížeče.

' not in webpage: - msg = self._html_search_regex(r'

(.+?)

', webpage, 'error-message') - raise ExtractorError(msg.replace('
', ' ')) + NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' + if '%s

' % NOT_AVAILABLE_STRING in webpage: + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') @@ -64,49 +73,83 @@ class CeskaTelevizeIE(InfoExtractor): 'requestSource': 'iVysilani', } - req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url', - data=compat_urllib_parse.urlencode(data)) + req = compat_urllib_request.Request( + 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + data=compat_urllib_parse.urlencode(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') req.add_header('x-addr', '127.0.0.1') req.add_header('X-Requested-With', 'XMLHttpRequest') req.add_header('Referer', url) - playlistpage = self._download_webpage(req, video_id) + playlistpage = self._download_json(req, video_id) + + playlist_url = playlistpage['url'] + if playlist_url == 'error_region': + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - req = compat_urllib_request.Request(compat_urllib_parse.unquote(json.loads(playlistpage)['url'])) + req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url)) req.add_header('Referer', url) - playlist = self._download_xml(req, video_id) - - formats = [] - for i in playlist.find('smilRoot/body'): - if 'AD' not in i.attrib['id']: - base_url = i.attrib['base'] - parsedurl = compat_urllib_parse_urlparse(base_url) - duration = i.attrib['duration'] - - for video in i.findall('video'): - if video.attrib['label'] != 'AD': - format_id = video.attrib['label'] - play_path = video.attrib['src'] - vbr = int(video.attrib['system-bitrate']) - - formats.append({ - 'format_id': format_id, - 'url': base_url, - 'vbr': vbr, - 'play_path': play_path, - 'app': parsedurl.path[1:] + '?' + parsedurl.query, - 'rtmp_live': True, - 'ext': 'flv', - }) + playlist = self._download_json(req, video_id) + item = playlist['playlist'][0] + formats = [] + for format_id, stream_url in item['streamUrls'].items(): + formats.extend(self._extract_m3u8_formats(stream_url, video_id, 'mp4')) self._sort_formats(formats) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + subtitles = {} + subs = item.get('subtitles') + if subs: + subtitles = self.extract_subtitles(episode_id, subs) + return { 'id': episode_id, - 'title': self._html_search_regex(r'(.+?) — iVysílání — Česká televize', webpage, 'title'), - 'duration': float(duration), + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } + + def _get_subtitles(self, episode_id, subs): + original_subtitles = self._download_webpage( + subs[0]['url'], episode_id, 'Downloading subtitles') + srt_subs = self._fix_subtitles(original_subtitles) + return { + 'cs': [{ + 'ext': 'srt', + 'data': srt_subs, + }] + } + + @staticmethod + def _fix_subtitles(subtitles): + """ Convert millisecond-based subtitles to SRT """ + + def _msectotimecode(msec): + """ Helper utility to convert milliseconds to timecode """ + components = [] + for divider in [1000, 60, 60, 100]: + components.append(msec % divider) + msec //= divider + return "{3:02}:{2:02}:{1:02},{0:03}".format(*components) + + def _fix_subtitle(subtitle): + for line in subtitle.splitlines(): + m = re.match(r"^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$", line) + if m: + yield m.group(1) + start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) + yield "{0} --> {1}".format(start, stop) + else: + yield line + + return "\r\n".join(_fix_subtitle(subtitles))