[ceskatelevize] Add support for live streams
[youtube-dl] / youtube_dl / extractor / ceskatelevize.py
1 # -*- coding: utf-8 -*-
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import (
8     compat_urllib_parse_unquote,
9     compat_urllib_parse_urlparse,
10 )
11 from ..utils import (
12     ExtractorError,
13     float_or_none,
14     sanitized_Request,
15     urlencode_postdata,
16 )
17
18
class CeskaTelevizeIE(InfoExtractor):
    """Extractor for ceskatelevize.cz ``/porady/`` and ``/ivysilani/`` pages.

    The page embeds a ``getPlaylistUrl([{"type": ..., "id": ...}], ...)`` call;
    its type/id pair is posted to the site's ``get-client-playlist`` AJAX
    endpoint, which answers with the URL of a JSON playlist.  Every item of
    that playlist (VOD episode, trailer, live channel, ...) becomes one entry
    of the returned playlist result.
    """

    _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P<id>[^/#?]+)/*(?:[#?].*)?$'
    _TESTS = [{
        'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
        'info_dict': {
            'id': '61924494876951776',
            'ext': 'mp4',
            'title': 'Hyde Park Civilizace',
            'description': 'md5:fe93f6eda372d150759d11644ebbfb4a',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 3350,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        # live stream
        'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
        'info_dict': {
            'id': 402,
            'ext': 'mp4',
            'title': 're:ČT Sport.*',
            'is_live': True,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        # video with 18+ caution trailer
        'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
        'info_dict': {
            'id': '215562210900007-bogotart',
            'title': 'Queer: Bogotart',
            'description': 'Alternativní průvodce současným queer světem',
        },
        'playlist': [{
            'info_dict': {
                'id': '61924494876844842',
                'ext': 'mp4',
                'title': 'Queer: Bogotart (Varování 18+)',
                'duration': 10.2,
            },
        }, {
            'info_dict': {
                'id': '61924494877068022',
                'ext': 'mp4',
                'title': 'Queer: Bogotart (Queer)',
                'thumbnail': r're:^https?://.*\.jpg',
                'duration': 1558.3,
            },
        }],
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Build a playlist result for the given ceskatelevize.cz page URL.

        Raises ExtractorError (expected=True) when the page or the playlist
        endpoint reports that the content is blocked in the caller's region.
        """
        # Rewrite /porady/ URLs to their /ivysilani/ form and drop any
        # '/video/' segment before fetching the page.
        url = url.replace('/porady/', '/ivysilani/').replace('/video/', '')

        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
        if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

        # Both values come from the same embedded getPlaylistUrl(...) call.
        typ = self._html_search_regex(
            r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type')
        episode_id = self._html_search_regex(
            r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id')

        data = {
            'playlist[0][type]': typ,
            'playlist[0][id]': episode_id,
            'requestUrl': compat_urllib_parse_urlparse(url).path,
            'requestSource': 'iVysilani',
        }

        req = sanitized_Request(
            'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
            data=urlencode_postdata(data))

        req.add_header('Content-type', 'application/x-www-form-urlencoded')
        req.add_header('x-addr', '127.0.0.1')
        req.add_header('X-Requested-With', 'XMLHttpRequest')
        req.add_header('Referer', url)

        playlistpage = self._download_json(req, playlist_id)

        # The endpoint signals geo restriction through a sentinel "URL".
        playlist_url = playlistpage['url']
        if playlist_url == 'error_region':
            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

        req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
        req.add_header('Referer', url)

        playlist_title = self._og_search_title(webpage, default=None)
        playlist_description = self._og_search_description(webpage, default=None)

        playlist = self._download_json(req, playlist_id)['playlist']
        playlist_len = len(playlist)

        entries = []
        for item in playlist:
            # Read the item type once; .get() keeps a missing key from
            # aborting the whole extraction.
            item_type = item.get('type')
            is_live = item_type == 'LIVE'

            formats = []
            for stream_url in item['streamUrls'].values():
                formats.extend(self._extract_m3u8_formats(
                    stream_url, playlist_id, 'mp4',
                    entry_protocol='m3u8' if is_live else 'm3u8_native',
                    fatal=False))
            self._sort_formats(formats)

            item_id = item.get('id') or item['assetId']
            title = item['title']

            duration = float_or_none(item.get('duration'))
            thumbnail = item.get('previewImageUrl')

            subtitles = {}
            if item_type == 'VOD':
                subs = item.get('subtitles')
                if subs:
                    subtitles = self.extract_subtitles(episode_id, subs)

            if playlist_len == 1:
                if is_live:
                    # live streams have the channel name in the item title
                    final_title = self._live_title(title)
                else:
                    # Prefer the page (og:title) title; the item title is
                    # always present and serves as the fallback.
                    final_title = playlist_title or title
            elif playlist_title:
                final_title = '%s (%s)' % (playlist_title, title)
            else:
                # No page title: avoid producing a literal "None (...)".
                final_title = title

            entries.append({
                'id': item_id,
                'title': final_title,
                # The page description only describes a lone item, not each
                # part of a multi-item playlist.
                'description': playlist_description if playlist_len == 1 else None,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
                'subtitles': subtitles,
                'is_live': is_live,
            })

        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    def _get_subtitles(self, episode_id, subs):
        """Download the first subtitle track in ``subs`` and return it as SRT.

        ``subs`` is the item's 'subtitles' list; only ``subs[0]['url']`` is
        used.  The result is a subtitles dict with a single 'cs' entry whose
        data is the converted SRT text.
        """
        original_subtitles = self._download_webpage(
            subs[0]['url'], episode_id, 'Downloading subtitles')
        srt_subs = self._fix_subtitles(original_subtitles)
        return {
            'cs': [{
                'ext': 'srt',
                'data': srt_subs,
            }]
        }

    @staticmethod
    def _fix_subtitles(subtitles):
        """ Convert millisecond-based subtitles to SRT """

        def _msectotimecode(msec):
            """ Helper utility to convert milliseconds to timecode """
            components = []
            # Peel off milliseconds, seconds, minutes and hours in turn
            # (hours wrap at 100 to keep the two-digit field).
            for divider in [1000, 60, 60, 100]:
                components.append(msec % divider)
                msec //= divider
            return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)

        def _fix_subtitle(subtitle):
            for line in subtitle.splitlines():
                # Cue header lines look like "<index>; <start_ms> <stop_ms>".
                m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line)
                if m:
                    yield m.group(1)
                    start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
                    yield '{0} --> {1}'.format(start, stop)
                else:
                    # Text and blank lines pass through unchanged.
                    yield line

        return '\r\n'.join(_fix_subtitle(subtitles))