[ceskatelevize] Improve extraction and remove URL replacement hacks
[youtube-dl] / youtube_dl / extractor / ceskatelevize.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import (
8     compat_urllib_parse_unquote,
9     compat_urllib_parse_urlparse,
10 )
11 from ..utils import (
12     ExtractorError,
13     float_or_none,
14     sanitized_Request,
15     urlencode_postdata,
16     USER_AGENTS,
17 )
18
19
class CeskaTelevizeIE(InfoExtractor):
    """Extractor for ceskatelevize.cz ``/ivysilani/`` pages."""

    # Any number of leading path segments is skipped; the named group ``id``
    # captures the last path segment (up to a ``/``, ``#`` or ``?``).
    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
    # Sample URLs with the metadata expected from them; the last entry only
    # checks that the URL is matched by _VALID_URL.
    _TESTS = [{
        'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
        'info_dict': {
            'id': '61924494877246241',
            'ext': 'mp4',
            'title': 'Hyde Park Civilizace: Život v Grónsku',
            'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 3350,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
        'info_dict': {
            'id': '61924494877028507',
            'ext': 'mp4',
            'title': 'Hyde Park Civilizace: Bonus 01 - En',
            'description': 'English Subtittles',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 81.3,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        # live stream
        'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
        'info_dict': {
            'id': 402,
            'ext': 'mp4',
            'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
            'is_live': True,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'skip': 'Georestricted to Czech Republic',
    }, {
        'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
        'only_matching': True,
    }]
68
69     def _real_extract(self, url):
70         playlist_id = self._match_id(url)
71
72         webpage = self._download_webpage(url, playlist_id)
73
74         NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
75         if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
76             raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
77
78         type_ = None
79         episode_id = None
80
81         playlist = self._parse_json(
82             self._search_regex(
83                 r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
84                 default='{}'), playlist_id)
85         if playlist:
86             type_ = playlist.get('type')
87             episode_id = playlist.get('id')
88
89         if not type_:
90             type_ = self._html_search_regex(
91                 r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
92                 webpage, 'type')
93         if not episode_id:
94             episode_id = self._html_search_regex(
95                 r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
96                 webpage, 'episode_id')
97
98         data = {
99             'playlist[0][type]': type_,
100             'playlist[0][id]': episode_id,
101             'requestUrl': compat_urllib_parse_urlparse(url).path,
102             'requestSource': 'iVysilani',
103         }
104
105         entries = []
106
107         for user_agent in (None, USER_AGENTS['Safari']):
108             req = sanitized_Request(
109                 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
110                 data=urlencode_postdata(data))
111
112             req.add_header('Content-type', 'application/x-www-form-urlencoded')
113             req.add_header('x-addr', '127.0.0.1')
114             req.add_header('X-Requested-With', 'XMLHttpRequest')
115             if user_agent:
116                 req.add_header('User-Agent', user_agent)
117             req.add_header('Referer', url)
118
119             playlistpage = self._download_json(req, playlist_id, fatal=False)
120
121             if not playlistpage:
122                 continue
123
124             playlist_url = playlistpage['url']
125             if playlist_url == 'error_region':
126                 raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
127
128             req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
129             req.add_header('Referer', url)
130
131             playlist_title = self._og_search_title(webpage, default=None)
132             playlist_description = self._og_search_description(webpage, default=None)
133
134             playlist = self._download_json(req, playlist_id, fatal=False)
135             if not playlist:
136                 continue
137
138             playlist = playlist.get('playlist')
139             if not isinstance(playlist, list):
140                 continue
141
142             playlist_len = len(playlist)
143
144             for num, item in enumerate(playlist):
145                 is_live = item.get('type') == 'LIVE'
146                 formats = []
147                 for format_id, stream_url in item.get('streamUrls', {}).items():
148                     if 'playerType=flash' in stream_url:
149                         stream_formats = self._extract_m3u8_formats(
150                             stream_url, playlist_id, 'mp4', 'm3u8_native',
151                             m3u8_id='hls-%s' % format_id, fatal=False)
152                     else:
153                         stream_formats = self._extract_mpd_formats(
154                             stream_url, playlist_id,
155                             mpd_id='dash-%s' % format_id, fatal=False)
156                     # See https://github.com/rg3/youtube-dl/issues/12119#issuecomment-280037031
157                     if format_id == 'audioDescription':
158                         for f in stream_formats:
159                             f['source_preference'] = -10
160                     formats.extend(stream_formats)
161
162                 if user_agent and len(entries) == playlist_len:
163                     entries[num]['formats'].extend(formats)
164                     continue
165
166                 item_id = item.get('id') or item['assetId']
167                 title = item['title']
168
169                 duration = float_or_none(item.get('duration'))
170                 thumbnail = item.get('previewImageUrl')
171
172                 subtitles = {}
173                 if item.get('type') == 'VOD':
174                     subs = item.get('subtitles')
175                     if subs:
176                         subtitles = self.extract_subtitles(episode_id, subs)
177
178                 if playlist_len == 1:
179                     final_title = playlist_title or title
180                     if is_live:
181                         final_title = self._live_title(final_title)
182                 else:
183                     final_title = '%s (%s)' % (playlist_title, title)
184
185                 entries.append({
186                     'id': item_id,
187                     'title': final_title,
188                     'description': playlist_description if playlist_len == 1 else None,
189                     'thumbnail': thumbnail,
190                     'duration': duration,
191                     'formats': formats,
192                     'subtitles': subtitles,
193                     'is_live': is_live,
194                 })
195
196         for e in entries:
197             self._sort_formats(e['formats'])
198
199         return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
200
201     def _get_subtitles(self, episode_id, subs):
202         original_subtitles = self._download_webpage(
203             subs[0]['url'], episode_id, 'Downloading subtitles')
204         srt_subs = self._fix_subtitles(original_subtitles)
205         return {
206             'cs': [{
207                 'ext': 'srt',
208                 'data': srt_subs,
209             }]
210         }
211
212     @staticmethod
213     def _fix_subtitles(subtitles):
214         """ Convert millisecond-based subtitles to SRT """
215
216         def _msectotimecode(msec):
217             """ Helper utility to convert milliseconds to timecode """
218             components = []
219             for divider in [1000, 60, 60, 100]:
220                 components.append(msec % divider)
221                 msec //= divider
222             return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)
223
224         def _fix_subtitle(subtitle):
225             for line in subtitle.splitlines():
226                 m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line)
227                 if m:
228                     yield m.group(1)
229                     start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
230                     yield '{0} --> {1}'.format(start, stop)
231                 else:
232                     yield line
233
234         return '\r\n'.join(_fix_subtitle(subtitles))