[ceskatelevize:porady] Add extractor (closes #7411, closes #12645)
[youtube-dl] / youtube_dl / extractor / ceskatelevize.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import (
8     compat_urllib_parse_unquote,
9     compat_urllib_parse_urlparse,
10 )
11 from ..utils import (
12     ExtractorError,
13     float_or_none,
14     sanitized_Request,
15     unescapeHTML,
16     urlencode_postdata,
17     USER_AGENTS,
18 )
19
20
class CeskaTelevizeIE(InfoExtractor):
    """Extractor for ceskatelevize.cz /ivysilani/ pages.

    The page embeds a ``getPlaylistUrl([...])`` call naming a playlist type
    and id.  Those are posted to the ``get-client-playlist`` AJAX endpoint,
    which answers with the URL of a JSON playlist; every item of that
    playlist becomes one entry of the returned playlist result.
    """

    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
    _TESTS = [{
        'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
        'info_dict': {
            'id': '61924494877246241',
            'ext': 'mp4',
            'title': 'Hyde Park Civilizace: Život v Grónsku',
            'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 3350,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
        'info_dict': {
            'id': '61924494877028507',
            'ext': 'mp4',
            'title': 'Hyde Park Civilizace: Bonus 01 - En',
            'description': 'English Subtittles',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 81.3,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        # live stream
        'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
        'info_dict': {
            'id': 402,
            'ext': 'mp4',
            'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
            'is_live': True,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'skip': 'Georestricted to Czech Republic',
    }, {
        'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
        'only_matching': True,
    }]

    def _extract_playlist_item_formats(self, item, playlist_id):
        """Collect the formats of every stream URL listed in a playlist item.

        URLs containing ``playerType=flash`` are parsed as HLS manifests, all
        others as DASH manifests.  Both extractions are non-fatal.
        """
        formats = []
        for format_id, stream_url in item.get('streamUrls', {}).items():
            if 'playerType=flash' in stream_url:
                stream_formats = self._extract_m3u8_formats(
                    stream_url, playlist_id, 'mp4', 'm3u8_native',
                    m3u8_id='hls-%s' % format_id, fatal=False)
            else:
                stream_formats = self._extract_mpd_formats(
                    stream_url, playlist_id,
                    mpd_id='dash-%s' % format_id, fatal=False)
            # See https://github.com/rg3/youtube-dl/issues/12119#issuecomment-280037031
            # Audio-description streams are ranked below the regular ones.
            if format_id == 'audioDescription':
                for f in stream_formats:
                    f['source_preference'] = -10
            formats.extend(stream_formats)
        return formats

    def _real_extract(self, url):
        """Build a playlist result for the given iVysilani URL.

        Raises ExtractorError (expected) when the page or the playlist
        endpoint reports that the content is not available in the caller's
        territory.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
        if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

        type_ = None
        episode_id = None

        # Preferred path: parse the first object passed to getPlaylistUrl()
        # as JSON; an absent call yields an empty dict.
        playlist = self._parse_json(
            self._search_regex(
                r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
                default='{}'), playlist_id)
        if playlist:
            type_ = playlist.get('type')
            episode_id = playlist.get('id')

        # Fallback: pull the two fields out with stricter regexes (these are
        # fatal when they do not match).
        if not type_:
            type_ = self._html_search_regex(
                r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
                webpage, 'type')
        if not episode_id:
            episode_id = self._html_search_regex(
                r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
                webpage, 'episode_id')

        data = {
            'playlist[0][type]': type_,
            'playlist[0][id]': episode_id,
            'requestUrl': compat_urllib_parse_urlparse(url).path,
            'requestSource': 'iVysilani',
        }

        # Page-level metadata does not depend on the playlist requests, so it
        # is resolved once here.  This also keeps both names bound when every
        # request attempt below bails out early.
        playlist_title = self._og_search_title(webpage, default=None)
        playlist_description = self._og_search_description(webpage, default=None)

        entries = []

        # The playlist is requested twice: once with the default User-Agent
        # and once pretending to be Safari.  Formats found on the second pass
        # are merged into the entries created by the first one.
        for user_agent in (None, USER_AGENTS['Safari']):
            req = sanitized_Request(
                'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
                data=urlencode_postdata(data))

            req.add_header('Content-type', 'application/x-www-form-urlencoded')
            req.add_header('x-addr', '127.0.0.1')
            req.add_header('X-Requested-With', 'XMLHttpRequest')
            if user_agent:
                req.add_header('User-Agent', user_agent)
            req.add_header('Referer', url)

            playlistpage = self._download_json(req, playlist_id, fatal=False)

            if not playlistpage:
                continue

            playlist_url = playlistpage['url']
            if playlist_url == 'error_region':
                raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

            req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
            req.add_header('Referer', url)

            playlist = self._download_json(req, playlist_id, fatal=False)
            if not playlist:
                continue

            playlist = playlist.get('playlist')
            if not isinstance(playlist, list):
                continue

            playlist_len = len(playlist)

            for num, item in enumerate(playlist):
                is_live = item.get('type') == 'LIVE'
                formats = self._extract_playlist_item_formats(item, playlist_id)

                # Second pass with a matching item count: only contribute the
                # extra formats to the entry built on the first pass.
                if user_agent and len(entries) == playlist_len:
                    entries[num]['formats'].extend(formats)
                    continue

                item_id = item.get('id') or item['assetId']
                title = item['title']

                duration = float_or_none(item.get('duration'))
                thumbnail = item.get('previewImageUrl')

                subtitles = {}
                if item.get('type') == 'VOD':
                    subs = item.get('subtitles')
                    if subs:
                        subtitles = self.extract_subtitles(episode_id, subs)

                if playlist_len == 1:
                    final_title = playlist_title or title
                    if is_live:
                        final_title = self._live_title(final_title)
                elif playlist_title:
                    final_title = '%s (%s)' % (playlist_title, title)
                else:
                    # No page title available: avoid producing "None (...)".
                    final_title = title

                entries.append({
                    'id': item_id,
                    'title': final_title,
                    'description': playlist_description if playlist_len == 1 else None,
                    'thumbnail': thumbnail,
                    'duration': duration,
                    'formats': formats,
                    'subtitles': subtitles,
                    'is_live': is_live,
                })

        for e in entries:
            self._sort_formats(e['formats'])

        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    def _get_subtitles(self, episode_id, subs):
        """Download the first listed subtitle track and return it as Czech SRT."""
        original_subtitles = self._download_webpage(
            subs[0]['url'], episode_id, 'Downloading subtitles')
        srt_subs = self._fix_subtitles(original_subtitles)
        return {
            'cs': [{
                'ext': 'srt',
                'data': srt_subs,
            }]
        }

    @staticmethod
    def _fix_subtitles(subtitles):
        """ Convert millisecond-based subtitles to SRT

        Lines of the form "<index>; <start_ms> <stop_ms>" are replaced by an
        index line followed by an SRT timecode line; every other line is
        passed through unchanged.  Output lines are joined with CRLF.
        """

        def _msectotimecode(msec):
            """ Helper utility to convert milliseconds to timecode """
            components = []
            # Peel off milliseconds, seconds, minutes, then hours (mod 100).
            for divider in [1000, 60, 60, 100]:
                components.append(msec % divider)
                msec //= divider
            return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)

        def _fix_subtitle(subtitle):
            for line in subtitle.splitlines():
                m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line)
                if m:
                    yield m.group(1)
                    start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
                    yield '{0} --> {1}'.format(start, stop)
                else:
                    yield line

        return '\r\n'.join(_fix_subtitle(subtitles))
236
237
class CeskaTelevizePoradyIE(InfoExtractor):
    """Extractor for ceskatelevize.cz /porady/ pages.

    It only locates the player URL embedded in the page and hands it over
    to CeskaTelevizeIE.
    """

    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
    _TESTS = [{
        # video with 18+ caution trailer
        'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
        'info_dict': {
            'id': '215562210900007-bogotart',
            'title': 'Queer: Bogotart',
            'description': 'Alternativní průvodce současným queer světem',
        },
        'playlist': [{
            'info_dict': {
                'id': '61924494876844842',
                'ext': 'mp4',
                'title': 'Queer: Bogotart (Varování 18+)',
                'duration': 10.2,
            },
        }, {
            'info_dict': {
                'id': '61924494877068022',
                'ext': 'mp4',
                'title': 'Queer: Bogotart (Queer)',
                'thumbnail': r're:^https?://.*\.jpg',
                'duration': 1558.3,
            },
        }],
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Return a url_result pointing at the player URL found in the page."""
        display_id = self._match_id(url)
        page = self._download_webpage(url, display_id)

        # The player URL sits HTML-escaped in a <span ... data-url="..."> attribute.
        escaped_player_url = self._search_regex(
            r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
            page, 'iframe player url', group='url')
        player_url = unescapeHTML(escaped_player_url)

        return self.url_result(player_url, ie=CeskaTelevizeIE.ie_key())