[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / ceskatelevize.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import (
8     compat_urllib_parse_unquote,
9     compat_urllib_parse_urlparse,
10 )
11 from ..utils import (
12     ExtractorError,
13     float_or_none,
14     sanitized_Request,
15     unescapeHTML,
16     update_url_query,
17     urlencode_postdata,
18     USER_AGENTS,
19 )
20
21
class CeskaTelevizeIE(InfoExtractor):
    """Extractor for ceskatelevize.cz ``/ivysilani/`` pages.

    The page embeds a ``getPlaylistUrl([...])`` call describing the item to
    play.  Its type and id are posted to the ``get-client-playlist`` AJAX
    endpoint, which answers with the URL of a JSON playlist; every playlist
    item becomes one entry of the returned playlist result.
    """

    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
    _TESTS = [{
        'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
        'info_dict': {
            'id': '61924494877246241',
            'ext': 'mp4',
            'title': 'Hyde Park Civilizace: Život v Grónsku',
            'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 3350,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
        'info_dict': {
            'id': '61924494877028507',
            'ext': 'mp4',
            'title': 'Hyde Park Civilizace: Bonus 01 - En',
            'description': 'English Subtittles',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 81.3,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        # live stream
        'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
        'info_dict': {
            'id': 402,
            'ext': 'mp4',
            'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
            'is_live': True,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'skip': 'Georestricted to Czech Republic',
    }, {
        'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build a playlist result for the iVysilani page at ``url``.

        Raises ExtractorError (expected) when the page or the playlist
        endpoint reports that the content is not available in the caller's
        territory.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
        if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

        type_ = None
        episode_id = None

        # Preferred path: parse the first argument object of getPlaylistUrl()
        # as JSON; an absent match yields an empty dict.
        playlist = self._parse_json(
            self._search_regex(
                r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
                default='{}'), playlist_id)
        if playlist:
            type_ = playlist.get('type')
            episode_id = playlist.get('id')

        # Fallback: pull the two fields out with stricter regexes (these are
        # fatal when they do not match).
        if not type_:
            type_ = self._html_search_regex(
                r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
                webpage, 'type')
        if not episode_id:
            episode_id = self._html_search_regex(
                r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
                webpage, 'episode_id')

        data = {
            'playlist[0][type]': type_,
            'playlist[0][id]': episode_id,
            'requestUrl': compat_urllib_parse_urlparse(url).path,
            'requestSource': 'iVysilani',
        }

        # Both values depend only on the web page, so compute them once up
        # front.  This also guarantees they are bound for the final
        # playlist_result() call even when every request below fails.
        playlist_title = self._og_search_title(webpage, default=None)
        playlist_description = self._og_search_description(webpage, default=None)

        entries = []

        # The playlist is requested twice: once with the default user agent
        # and once with a Safari one.  Formats found on the second pass are
        # merged into the entries created on the first pass.
        for user_agent in (None, USER_AGENTS['Safari']):
            req = sanitized_Request(
                'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
                data=urlencode_postdata(data))

            req.add_header('Content-type', 'application/x-www-form-urlencoded')
            req.add_header('x-addr', '127.0.0.1')
            req.add_header('X-Requested-With', 'XMLHttpRequest')
            if user_agent:
                req.add_header('User-Agent', user_agent)
            req.add_header('Referer', url)

            playlistpage = self._download_json(req, playlist_id, fatal=False)

            if not playlistpage:
                continue

            playlist_url = playlistpage['url']
            if playlist_url == 'error_region':
                raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

            req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
            req.add_header('Referer', url)

            playlist = self._download_json(req, playlist_id, fatal=False)
            if not playlist:
                continue

            playlist = playlist.get('playlist')
            if not isinstance(playlist, list):
                continue

            playlist_len = len(playlist)

            for num, item in enumerate(playlist):
                is_live = item.get('type') == 'LIVE'
                formats = []
                for format_id, stream_url in item.get('streamUrls', {}).items():
                    if 'drmOnly=true' in stream_url:
                        continue
                    # URLs carrying playerType=flash are fetched as HLS
                    # manifests, everything else as DASH manifests.
                    if 'playerType=flash' in stream_url:
                        stream_formats = self._extract_m3u8_formats(
                            stream_url, playlist_id, 'mp4', 'm3u8_native',
                            m3u8_id='hls-%s' % format_id, fatal=False)
                    else:
                        stream_formats = self._extract_mpd_formats(
                            stream_url, playlist_id,
                            mpd_id='dash-%s' % format_id, fatal=False)
                    # See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031
                    if format_id == 'audioDescription':
                        for f in stream_formats:
                            f['source_preference'] = -10
                    formats.extend(stream_formats)

                # Second (Safari) pass over a playlist of the same length:
                # only add the extra formats to the existing entry.
                if user_agent and len(entries) == playlist_len:
                    entries[num]['formats'].extend(formats)
                    continue

                item_id = item.get('id') or item['assetId']
                title = item['title']

                duration = float_or_none(item.get('duration'))
                thumbnail = item.get('previewImageUrl')

                subtitles = {}
                if item.get('type') == 'VOD':
                    subs = item.get('subtitles')
                    if subs:
                        subtitles = self.extract_subtitles(episode_id, subs)

                if playlist_len == 1:
                    final_title = playlist_title or title
                    if is_live:
                        final_title = self._live_title(final_title)
                elif playlist_title:
                    final_title = '%s (%s)' % (playlist_title, title)
                else:
                    # No og:title on the page: fall back to the item title
                    # instead of producing a literal "None (...)".
                    final_title = title

                entries.append({
                    'id': item_id,
                    'title': final_title,
                    'description': playlist_description if playlist_len == 1 else None,
                    'thumbnail': thumbnail,
                    'duration': duration,
                    'formats': formats,
                    'subtitles': subtitles,
                    'is_live': is_live,
                })

        for e in entries:
            self._sort_formats(e['formats'])

        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    def _get_subtitles(self, episode_id, subs):
        """Download the first subtitle track in ``subs`` and return it as SRT.

        Only ``subs[0]['url']`` is used; the result is filed under the 'cs'
        language key.
        """
        original_subtitles = self._download_webpage(
            subs[0]['url'], episode_id, 'Downloading subtitles')
        srt_subs = self._fix_subtitles(original_subtitles)
        return {
            'cs': [{
                'ext': 'srt',
                'data': srt_subs,
            }]
        }

    @staticmethod
    def _fix_subtitles(subtitles):
        """ Convert millisecond-based subtitles to SRT """

        def _msectotimecode(msec):
            """ Helper utility to convert milliseconds to timecode """
            components = []
            # Peel off milliseconds, seconds, minutes and finally hours
            # (the hours field wraps at 100).
            for divider in [1000, 60, 60, 100]:
                components.append(msec % divider)
                msec //= divider
            return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)

        def _fix_subtitle(subtitle):
            """ Yield SRT lines, rewriting "<index>; <start> <stop>" cue headers """
            for line in subtitle.splitlines():
                m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line)
                if m:
                    # Cue header: emit the index, then the SRT timing line.
                    yield m.group(1)
                    start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
                    yield '{0} --> {1}'.format(start, stop)
                else:
                    # Text (or blank) lines pass through unchanged.
                    yield line

        return '\r\n'.join(_fix_subtitle(subtitles))
239
240
class CeskaTelevizePoradyIE(InfoExtractor):
    """Extractor for ceskatelevize.cz ``/porady/`` pages.

    Locates the embedded player URL on the page and hands it over to
    CeskaTelevizeIE.
    """

    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
    _TESTS = [
        {
            # video with 18+ caution trailer
            'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
            'info_dict': {
                'id': '215562210900007-bogotart',
                'title': 'Queer: Bogotart',
                'description': 'Alternativní průvodce současným queer světem',
            },
            'playlist': [
                {
                    'info_dict': {
                        'id': '61924494876844842',
                        'ext': 'mp4',
                        'title': 'Queer: Bogotart (Varování 18+)',
                        'duration': 10.2,
                    },
                },
                {
                    'info_dict': {
                        'id': '61924494877068022',
                        'ext': 'mp4',
                        'title': 'Queer: Bogotart (Queer)',
                        'thumbnail': r're:^https?://.*\.jpg',
                        'duration': 1558.3,
                    },
                },
            ],
            'params': {
                # m3u8 download
                'skip_download': True,
            },
        },
        {
            # iframe embed
            'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Return a url_result pointing CeskaTelevizeIE at the embedded player."""
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        # The player URL is taken from a <span data-url=...> attribute or,
        # failing that, from an iFramePlayer.php <iframe src=...>.
        player_url_patterns = (
            r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
            r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1',
        )
        escaped_player_url = self._search_regex(
            player_url_patterns, page, 'iframe player url', group='url')

        # Undo HTML entity escaping before adding the autoStart parameter.
        player_url = update_url_query(
            unescapeHTML(escaped_player_url),
            query={'autoStart': 'true'})

        return self.url_result(player_url, ie=CeskaTelevizeIE.ie_key())