[orf:fm4] Fix extraction (#23599)
[youtube-dl] / youtube_dl / extractor / orf.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import compat_str
8 from ..utils import (
9     determine_ext,
10     float_or_none,
11     HEADRequest,
12     int_or_none,
13     orderedSet,
14     remove_end,
15     strip_jsonp,
16     unescapeHTML,
17     unified_strdate,
18     url_or_none,
19 )
20
21
class ORFTVthekIE(InfoExtractor):
    """Extractor for ORF TVthek pages.

    A page is returned as a playlist with one entry per video listed in the
    JSON blob embedded in the page's "VideoPlaylist" element.
    """

    IE_NAME = 'orf:tvthek'
    IE_DESC = 'ORF TVthek'
    _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
        'playlist': [{
            'md5': '2942210346ed779588f428a92db88712',
            'info_dict': {
                'id': '8896777',
                'ext': 'mp4',
                'title': 'Aufgetischt: Mit der Steirischen Tafelrunde',
                'description': 'md5:c1272f0245537812d4e36419c207b67d',
                'duration': 2668,
                'upload_date': '20141208',
            },
        }],
        'skip': 'Blocked outside of Austria / Germany',
    }, {
        'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256',
        'info_dict': {
            'id': '7982259',
            'ext': 'mp4',
            'title': 'Best of Ingrid Thurnher',
            'upload_date': '20140527',
            'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
        },
        'params': {
            'skip_download': True,  # rtsp downloads
        },
        'skip': 'Blocked outside of Austria / Germany',
    }, {
        'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141',
        'only_matching': True,
    }, {
        'url': 'http://tvthek.orf.at/profile/Universum/35429',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist dict with one video entry per playable item.

        Items of the embedded playlist that lack an id or a title are
        skipped.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # The playlist is an HTML-escaped JSON document stored in the
        # data-jsb attribute of the "VideoPlaylist" <div>.
        playlist_json = self._search_regex(
            r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2',
            webpage, 'playlist', group='json')
        videos = self._parse_json(
            playlist_json, playlist_id,
            transform_source=unescapeHTML)['playlist']['videos']

        entries = []
        for video in videos:
            raw_id = video.get('id')
            title = video.get('title')
            if not (raw_id and title):
                continue
            video_id = compat_str(raw_id)

            formats = []
            for source in video['sources']:
                src = url_or_none(source.get('src'))
                if not src:
                    continue
                # Join whichever of the descriptive fields are present.
                format_id = '-'.join([
                    source[key]
                    for key in ('delivery', 'quality', 'quality_string')
                    if source.get(key)])
                ext = determine_ext(src)
                if ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        src, video_id, 'mp4', m3u8_id=format_id, fatal=False))
                elif ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        src, video_id, f4m_id=format_id, fatal=False))
                else:
                    formats.append({
                        'format_id': format_id,
                        'url': src,
                        'protocol': source.get('protocol'),
                    })

            # Check for geoblocking.
            # There is a property is_geoprotection, but that's always false
            geo_str = video.get('geoprotection_string')
            if geo_str:
                # Probe the first progressive HTTP(S) mp4 URL, if any.
                http_url = next(
                    (f['url'] for f in formats
                     if re.match(r'^https?://.*\.mp4$', f['url'])),
                    None)
                if http_url is not None:
                    self._request_webpage(
                        HEADRequest(http_url), video_id,
                        note='Testing for geoblocking',
                        errnote=((
                            'This video seems to be blocked outside of %s. '
                            'You may want to try the streaming-* formats.')
                            % geo_str),
                        fatal=False)

            self._check_formats(formats, video_id)
            self._sort_formats(formats)

            subtitles = {}
            for sub in video.get('subtitles', []):
                sub_src = sub.get('src')
                if sub_src:
                    subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({
                        'url': sub_src,
                    })

            entries.append({
                '_type': 'video',
                'id': video_id,
                'title': title,
                'formats': formats,
                'subtitles': subtitles,
                'description': video.get('description'),
                'duration': int_or_none(video.get('duration_in_seconds')),
                'upload_date': unified_strdate(video.get('created_date')),
                'thumbnail': video.get('image_full_url'),
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': playlist_id,
        }
155
156
class ORFRadioIE(InfoExtractor):
    """Shared extraction logic for the ORF radio player extractors.

    Subclasses provide a ``_VALID_URL`` with the named groups ``station``,
    ``date`` and ``show``; these are used to query the audioapi.orf.at
    broadcast endpoint.
    """

    def _real_extract(self, url):
        """Return a playlist with one mp3 entry per stream of the broadcast.

        Streams without a ``loopStreamId`` are skipped since no URL can be
        built for them. ``timestamp`` and ``duration`` are whole seconds
        (the API values are divided by 1000) or None when the API does not
        provide usable ``start``/``end`` values.

        Raises KeyError if the API response has no ``title``.
        """
        mobj = re.match(self._VALID_URL, url)
        station = mobj.group('station')
        show_date = mobj.group('date')
        show_id = mobj.group('show')

        data = self._download_json(
            'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' % (station, show_id, show_date),
            show_id
        )

        title = data['title']
        # The subtitle only feeds the optional description, so its absence
        # must not abort extraction.
        description = data.get('subtitle')
        series = data.get('programTitle')

        entries = []
        for info in data.get('streams') or []:
            loop_stream_id = info.get('loopStreamId')
            if not loop_stream_id:
                continue

            start = int_or_none(info.get('start'))
            end = int_or_none(info.get('end'))
            # Explicit floor division: plain "/" yields ints on Python 2 but
            # floats on Python 3, making the metadata interpreter-dependent.
            timestamp = start // 1000 if start is not None else None
            duration = None
            if start is not None and end is not None:
                duration = (end - start) // 1000

            entries.append({
                'id': loop_stream_id.replace('.mp3', ''),
                'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, loop_stream_id),
                'title': title,
                'description': description,
                'duration': duration,
                'timestamp': timestamp,
                'ext': 'mp3',
                'series': series,
            })

        return {
            '_type': 'playlist',
            'id': show_id,
            'title': title,
            'description': description,
            'entries': entries
        }
190
191
class ORFFM4IE(ORFRadioIE):
    """Extractor for radio FM4 player URLs (fm4.orf.at/player/<date>/<show>)."""

    IE_NAME = 'orf:fm4'
    IE_DESC = 'radio FM4'

    # The station, date and show groups are read by the inherited
    # _real_extract.
    _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>4\w+)'

    _TEST = {
        'url': 'http://fm4.orf.at/player/20170107/4CC',
        'md5': '2b0be47375432a7ef104453432a19212',
        'info_dict': {
            'id': '2017-01-07_2100_tl_54_7DaysSat18_31295',
            'ext': 'mp3',
            'title': 'Solid Steel Radioshow',
            'description': 'Die Mixshow von Coldcut und Ninja Tune.',
            'duration': 3599,
            'timestamp': 1483819257,
            'upload_date': '20170107',
        },
        'skip': 'Shows from ORF radios are only available for 7 days.',
    }
211
212
class ORFOE1IE(ORFRadioIE):
    """Extractor for Radio Oesterreich 1 player URLs (oe1.orf.at/player/<date>/<show>)."""

    IE_NAME = 'orf:oe1'
    IE_DESC = 'Radio Österreich 1'

    # The station, date and show groups are read by the inherited
    # _real_extract.
    _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'

    _TEST = {
        'url': 'http://oe1.orf.at/player/20170108/456544',
        'md5': '34d8a6e67ea888293741c86a099b745b',
        'info_dict': {
            'id': '2017-01-08_0759_tl_51_7DaysSun6_256141',
            'ext': 'mp3',
            'title': 'Morgenjournal',
            'duration': 609,
            'timestamp': 1483858796,
            'upload_date': '20170108',
        },
        'skip': 'Shows from ORF radios are only available for 7 days.',
    }
231
232
class ORFIPTVIE(InfoExtractor):
    """Extractor for single-video story pages on iptv.orf.at."""

    IE_NAME = 'orf:iptv'
    IE_DESC = 'iptv.ORF.at'
    _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'

    _TEST = {
        'url': 'http://iptv.orf.at/stories/2275236/',
        'md5': 'c8b22af4718a4b4af58342529453e3e5',
        'info_dict': {
            'id': '350612',
            'ext': 'flv',
            'title': 'Weitere Evakuierungen um Vulkan Calbuco',
            'description': 'md5:d689c959bdbcf04efeddedbf2299d633',
            'duration': 68.197,
            'thumbnail': r're:^https?://.*\.jpg$',
            'upload_date': '20150425',
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the video embedded in the story page.

        The story page yields the video id, the bits.orf.at file API yields
        the metadata and a load balancer URL, and the load balancer response
        lists the actual stream URLs per format id.
        """
        story_id = self._match_id(url)

        story_url = 'http://iptv.orf.at/stories/%s' % story_id
        webpage = self._download_webpage(story_url, story_id)

        video_id = self._search_regex(
            r'data-video(?:id)?="(\d+)"', webpage, 'video id')

        api_url = 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id
        data = self._download_json(api_url, video_id)[0]

        # The API value is divided by 1000 to obtain the reported duration.
        duration = float_or_none(data['duration'], 1000)

        source = data['sources']['default']
        load_balancer_url = source['loadBalancerUrl']
        # Fields attached to the directly listed rtmp format below.
        common_fields = {
            'abr': int_or_none(source.get('audioBitrate')),
            'vbr': int_or_none(source.get('bitrate')),
            'fps': int_or_none(source.get('videoFps')),
            'width': int_or_none(source.get('videoWidth')),
            'height': int_or_none(source.get('videoHeight')),
        }
        thumbnail = source.get('preview')

        rendition = self._download_json(
            load_balancer_url, video_id, transform_source=strip_jsonp)

        formats = []
        for format_id, format_url in rendition['redirect'].items():
            if format_id == 'rtmp':
                rtmp_format = common_fields.copy()
                rtmp_format['url'] = format_url
                rtmp_format['format_id'] = format_id
                formats.append(rtmp_format)
                continue
            # Anything that is neither rtmp nor a known manifest is ignored.
            ext = determine_ext(format_url)
            if ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, f4m_id=format_id))
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', m3u8_id=format_id))
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': remove_end(
                self._og_search_title(webpage), ' - iptv.ORF.at'),
            'description': self._og_search_description(webpage),
            'duration': duration,
            'thumbnail': thumbnail,
            'upload_date': unified_strdate(self._html_search_meta(
                'dc.date', webpage, 'upload date')),
            'formats': formats,
        }
320
321
class ORFFM4StoryIE(InfoExtractor):
    """Extractor for fm4.orf.at story pages, which may embed several videos."""

    IE_NAME = 'orf:fm4:story'
    IE_DESC = 'fm4.orf.at stories'
    _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)'

    _TEST = {
        'url': 'http://fm4.orf.at/stories/2865738/',
        'playlist': [{
            'md5': 'e1c2c706c45c7b34cf478bbf409907ca',
            'info_dict': {
                'id': '547792',
                'ext': 'flv',
                'title': 'Manu Delago und Inner Tongue live',
                'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
                'duration': 1748.52,
                'thumbnail': r're:^https?://.*\.jpg$',
                'upload_date': '20170913',
            },
        }, {
            'md5': 'c6dd2179731f86f4f55a7b49899d515f',
            'info_dict': {
                'id': '547798',
                'ext': 'flv',
                'title': 'Manu Delago und Inner Tongue live (2)',
                'duration': 1504.08,
                'thumbnail': r're:^https?://.*\.jpg$',
                'upload_date': '20170913',
                'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
            },
        }],
    }

    def _real_extract(self, url):
        """Return a playlist with one entry per distinct video id on the page.

        All entries share the page's title, description and upload date; the
        second and later entries get a " (N)" suffix on the title.
        """
        story_id = self._match_id(url)
        webpage = self._download_webpage(url, story_id)

        # De-duplicate ids while keeping their order of appearance.
        video_ids = orderedSet(
            re.findall(r'data-video(?:id)?="(\d+)"', webpage))

        entries = []
        for position, video_id in enumerate(video_ids, 1):
            api_url = 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id
            data = self._download_json(api_url, video_id)[0]

            # The API value is divided by 1000 to obtain the reported
            # duration.
            duration = float_or_none(data['duration'], 1000)

            source = data['sources']['q8c']
            load_balancer_url = source['loadBalancerUrl']
            # Fields attached to the directly listed rtmp format below.
            common_fields = {
                'abr': int_or_none(source.get('audioBitrate')),
                'vbr': int_or_none(source.get('bitrate')),
                'fps': int_or_none(source.get('videoFps')),
                'width': int_or_none(source.get('videoWidth')),
                'height': int_or_none(source.get('videoHeight')),
            }
            thumbnail = source.get('preview')

            rendition = self._download_json(
                load_balancer_url, video_id, transform_source=strip_jsonp)

            formats = []
            for format_id, format_url in rendition['redirect'].items():
                if format_id == 'rtmp':
                    rtmp_format = common_fields.copy()
                    rtmp_format['url'] = format_url
                    rtmp_format['format_id'] = format_id
                    formats.append(rtmp_format)
                    continue
                # Anything that is neither rtmp nor a known manifest is
                # ignored.
                ext = determine_ext(format_url)
                if ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        format_url, video_id, f4m_id=format_id))
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, video_id, 'mp4', m3u8_id=format_id))
            self._sort_formats(formats)

            # Page metadata is looked up per entry on purpose: a story with
            # no videos never reaches these lookups.
            title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at')
            if position > 1:
                # Titles are duplicates, make them unique
                title = '%s (%d)' % (title, position)

            entries.append({
                'id': video_id,
                'title': title,
                'description': self._og_search_description(webpage),
                'duration': duration,
                'thumbnail': thumbnail,
                'upload_date': unified_strdate(self._html_search_meta(
                    'dc.date', webpage, 'upload date')),
                'formats': formats,
            })

        return self.playlist_result(entries)