45fc745a3e9ece07ffa943ff4d38288a6cb943e9
[youtube-dl] / youtube_dl / extractor / orf.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import compat_str
8 from ..utils import (
9     clean_html,
10     determine_ext,
11     float_or_none,
12     HEADRequest,
13     int_or_none,
14     orderedSet,
15     remove_end,
16     str_or_none,
17     strip_jsonp,
18     unescapeHTML,
19     unified_strdate,
20     url_or_none,
21 )
22
23
class ORFTVthekIE(InfoExtractor):
    """Extractor for ORF TVthek pages (tvthek.orf.at).

    The page embeds a JSON document in the ``data-jsb`` attribute of its
    ``VideoPlaylist`` element.  Every item of ``playlist.videos`` that has
    both an id and a title becomes one entry of the returned playlist.
    """

    IE_NAME = 'orf:tvthek'
    IE_DESC = 'ORF TVthek'
    _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
        'playlist': [{
            'md5': '2942210346ed779588f428a92db88712',
            'info_dict': {
                'id': '8896777',
                'ext': 'mp4',
                'title': 'Aufgetischt: Mit der Steirischen Tafelrunde',
                'description': 'md5:c1272f0245537812d4e36419c207b67d',
                'duration': 2668,
                'upload_date': '20141208',
            },
        }],
        'skip': 'Blocked outside of Austria / Germany',
    }, {
        'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256',
        'info_dict': {
            'id': '7982259',
            'ext': 'mp4',
            'title': 'Best of Ingrid Thurnher',
            'upload_date': '20140527',
            'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
        },
        'params': {
            'skip_download': True,  # rtsp downloads
        },
        'skip': 'Blocked outside of Austria / Germany',
    }, {
        'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141',
        'only_matching': True,
    }, {
        'url': 'http://tvthek.orf.at/profile/Universum/35429',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist dict with one video entry per playlist item."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # The attribute value is HTML-escaped JSON, hence unescapeHTML.
        data_jsb = self._parse_json(
            self._search_regex(
                r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2',
                webpage, 'playlist', group='json'),
            playlist_id, transform_source=unescapeHTML)['playlist']['videos']

        entries = []
        for sd in data_jsb:
            video_id, title = sd.get('id'), sd.get('title')
            # id and title are mandatory; skip items lacking either one.
            if not video_id or not title:
                continue
            video_id = compat_str(video_id)

            formats = []
            # Tolerate a missing or null 'sources' value: the item then simply
            # ends up without formats instead of raising a bare KeyError.
            for fd in sd.get('sources') or []:
                src = url_or_none(fd.get('src'))
                if not src:
                    continue
                # Coerce to text so that a numeric value cannot break join().
                format_id = '-'.join(
                    compat_str(fd[key])
                    for key in ('delivery', 'quality', 'quality_string')
                    if fd.get(key))
                ext = determine_ext(src)
                if ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        src, video_id, 'mp4', m3u8_id=format_id, fatal=False))
                elif ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        src, video_id, f4m_id=format_id, fatal=False))
                else:
                    formats.append({
                        'format_id': format_id,
                        'url': src,
                        'protocol': fd.get('protocol'),
                    })

            # Check for geoblocking.
            # There is a property is_geoprotection, but that's always false
            geo_str = sd.get('geoprotection_string')
            if geo_str:
                # Probe the first progressive HTTP(S) .mp4 URL, if there is one.
                http_url = next(
                    (f['url'] for f in formats
                     if re.match(r'^https?://.*\.mp4$', f['url'])),
                    None)
                if http_url:
                    # fatal=False: a failed probe must not abort extraction.
                    self._request_webpage(
                        HEADRequest(http_url), video_id,
                        note='Testing for geoblocking',
                        errnote=((
                            'This video seems to be blocked outside of %s. '
                            'You may want to try the streaming-* formats.')
                            % geo_str),
                        fatal=False)

            self._check_formats(formats, video_id)
            self._sort_formats(formats)

            subtitles = {}
            # 'subtitles' may be present but null, so do not rely on the
            # .get() default alone.
            for sub in sd.get('subtitles') or []:
                sub_src = sub.get('src')
                if not sub_src:
                    continue
                # Fall back to Austrian German when the language is missing
                # or null.
                subtitles.setdefault(sub.get('lang') or 'de-AT', []).append({
                    'url': sub_src,
                })

            entries.append({
                '_type': 'video',
                'id': video_id,
                'title': title,
                'formats': formats,
                'subtitles': subtitles,
                'description': sd.get('description'),
                'duration': int_or_none(sd.get('duration_in_seconds')),
                'upload_date': unified_strdate(sd.get('created_date')),
                'thumbnail': sd.get('image_full_url'),
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': playlist_id,
        }
157
158
class ORFRadioIE(InfoExtractor):
    """Shared extraction logic for ORF radio player pages.

    Subclasses provide a ``_VALID_URL`` with the named groups ``station``,
    ``date`` and ``show``; the broadcast is looked up through the audioapi
    JSON endpoint and each of its streams becomes one playlist entry.
    """

    def _real_extract(self, url):
        """Return a playlist with one mp3 entry per stream of the broadcast."""
        station, show_date, show_id = re.match(
            self._VALID_URL, url).group('station', 'date', 'show')

        api_url = (
            'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s'
            % (station, show_id, show_date))
        data = self._download_json(api_url, show_id)

        streams = data['streams']
        # Broadcast-level values are identical for every stream.
        entry_title = str_or_none(data.get('title'))
        description = clean_html(data.get('subtitle'))

        entries = []
        for stream in streams:
            stream_id = str_or_none(stream.get('loopStreamId'))
            # A stream needs both an id and a (broadcast) title to be usable.
            if not (stream_id and entry_title):
                continue

            # Divide the API values by 1000 (milliseconds, presumably).
            started_at = int_or_none(stream.get('start'), scale=1000)
            ended_at = int_or_none(stream.get('end'), scale=1000)
            if ended_at and started_at:
                duration = ended_at - started_at
            else:
                duration = None

            entry = {
                'id': stream_id.replace('.mp3', ''),
                'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, stream_id),
                'title': entry_title,
                'description': description,
                'duration': duration,
                'timestamp': started_at,
                'ext': 'mp3',
                'series': data.get('programTitle'),
            }
            entries.append(entry)

        return {
            '_type': 'playlist',
            'id': show_id,
            'title': data.get('title'),
            'description': description,
            'entries': entries,
        }
199
200
class ORFFM4IE(ORFRadioIE):
    """FM4 player URLs; all extraction logic lives in ORFRadioIE."""

    IE_NAME = 'orf:fm4'
    IE_DESC = 'radio FM4'
    # Named groups consumed by ORFRadioIE: station, date and show.
    _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>4\w+)'

    _TEST = {
        'url': 'http://fm4.orf.at/player/20170107/4CC',
        'only_matching': True,
        'skip': 'Shows from ORF radios are only available for 7 days.',
        'md5': '2b0be47375432a7ef104453432a19212',
        'info_dict': {
            'id': '2017-01-07_2100_tl_54_7DaysSat18_31295',
            'ext': 'mp3',
            'title': 'Solid Steel Radioshow',
            'description': 'Die Mixshow von Coldcut und Ninja Tune.',
            'timestamp': 1483819257,
            'upload_date': '20170107',
            'duration': 3599,
        },
    }
221
222
class ORFOE1IE(ORFRadioIE):
    """Radio Österreich 1 player URLs; extraction is done by ORFRadioIE."""

    IE_NAME = 'orf:oe1'
    IE_DESC = 'Radio Österreich 1'
    # Named groups consumed by ORFRadioIE: station, date and show.
    _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'

    _TEST = {
        'url': 'http://oe1.orf.at/player/20170108/456544',
        'skip': 'Shows from ORF radios are only available for 7 days.',
        'md5': '34d8a6e67ea888293741c86a099b745b',
        'info_dict': {
            'id': '2017-01-08_0759_tl_51_7DaysSun6_256141',
            'ext': 'mp3',
            'title': 'Morgenjournal',
            'timestamp': 1483858796,
            'upload_date': '20170108',
            'duration': 609,
        },
    }
241
242
class ORFIPTVIE(InfoExtractor):
    """Extractor for single-video stories on iptv.orf.at.

    The story page yields the video id; the bits.orf.at file handler API
    describes the video, and its load balancer URL resolves to the actual
    stream URLs.
    """

    IE_NAME = 'orf:iptv'
    IE_DESC = 'iptv.ORF.at'
    _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'

    _TEST = {
        'url': 'http://iptv.orf.at/stories/2275236/',
        'md5': 'c8b22af4718a4b4af58342529453e3e5',
        'info_dict': {
            'id': '350612',
            'ext': 'flv',
            'title': 'Weitere Evakuierungen um Vulkan Calbuco',
            'description': 'md5:d689c959bdbcf04efeddedbf2299d633',
            'duration': 68.197,
            'thumbnail': r're:^https?://.*\.jpg$',
            'upload_date': '20150425',
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the video embedded in the story page."""
        story_id = self._match_id(url)

        # Always fetch the canonical story URL (the input may use '#/').
        webpage = self._download_webpage(
            'http://iptv.orf.at/stories/%s' % story_id, story_id)

        video_id = self._search_regex(
            r'data-video(?:id)?="(\d+)"', webpage, 'video id')

        data = self._download_json(
            'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
            video_id)[0]

        # Duration is optional metadata: a missing key must not abort the
        # extraction (float_or_none copes with None).
        duration = float_or_none(data.get('duration'), 1000)

        video = data['sources']['default']
        load_balancer_url = video['loadBalancerUrl']
        thumbnail = video.get('preview')

        # Technical properties attached to the direct RTMP format below.
        base_format = {
            'abr': int_or_none(video.get('audioBitrate')),
            'vbr': int_or_none(video.get('bitrate')),
            'fps': int_or_none(video.get('videoFps')),
            'width': int_or_none(video.get('videoWidth')),
            'height': int_or_none(video.get('videoHeight')),
        }

        # The load balancer answers with JSONP.
        rendition = self._download_json(
            load_balancer_url, video_id, transform_source=strip_jsonp)

        formats = []
        for format_id, format_url in rendition['redirect'].items():
            if format_id == 'rtmp':
                rtmp_format = base_format.copy()
                rtmp_format.update({
                    'url': format_url,
                    'format_id': format_id,
                })
                formats.append(rtmp_format)
                continue
            ext = determine_ext(format_url)
            # Manifest downloads are non-fatal (as in ORFTVthekIE) so that a
            # single broken manifest does not discard the remaining formats.
            if ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, f4m_id=format_id, fatal=False))
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', m3u8_id=format_id,
                    fatal=False))
            # Any other redirect kind is ignored.
        self._sort_formats(formats)

        title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at')
        description = self._og_search_description(webpage)
        upload_date = unified_strdate(self._html_search_meta(
            'dc.date', webpage, 'upload date'))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'formats': formats,
        }
330
331
class ORFFM4StoryIE(InfoExtractor):
    """Extractor for fm4.orf.at story pages, which may embed several videos.

    Every distinct ``data-video``/``data-videoid`` attribute on the page
    produces one playlist entry.  Title, description and upload date come
    from the page itself and are therefore shared by all entries.
    """

    IE_NAME = 'orf:fm4:story'
    IE_DESC = 'fm4.orf.at stories'
    _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)'

    _TEST = {
        'url': 'http://fm4.orf.at/stories/2865738/',
        'playlist': [{
            'md5': 'e1c2c706c45c7b34cf478bbf409907ca',
            'info_dict': {
                'id': '547792',
                'ext': 'flv',
                'title': 'Manu Delago und Inner Tongue live',
                'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
                'duration': 1748.52,
                'thumbnail': r're:^https?://.*\.jpg$',
                'upload_date': '20170913',
            },
        }, {
            'md5': 'c6dd2179731f86f4f55a7b49899d515f',
            'info_dict': {
                'id': '547798',
                'ext': 'flv',
                'title': 'Manu Delago und Inner Tongue live (2)',
                'duration': 1504.08,
                'thumbnail': r're:^https?://.*\.jpg$',
                'upload_date': '20170913',
                'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
            },
        }],
    }

    def _real_extract(self, url):
        """Return a playlist with one entry per video embedded in the story."""
        story_id = self._match_id(url)
        webpage = self._download_webpage(url, story_id)

        # De-duplicate while keeping the order of appearance on the page.
        all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage))
        if not all_ids:
            # No embedded videos: keep returning an empty playlist without
            # requiring page metadata.
            return self.playlist_result([])

        # Page-level metadata is the same for every video, so it is scraped
        # once here instead of once per video.
        page_title = remove_end(
            self._og_search_title(webpage), ' - fm4.ORF.at')
        description = self._og_search_description(webpage)
        upload_date = unified_strdate(self._html_search_meta(
            'dc.date', webpage, 'upload date'))

        entries = []
        for idx, video_id in enumerate(all_ids):
            data = self._download_json(
                'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
                video_id)[0]

            # Duration is optional metadata: a missing key must not abort the
            # extraction (float_or_none copes with None).
            duration = float_or_none(data.get('duration'), 1000)

            video = data['sources']['q8c']
            load_balancer_url = video['loadBalancerUrl']
            thumbnail = video.get('preview')

            # Technical properties attached to the direct RTMP format below.
            base_format = {
                'abr': int_or_none(video.get('audioBitrate')),
                'vbr': int_or_none(video.get('bitrate')),
                'fps': int_or_none(video.get('videoFps')),
                'width': int_or_none(video.get('videoWidth')),
                'height': int_or_none(video.get('videoHeight')),
            }

            # The load balancer answers with JSONP.
            rendition = self._download_json(
                load_balancer_url, video_id, transform_source=strip_jsonp)

            formats = []
            for format_id, format_url in rendition['redirect'].items():
                if format_id == 'rtmp':
                    rtmp_format = base_format.copy()
                    rtmp_format.update({
                        'url': format_url,
                        'format_id': format_id,
                    })
                    formats.append(rtmp_format)
                    continue
                ext = determine_ext(format_url)
                # Manifest downloads are non-fatal (as in ORFTVthekIE) so a
                # single broken manifest does not discard the other formats.
                if ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        format_url, video_id, f4m_id=format_id, fatal=False))
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, video_id, 'mp4', m3u8_id=format_id,
                        fatal=False))
                # Any other redirect kind is ignored.
            self._sort_formats(formats)

            title = page_title
            if idx >= 1:
                # Titles are duplicates, make them unique
                title = '%s (%d)' % (page_title, idx + 1)

            entries.append({
                'id': video_id,
                'title': title,
                'description': description,
                'duration': duration,
                'thumbnail': thumbnail,
                'upload_date': upload_date,
                'formats': formats,
            })

        return self.playlist_result(entries)