[drtv] Fix extraction (closes #18989)
[youtube-dl] / youtube_dl / extractor / drtv.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 from .common import InfoExtractor
5 from ..utils import (
6     ExtractorError,
7     int_or_none,
8     float_or_none,
9     mimetype2ext,
10     parse_iso8601,
11     remove_end,
12     update_url_query,
13 )
14
15
class DRTVIE(InfoExtractor):
    """Extract on-demand media from dr.dk TV, news and radio pages.

    The web page is only used to detect removed programmes, to find the
    programme identifier and to read the Open Graph title/description.
    Formats, subtitles, duration and timestamp come from the mu-online
    programcard JSON API.
    """

    _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio/ondemand)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)'
    _GEO_BYPASS = False
    _GEO_COUNTRIES = ['DK']
    IE_NAME = 'drtv'
    _TESTS = [{
        'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
        'md5': '7ae17b4e18eb5d29212f424a7511c184',
        'info_dict': {
            'id': 'klassen-darlig-taber-10',
            'ext': 'mp4',
            'title': 'Klassen - Dårlig taber (10)',
            'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa',
            'timestamp': 1471991907,
            'upload_date': '20160823',
            'duration': 606.84,
        },
    }, {
        # embed
        'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang',
        'info_dict': {
            'id': 'christiania-pusher-street-ryddes-drdkrjpo',
            'ext': 'mp4',
            'title': 'LIVE Christianias rydning af Pusher Street er i gang',
            'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',
            'timestamp': 1472800279,
            'upload_date': '20160902',
            'duration': 131.4,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # with SignLanguage formats
        'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder',
        'info_dict': {
            'id': 'historien-om-danmark-stenalder',
            'ext': 'mp4',
            'title': 'Historien om Danmark: Stenalder (1)',
            'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
            'timestamp': 1490401996,
            'upload_date': '20170325',
            'duration': 3502.04,
            'formats': 'mincount:20',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the programme found at ``url``.

        Raises ExtractorError (expected) when the page states that the
        programme is no longer available, and reports a geo restriction
        when no formats were found and the asset is flagged as
        restricted to Denmark.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        if '>Programmet er ikke længere tilgængeligt' in webpage:
            raise ExtractorError(
                'Video %s is not available' % video_id, expected=True)

        # The slug from the URL is replaced by the identifier embedded in
        # the page; that identifier is what the programcard API is queried
        # with and what ends up as the resulting 'id'.
        video_id = self._search_regex(
            (r'data-(?:material-identifier|episode-slug)="([^"]+)"',
                r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'),
            webpage, 'video id')

        data = self._download_json(
            'https://www.dr.dk/mu-online/api/1.4/programcard/%s' % video_id,
            video_id, 'Downloading video JSON', query={'expanded': 'true'})

        # Open Graph metadata is preferred; the JSON is the fallback.
        title = remove_end(self._og_search_title(
            webpage, default=None), ' | TV | DR') or data['Title']
        description = self._og_search_description(
            webpage, default=None) or data.get('Description')

        timestamp = parse_iso8601(data.get('CreatedTime'))

        thumbnail = None
        duration = None

        restricted_to_denmark = False

        formats = []
        subtitles = {}

        # Maps the API's language names to language codes; unknown names
        # are used as they are.
        LANGS = {
            'Danish': 'da',
        }

        # A missing or malformed PrimaryAsset must not crash with a
        # KeyError/AttributeError: fall through so the geo-restriction
        # check and _sort_formats below still run.
        primary_asset = data.get('PrimaryAsset')
        assets = [primary_asset] if isinstance(primary_asset, dict) else []

        for asset in assets:
            kind = asset.get('Kind')
            if kind == 'Image':
                thumbnail = asset.get('Uri')
            elif kind in ('VideoResource', 'AudioResource'):
                duration = float_or_none(asset.get('DurationInMilliseconds'), 1000)
                restricted_to_denmark = asset.get('RestrictedToDenmark')
                asset_target = asset.get('Target')

                # 'Links' may be absent or null; treat both as "no links".
                links = asset.get('Links')
                if not isinstance(links, list):
                    links = []
                for link in links:
                    if not isinstance(link, dict):
                        continue
                    uri = link.get('Uri')
                    if not uri:
                        continue
                    target = link.get('Target')
                    format_id = target or ''
                    preference = None
                    # Spoken-subtitle and sign-language variants get a
                    # tagged format id and preference -1.
                    if asset_target in ('SpokenSubtitles', 'SignLanguage'):
                        preference = -1
                        format_id += '-%s' % asset_target
                    if target == 'HDS':
                        f4m_formats = self._extract_f4m_formats(
                            uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
                            video_id, preference, f4m_id=format_id, fatal=False)
                        if kind == 'AudioResource':
                            # Audio-only resource: mark every variant as
                            # having no video codec.
                            for f in f4m_formats:
                                f['vcodec'] = 'none'
                        formats.extend(f4m_formats)
                    elif target == 'HLS':
                        formats.extend(self._extract_m3u8_formats(
                            uri, video_id, 'mp4', entry_protocol='m3u8_native',
                            preference=preference, m3u8_id=format_id,
                            fatal=False))
                    else:
                        # Any other target is used as a direct URL.
                        bitrate = link.get('Bitrate')
                        if bitrate:
                            format_id += '-%s' % bitrate
                        formats.append({
                            'url': uri,
                            'format_id': format_id,
                            'tbr': int_or_none(bitrate),
                            'ext': link.get('FileFormat'),
                            'vcodec': 'none' if kind == 'AudioResource' else None,
                            'preference': preference,
                        })

                subtitles_list = asset.get('SubtitlesList')
                if isinstance(subtitles_list, list):
                    for subs in subtitles_list:
                        # Skip malformed entries and entries without a URL.
                        if not isinstance(subs, dict) or not subs.get('Uri'):
                            continue
                        lang = subs.get('Language') or 'da'
                        subtitles.setdefault(LANGS.get(lang, lang), []).append({
                            'url': subs['Uri'],
                            'ext': mimetype2ext(subs.get('MimeType')) or 'vtt'
                        })

        if not formats and restricted_to_denmark:
            self.raise_geo_restricted(
                'Unfortunately, DR is not allowed to show this program outside Denmark.',
                countries=self._GEO_COUNTRIES)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }
173
174
class DRTVLiveIE(InfoExtractor):
    """Extract live channel streams from dr.dk/tv/live/<channel> pages.

    Channel metadata and the list of streaming servers are read from the
    mu-online channel JSON API; every advertised HLS or HDS stream is
    expanded into formats.
    """

    IE_NAME = 'drtv:live'
    _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)'
    _GEO_COUNTRIES = ['DK']
    _TEST = {
        'url': 'https://www.dr.dk/tv/live/dr1',
        'info_dict': {
            'id': 'dr1',
            'ext': 'mp4',
            'title': 're:^DR1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _stream_formats(self, channel_id, link_type, server, stream_path):
        """Return the formats for one stream of a streaming server.

        Only the 'HLS' and 'HDS' link types yield formats; any other
        link type gives an empty list.
        """
        location = '%s/%s' % (server, stream_path)
        # Built for every link type, although only the HLS branch uses it.
        stream_url = update_url_query(location, {'b': ''})
        if link_type == 'HLS':
            return self._extract_m3u8_formats(
                stream_url, channel_id, 'mp4',
                m3u8_id=link_type, fatal=False, live=True)
        if link_type == 'HDS':
            manifest_url = update_url_query(location, {'hdcore': '3.7.0'})
            return self._extract_f4m_formats(
                manifest_url, channel_id, f4m_id=link_type, fatal=False)
        return []

    def _real_extract(self, url):
        """Return the live info dict for the channel named in ``url``."""
        channel_id = self._match_id(url)
        api_url = 'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id
        channel_data = self._download_json(api_url, channel_id)
        title = self._live_title(channel_data['Title'])

        formats = []
        for server_entry in channel_data.get('StreamingServers', []):
            server = server_entry.get('Server')
            if not server:
                # Nothing to build stream URLs from.
                continue
            link_type = server_entry.get('LinkType')
            for quality in server_entry.get('Qualities', []):
                for stream in quality.get('Streams', []):
                    stream_path = stream.get('Stream')
                    if stream_path:
                        formats.extend(self._stream_formats(
                            channel_id, link_type, server, stream_path))
        self._sort_formats(formats)

        info = {
            'id': channel_id,
            'title': title,
            'thumbnail': channel_data.get('PrimaryImageUri'),
            'formats': formats,
            'is_live': True,
        }
        return info