[zdf] Fix extraction
[youtube-dl] / youtube_dl / extractor / zdf.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import functools
5 import re
6
7 from .common import InfoExtractor
8 from ..utils import (
9     OnDemandPagedList,
10     determine_ext,
11     parse_iso8601,
12     ExtractorError
13 )
14 from ..compat import compat_str
15
class ZDFIE(InfoExtractor):
    """Extractor for single video pages on www.zdf.de.

    The player-based method (ZDFExtractorPlayer) is tried first. If it
    raises ExtractorError or KeyError, a warning is reported and the
    mobile metadata method (ZDFExtractorMobile) is used instead.
    """

    _VALID_URL = r'https?://www\.zdf\.de/.*?/(?P<id>[^/?]*?)\.html'

    _TESTS = [{
        'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html',
        'info_dict': {
            'id': 'zdfmediathek-trailer-100',
            'ext': 'mp4',
            'title': 'Trailer ZDFmediathek Supermarkt',
        }
    }]

    def _real_extract(self, url):
        """Return the info dict for ``url``.

        The value returned is whatever the chosen helper's
        ``_real_extract`` produces.
        """
        video_id = self._match_id(url)
        try:
            info = ZDFExtractorPlayer(self, url, video_id)._real_extract()
        except (ExtractorError, KeyError) as err:
            # Tell the user why the primary method was abandoned before
            # switching to the fallback.
            warning = '%s: %s\nusing fallback method (mobile url)' % (
                type(err).__name__, compat_str(err))
            self._downloader.report_warning(warning)
            info = ZDFExtractorMobile(self, url, video_id)._real_extract()
        return info
38
class ZDFExtractor:
    """Common base of the two extraction methods.

    Subclasses provide ``_fetch_entries`` plus the ``_get_*`` accessors;
    ``_real_extract`` here combines their results into one info dict.
    """

    def __init__(self, parent, url, video_id):
        """Remember the owning extractor, the page URL and the video id."""
        self.parent, self.url, self.video_id = parent, url, video_id

    def _real_extract(self):
        """Build the format list from the fetched entries and return the info dict."""
        collected = []
        for item in self._fetch_entries():
            media_url = self._get_video_url(item)
            if not media_url:
                continue
            fmt_id = self._get_format_id(item)
            kind = determine_ext(media_url, None)
            if kind == 'meta':
                continue
            if kind == 'm3u8':
                # Manifest formats are expanded by the owning extractor.
                hls_formats = self.parent._extract_m3u8_formats(
                    media_url, self.video_id, 'mp4', m3u8_id=fmt_id, fatal=False)
                collected.extend(hls_formats)
            elif kind == 'f4m':
                hds_formats = self.parent._extract_f4m_formats(
                    media_url, self.video_id, f4m_id=fmt_id, fatal=False)
                collected.extend(hds_formats)
            else:
                # Anything else is used as a direct URL.
                collected.append({
                    'format_id': fmt_id,
                    'url': media_url,
                    'format_note': self._get_format_note(item)
                })
        self.parent._sort_formats(collected)

        info = {'id': self.video_id}
        info['title'] = self._get_title()
        info['formats'] = collected
        info['subtitles'] = self._get_subtitles()
        info['thumbnail'] = self._get_thumbnail()
        info['description'] = self._get_description()
        info['timestamp'] = self._get_timestamp()
        return info
79
class ZDFExtractorMobile(ZDFExtractor):
    """Simple URL extraction method. Disadvantage: fewer formats, no subtitles.

    All values are read from a single JSON document downloaded in
    ``_fetch_entries`` and kept in ``self.meta_data``; the ``_get_*``
    accessors must therefore only be called after ``_fetch_entries``.
    The constructor is inherited unchanged from ZDFExtractor.
    """

    def _fetch_entries(self):
        """Download the metadata document and return its 'formitaeten' entries.

        Raises KeyError when the document lacks 'document' or
        'formitaeten'.
        """
        meta_data_url = 'https://zdf-cdn.live.cellular.de/mediathekV2/document/' + self.video_id
        self.meta_data = self.parent._download_json(meta_data_url, self.video_id, note='Downloading meta data')
        return self.meta_data['document']['formitaeten']

    def _get_title(self):
        """Return the 'titel' field of the document."""
        return self.meta_data['document']['titel']

    def _get_video_url(self, entry):
        """Return the entry's URL, or None when the entry has none.

        The caller skips entries with a falsy URL, so a missing key is
        reported as None instead of raising KeyError.
        """
        return entry.get('url')

    def _get_format_id(self, entry):
        """Return '<type>' or '<type>-<quality>' when a quality is present."""
        format_id = entry['type']
        if 'quality' in entry:
            format_id += '-' + entry['quality']
        return format_id

    def _get_format_note(self, entry):
        """This method provides no format note."""
        return None

    def _get_subtitles(self):
        """This method provides no subtitles."""
        return None

    def _get_description(self):
        """Return the optional 'beschreibung' field of the document."""
        return self.meta_data['document'].get('beschreibung')

    def _get_timestamp(self):
        """Return the parsed 'editorialDate' from the 'meta' section, or None.

        The timestamp is optional, so a missing or empty 'meta' section
        yields None rather than a KeyError.
        """
        meta = self.meta_data.get('meta')
        if not meta:
            return None
        return parse_iso8601(meta.get('editorialDate'))

    def _get_thumbnail(self):
        """Return the URL of the teaser image with the largest key, or None.

        NOTE(review): assumes the keys of 'teaserBild' are integer
        strings (they are compared with ``int``) - confirm against the API.
        """
        teaser_images = self.meta_data['document'].get('teaserBild')
        if not teaser_images:
            return None
        max_res = max(teaser_images, key=int)
        return teaser_images[max_res].get('url')
121
class ZDFExtractorPlayer(ZDFExtractor):
    """Extraction method that requires downloads of several pages.

    Follows the requests of the website. ``_fetch_entries`` stores the
    downloaded JSON in ``self.content_json`` and ``self.meta_data``; the
    ``_get_*`` accessors read from those attributes and must only be
    called once the entries have been iterated. The constructor is
    inherited unchanged from ZDFExtractor.
    """

    def _fetch_entries(self):
        """Yield ``(formitaet, quality)`` tuples from the stream metadata.

        This is a generator, so no download happens until it is first
        iterated. Missing mandatory JSON keys raise KeyError.
        """
        webpage = self.parent._download_webpage(self.url, self.video_id)

        jsb = self.parent._search_regex(r"data-zdfplayer-jsb='([^']*)'", webpage, 'zdfplayer jsb data')
        jsb_json = self.parent._parse_json(jsb, self.video_id)

        configuration_url = 'https://www.zdf.de' + jsb_json['config']
        configuration_json = self.parent._download_json(configuration_url, self.video_id, note='Downloading player configuration')
        api_token = configuration_json['apiToken']

        # The player script is optional (fatal=False); it is only used to
        # look up the player id for the metadata URL template.
        player_js = self.parent._download_webpage('https://www.zdf.de/ZDFplayer/latest-v2/skins/zdf/zdf-player.js', self.video_id, fatal=False, note='Downloading player script')
        if player_js:
            player_id = self.parent._search_regex(r'this\.ptmd_player_id="([^"]*)"', player_js, 'player id', fatal=False)
        else:
            player_id = None

        self.content_json = self.parent._download_json(jsb_json['content'], self.video_id, headers={'Api-Auth': 'Bearer %s' % api_token}, note='Downloading content description')

        main_video_content = self.content_json['mainVideoContent']['http://zdf.de/rels/target']
        meta_data_url = None
        if not player_id:
            # could not determine player_id => try alternative generic URL
            meta_data_url = main_video_content.get('http://zdf.de/rels/streams/ptmd')
            if meta_data_url:
                meta_data_url = 'https://api.zdf.de' + meta_data_url
            else:
                # no generic URL found => 2nd fallback: hardcoded player_id
                player_id = 'ngplayer_2_3'
        if not meta_data_url:
            meta_data_url_template = main_video_content['http://zdf.de/rels/streams/ptmd-template']
            meta_data_url = 'https://api.zdf.de' + meta_data_url_template.replace('{playerId}', player_id)

        self.meta_data = self.parent._download_json(meta_data_url, self.video_id, note='Downloading meta data')

        for p_list_entry in self.meta_data['priorityList']:
            for formitaet in p_list_entry['formitaeten']:
                for entry in formitaet['qualities']:
                    yield (formitaet, entry)

    def _get_title(self):
        """Return the 'title' field of the content description."""
        return self.content_json['title']

    def _get_video_url(self, entry_tuple):
        """Return the URI of the first audio track, or None without tracks."""
        (formitaet, entry) = entry_tuple
        tracks = entry['audio'].get('tracks')
        if not tracks:
            return None
        if len(tracks) > 1:
            # This helper has no _downloader of its own; warnings go
            # through the owning extractor.
            self.parent._downloader.report_warning('unexpected input: multiple tracks')
        return tracks[0]['uri']

    def _get_format_id(self, entry_tuple):
        """Return '<type>-[<flags>-]<quality>'.

        The flags are 'a' (adaptive), 'b' (restriction_useragent) and
        'p' (progressive), in that order, for each facet that is present.
        """
        (formitaet, entry) = entry_tuple
        facets = self._get_facets(formitaet)
        add = ''
        if 'adaptive' in facets:
            add += 'a'
        if 'restriction_useragent' in facets:
            add += 'b'
        if 'progressive' in facets:
            add += 'p'
        format_id = formitaet['type'] + '-'
        if add:
            format_id += add + '-'
        # named qualities are not very useful for sorting the formats:
        # a 'high' m3u8 entry can be better quality than a 'veryhigh' direct mp4 download
        format_id += entry['quality']
        return format_id

    def _get_facets(self, formitaet):
        """Return a new list of facets, with 'adaptive' added for adaptive streams.

        The list is copied so the metadata dict is never modified; this
        method is called several times per formitaet, and appending to
        the stored list would add another 'adaptive' on every call.
        """
        facets = list(formitaet.get('facets') or [])
        if formitaet.get('isAdaptive') and 'adaptive' not in facets:
            facets.append('adaptive')
        return facets

    def _get_format_note(self, entry_tuple):
        """Return the facets joined with ', '."""
        (formitaet, entry) = entry_tuple
        return ', '.join(self._get_facets(formitaet))

    def _get_subtitles(self):
        """Return a dict mapping language code to a list of subtitle formats.

        Captions without a language or without a URI are skipped; the
        language 'deu' is reported as 'de'.
        """
        subtitles = {}
        for caption in self.meta_data.get('captions') or []:
            lang = caption.get('language')
            caption_url = caption.get('uri')
            if not lang or not caption_url:
                continue
            if lang == 'deu':
                lang = 'de'
            subformat = {'url': caption_url}
            caption_format = caption.get('format')
            if caption_format == 'webvtt':
                subformat['ext'] = 'vtt'
            elif caption_format == 'ebu-tt-d-basic-de':
                subformat['ext'] = 'ttml'
            subtitles.setdefault(lang, []).append(subformat)
        return subtitles

    def _get_description(self):
        """Return the optional 'teasertext' field of the content description."""
        return self.content_json.get('teasertext')

    def _get_timestamp(self):
        """Return the parsed 'editorialDate' of the content description."""
        return parse_iso8601(self.content_json.get('editorialDate'))

    def _get_thumbnail(self):
        """Return the teaser image: the 'original' layout, else the widest one.

        Layout keys of the form '<width>x<height>' are compared by width.
        Returns None when no usable layout is found.
        """
        teaser_images = self.content_json.get('teaserImageRef')
        if not teaser_images:
            return None
        teaser_images_layouts = teaser_images.get('layouts')
        if not teaser_images_layouts:
            return None
        if 'original' in teaser_images_layouts:
            return teaser_images_layouts['original']
        teasers = {}
        for key in teaser_images_layouts:
            width = self.parent._search_regex(r'(\d+)x\d+', key, 'teaser width', fatal=False)
            if width:
                teasers[int(width)] = teaser_images_layouts[key]
        if not teasers:
            return None
        return teasers[max(teasers)]
251
class ZDFChannelIE(InfoExtractor):
    """Playlist extractor for ZDFmediathek channel overviews.

    Marked with ``_WORKING = False``.
    """

    _WORKING = False
    _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/(?:[^/]+/)?)(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic',
        'info_dict': {
            'id': '1586442',
        },
        'playlist_count': 3,
    }, {
        'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/aktuellste/332',
        'only_matching': True,
    }, {
        'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/meist-gesehen/332',
        'only_matching': True,
    }, {
        'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/_/1798716?bc=nrt;nrm?flash=off',
        'only_matching': True,
    }]
    _PAGE_SIZE = 50

    def _fetch_page(self, channel_id, page):
        """Yield one url-type entry per 'video' or 'topic' teaser on ``page``.

        Pages are zero-based: the request offset is
        ``page * _PAGE_SIZE``.
        """
        page_size = self._PAGE_SIZE
        xml_url = (
            'http://www.zdf.de/ZDFmediathek/xmlservice/web/aktuellste?ak=web&offset=%d&maxLength=%d&id=%s'
            % (page * page_size, page_size, channel_id))
        doc = self._download_xml(
            xml_url, channel_id,
            note='Downloading channel info',
            errnote='Failed to download channel info')

        # The playlist title and description are attached to every entry.
        playlist_title = doc.find('.//information/title').text
        playlist_description = doc.find('.//information/detail').text
        for teaser in doc.findall('.//teasers/teaser'):
            kind = teaser.find('./type').text
            asset_id = teaser.find('./details/assetId').text
            if kind in ('video', 'topic'):
                yield {
                    '_type': 'url',
                    'playlist_title': playlist_title,
                    'playlist_description': playlist_description,
                    'url': 'zdf:%s:%s' % (kind, asset_id),
                }

    def _real_extract(self, url):
        """Return a playlist dict whose entries are fetched page by page on demand."""
        channel_id = self._match_id(url)
        page_fetcher = functools.partial(self._fetch_page, channel_id)
        return {
            '_type': 'playlist',
            'id': channel_id,
            'entries': OnDemandPagedList(page_fetcher, self._PAGE_SIZE),
        }