[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / tvnow.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import compat_str
8 from ..utils import (
9     ExtractorError,
10     int_or_none,
11     parse_iso8601,
12     parse_duration,
13     str_or_none,
14     update_url_query,
15     urljoin,
16 )
17
18
class TVNowBaseIE(InfoExtractor):
    """Shared helpers for extractors backed by the api.tvnow.de v3 API."""

    # Names of the metadata fields this extractor reads from a video record.
    _VIDEO_FIELDS = (
        'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort',
        'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode',
        'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear',
        'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo')

    def _call_api(self, path, video_id, query):
        """Download ``https://api.tvnow.de/v3/<path>`` and return the parsed JSON."""
        return self._download_json(
            'https://api.tvnow.de/v3/' + path, video_id, query=query)

    def _extract_video(self, info, display_id):
        """Build the info dict for a single video from its API metadata.

        ``info`` is indexed like a dict: ``'id'`` and ``'title'`` are required
        keys, everything else is read with ``.get()``. ``display_id`` is copied
        into the result unchanged.

        When no manifest yields any format, an ExtractorError is raised if the
        metadata flags the video as DRM protected or as not free, and
        ``raise_geo_restricted()`` is invoked if it is flagged as geoblocked.
        """
        video_id = compat_str(info['id'])
        title = info['title']

        # Bound before the loop so the name exists even when the API returns
        # no usable manifest at all (previously an UnboundLocalError).
        formats = []
        paths = []
        for manifest_url in (info.get('manifest') or {}).values():
            if not manifest_url:
                continue
            manifest_url = update_url_query(manifest_url, {'filter': ''})
            path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path')
            # Several manifest entries may point at the same .ism path;
            # process each distinct path only once.
            if path in paths:
                continue
            paths.append(path)

            def url_repl(proto, suffix):
                # Swap the protocol token (hls/dash/hss followed by '.' or
                # '-') and the file name after '.ism/' so that one manifest
                # URL can be rewritten into its sibling for another protocol.
                return re.sub(
                    r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
                        r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
                        '.ism/' + suffix, manifest_url))

            def make_urls(proto, suffix):
                # Return the rewritten URL plus, when it differs, a variant
                # with '/manifest/' replaced by '/ngvod/'.
                urls = [url_repl(proto, suffix)]
                hd_url = urls[0].replace('/manifest/', '/ngvod/')
                if hd_url != urls[0]:
                    urls.append(hd_url)
                return urls

            # Accumulate with extend() everywhere: plain assignment in the
            # DASH loop used to drop the formats of every URL but the last.
            for man_url in make_urls('dash', '.mpd'):
                formats.extend(self._extract_mpd_formats(
                    man_url, video_id, mpd_id='dash', fatal=False))
            for man_url in make_urls('hss', 'Manifest'):
                formats.extend(self._extract_ism_formats(
                    man_url, video_id, ism_id='mss', fatal=False))
            for man_url in make_urls('hls', '.m3u8'):
                formats.extend(self._extract_m3u8_formats(
                    man_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls',
                    fatal=False))
            if formats:
                break
        else:
            # Reached only when the loop never hit ``break``, i.e. no manifest
            # produced formats; report the most specific reason available.
            if info.get('isDrm'):
                raise ExtractorError(
                    'Video %s is DRM protected' % video_id, expected=True)
            if info.get('geoblocked'):
                raise self.raise_geo_restricted()
            if not info.get('free', True):
                raise ExtractorError(
                    'Video %s is not available for free' % video_id, expected=True)
        self._sort_formats(formats)

        description = info.get('articleLong') or info.get('articleShort')
        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
        duration = parse_duration(info.get('duration'))

        # ``or {}`` also covers an explicit null value, matching how
        # 'manifest' is read above.
        f = info.get('format') or {}

        thumbnails = [{
            'url': 'https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % video_id,
        }]
        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
        if thumbnail:
            thumbnails.append({
                'url': thumbnail,
            })

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'timestamp': timestamp,
            'duration': duration,
            'series': f.get('title'),
            'season_number': int_or_none(info.get('season')),
            'episode_number': int_or_none(info.get('episode')),
            'episode': title,
            'formats': formats,
        }
109
110
class TVNowIE(TVNowBaseIE):
    """Extractor for old-style tvnow.(de|at|ch)/<station>/<show>/<episode> URLs."""

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/
                        (?P<show_id>[^/]+)/
                        (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+)
                    '''

    _TESTS = [{
        'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player',
        'info_dict': {
            'id': '331082',
            'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3',
            'ext': 'mp4',
            'title': 'Der neue Porsche 911 GT 3',
            'description': 'md5:6143220c661f9b0aae73b245e5d898bb',
            'timestamp': 1495994400,
            'upload_date': '20170528',
            'duration': 5283,
            'series': 'GRIP - Das Motormagazin',
            'season_number': 14,
            'episode_number': 405,
            'episode': 'Der neue Porsche 911 GT 3',
        },
    }, {
        # rtl2
        'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player',
        'only_matching': True,
    }, {
        # rtlnitro
        'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player',
        'only_matching': True,
    }, {
        # superrtl
        'url': 'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player',
        'only_matching': True,
    }, {
        # ntv
        'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player',
        'only_matching': True,
    }, {
        # vox
        'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player',
        'only_matching': True,
    }, {
        # rtlplus
        'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player',
        'only_matching': True,
    }, {
        'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Claim a URL only if none of the other TVNow extractors accepts it."""
        other_extractors = (
            TVNowNewIE, TVNowSeasonIE, TVNowAnnualIE, TVNowShowIE)
        if any(ie.suitable(url) for ie in other_extractors):
            return False
        return super(TVNowIE, cls).suitable(url)

    def _real_extract(self, url):
        """Look the video up under 'movies/<show_id>/<id>' and extract it."""
        matched = re.match(self._VALID_URL, url)
        # The station segment is matched but not part of the API lookup key.
        show_id, episode_id = matched.group('show_id', 'id')
        display_id = '%s/%s' % (show_id, episode_id)

        requested_fields = ','.join(self._VIDEO_FIELDS)
        info = self._call_api(
            'movies/' + display_id, display_id,
            query={'fields': requested_fields})

        return self._extract_video(info, display_id)
179
180
class TVNowNewIE(InfoExtractor):
    """Redirect new-style /shows/ and /serien/ episode URLs to TVNowIE."""

    # The episode slug must stop at '#' (as TVNowIE's id group does) so that
    # a URL fragment containing '-<digits>' cannot be mistaken for the
    # trailing numeric id. The class previously excluded '$' instead.
    _VALID_URL = r'''(?x)
                    (?P<base_url>https?://
                        (?:www\.)?tvnow\.(?:de|at|ch)/
                        (?:shows|serien))/
                        (?P<show>[^/]+)-\d+/
                        [^/]+/
                        episode-\d+-(?P<episode>[^/?\#&]+)-(?P<id>\d+)
                    '''

    _TESTS = [{
        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_result pointing at the equivalent old-style URL."""
        mobj = re.match(self._VALID_URL, url)
        # Put '_' where the 'shows'/'serien' segment was; it ends up in the
        # position TVNowIE's pattern treats as the station segment.
        base_url = re.sub(r'(?:shows|serien)', '_', mobj.group('base_url'))
        show, episode = mobj.group('show', 'episode')
        return self.url_result(
            # Rewrite new URLs to the old format and use extraction via old API
            # at api.tvnow.de as a loophole for bypassing premium content checks
            '%s/%s/%s' % (base_url, show, episode),
            ie=TVNowIE.ie_key(), video_id=mobj.group('id'))
205
206
class TVNowNewBaseIE(InfoExtractor):
    """Base class for extractors that use the apigw.tvnow.de module API."""

    def _call_api(self, path, video_id, query=None):
        """Fetch ``https://apigw.tvnow.de/module/<path>`` and return the JSON.

        ``query`` is an optional dict of URL query parameters; omitting it is
        equivalent to passing an empty dict.

        Raises ExtractorError (expected) when the response has a truthy
        ``'error'`` entry.
        """
        # A None sentinel replaces the shared mutable ``{}`` default.
        result = self._download_json(
            'https://apigw.tvnow.de/module/' + path, video_id,
            query={} if query is None else query)
        error = result.get('error')
        if error:
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, error), expected=True)
        return result
216
217
218 r"""
219 TODO: new apigw.tvnow.de based version of TVNowIE. Replace old TVNowIE with it
220 when api.tvnow.de is shut down. This version can't bypass premium checks though.
221 class TVNowIE(TVNowNewBaseIE):
222     _VALID_URL = r'''(?x)
223                     https?://
224                         (?:www\.)?tvnow\.(?:de|at|ch)/
225                         (?:shows|serien)/[^/]+/
226                         (?:[^/]+/)+
227                         (?P<display_id>[^/?$&]+)-(?P<id>\d+)
228                     '''
229
230     _TESTS = [{
231         # episode with annual navigation
232         'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
233         'info_dict': {
234             'id': '331082',
235             'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3',
236             'ext': 'mp4',
237             'title': 'Der neue Porsche 911 GT 3',
238             'description': 'md5:6143220c661f9b0aae73b245e5d898bb',
239             'thumbnail': r're:^https?://.*\.jpg$',
240             'timestamp': 1495994400,
241             'upload_date': '20170528',
242             'duration': 5283,
243             'series': 'GRIP - Das Motormagazin',
244             'season_number': 14,
245             'episode_number': 405,
246             'episode': 'Der neue Porsche 911 GT 3',
247         },
248     }, {
249         # rtl2, episode with season navigation
250         'url': 'https://www.tvnow.de/shows/armes-deutschland-11471/staffel-3/episode-14-bernd-steht-seit-der-trennung-von-seiner-frau-allein-da-526124',
251         'only_matching': True,
252     }, {
253         # rtlnitro
254         'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13/episode-5-auf-eigene-faust-pilot-366822',
255         'only_matching': True,
256     }, {
257         # superrtl
258         'url': 'https://www.tvnow.de/shows/die-lustigsten-schlamassel-der-welt-1221/staffel-2/episode-14-u-a-ketchup-effekt-364120',
259         'only_matching': True,
260     }, {
261         # ntv
262         'url': 'https://www.tvnow.de/shows/startup-news-10674/staffel-2/episode-39-goetter-in-weiss-387630',
263         'only_matching': True,
264     }, {
265         # vox
266         'url': 'https://www.tvnow.de/shows/auto-mobil-174/2017-11/episode-46-neues-vom-automobilmarkt-2017-11-19-17-00-00-380072',
267         'only_matching': True,
268     }, {
269         'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
270         'only_matching': True,
271     }]
272
273     def _extract_video(self, info, url, display_id):
274         config = info['config']
275         source = config['source']
276
277         video_id = compat_str(info.get('id') or source['videoId'])
278         title = source['title'].strip()
279
280         paths = []
281         for manifest_url in (info.get('manifest') or {}).values():
282             if not manifest_url:
283                 continue
284             manifest_url = update_url_query(manifest_url, {'filter': ''})
285             path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path')
286             if path in paths:
287                 continue
288             paths.append(path)
289
290             def url_repl(proto, suffix):
291                 return re.sub(
292                     r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
293                         r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
294                         '.ism/' + suffix, manifest_url))
295
296             formats = self._extract_mpd_formats(
297                 url_repl('dash', '.mpd'), video_id,
298                 mpd_id='dash', fatal=False)
299             formats.extend(self._extract_ism_formats(
300                 url_repl('hss', 'Manifest'),
301                 video_id, ism_id='mss', fatal=False))
302             formats.extend(self._extract_m3u8_formats(
303                 url_repl('hls', '.m3u8'), video_id, 'mp4',
304                 'm3u8_native', m3u8_id='hls', fatal=False))
305             if formats:
306                 break
307         else:
308             if try_get(info, lambda x: x['rights']['isDrm']):
309                 raise ExtractorError(
310                     'Video %s is DRM protected' % video_id, expected=True)
311             if try_get(config, lambda x: x['boards']['geoBlocking']['block']):
312                 raise self.raise_geo_restricted()
313             if not info.get('free', True):
314                 raise ExtractorError(
315                     'Video %s is not available for free' % video_id, expected=True)
316         self._sort_formats(formats)
317
318         description = source.get('description')
319         thumbnail = url_or_none(source.get('poster'))
320         timestamp = unified_timestamp(source.get('previewStart'))
321         duration = parse_duration(source.get('length'))
322
323         series = source.get('format')
324         season_number = int_or_none(self._search_regex(
325             r'staffel-(\d+)', url, 'season number', default=None))
326         episode_number = int_or_none(self._search_regex(
327             r'episode-(\d+)', url, 'episode number', default=None))
328
329         return {
330             'id': video_id,
331             'display_id': display_id,
332             'title': title,
333             'description': description,
334             'thumbnail': thumbnail,
335             'timestamp': timestamp,
336             'duration': duration,
337             'series': series,
338             'season_number': season_number,
339             'episode_number': episode_number,
340             'episode': title,
341             'formats': formats,
342         }
343
344     def _real_extract(self, url):
345         display_id, video_id = re.match(self._VALID_URL, url).groups()
346         info = self._call_api('player/' + video_id, video_id)
347         return self._extract_video(info, video_id, display_id)
348 """
349
350
class TVNowListBaseIE(TVNowNewBaseIE):
    """Base class for the TVNow playlist extractors (season, month, show)."""

    # URL prefix shared by all list pages; subclasses append their own tail.
    _SHOW_VALID_URL = r'''(?x)
                    (?P<base_url>
                        https?://
                            (?:www\.)?tvnow\.(?:de|at|ch)/(?:shows|serien)/
                            [^/?#&]+-(?P<show_id>\d+)
                    )
                    '''

    @classmethod
    def suitable(cls, url):
        """Reject URLs that TVNowNewIE accepts; otherwise defer to the parent."""
        if TVNowNewIE.suitable(url):
            return False
        return super(TVNowListBaseIE, cls).suitable(url)

    def _extract_items(self, url, show_id, list_id, query):
        """Return a playlist of the episodes the API lists for one show.

        ``query`` is forwarded to the 'teaserrow/format/episode/<show_id>'
        call. Items that are not dicts, or that yield no URL after joining
        with ``url``, are skipped. The playlist id is '<show_id>/<list_id>'.
        """
        response = self._call_api(
            'teaserrow/format/episode/' + show_id, list_id, query=query)

        entries = []
        for item in response['items']:
            if not isinstance(item, dict):
                continue
            episode_url = urljoin(url, item.get('url'))
            if episode_url:
                entries.append(self.url_result(
                    episode_url,
                    ie=TVNowNewIE.ie_key(),
                    video_id=str_or_none(item.get('id') or item.get('videoId')),
                    video_title=item.get('subheadline') or item.get('text')))

        playlist_id = '%s/%s' % (show_id, list_id)
        return self.playlist_result(entries, playlist_id)
384
385
class TVNowSeasonIE(TVNowListBaseIE):
    """Playlist extractor for '<show>/staffel-<n>' season pages."""

    _VALID_URL = r'%s/staffel-(?P<id>\d+)' % TVNowListBaseIE._SHOW_VALID_URL
    _TESTS = [{
        'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13',
        'info_dict': {
            'id': '1815/13',
        },
        'playlist_mincount': 22,
    }]

    def _real_extract(self, url):
        """List the episodes of the season named in the URL."""
        matched = re.match(self._VALID_URL, url)
        show_id, season_id = matched.group('show_id', 'id')
        season_query = {'season': season_id}
        return self._extract_items(url, show_id, season_id, season_query)
400
401
class TVNowAnnualIE(TVNowListBaseIE):
    """Playlist extractor for '<show>/<YYYY>-<MM>' monthly pages."""

    _VALID_URL = r'%s/(?P<year>\d{4})-(?P<month>\d{2})' % TVNowListBaseIE._SHOW_VALID_URL
    _TESTS = [{
        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05',
        'info_dict': {
            'id': '1669/2017-05',
        },
        'playlist_mincount': 2,
    }]

    def _real_extract(self, url):
        """List the episodes of the year/month named in the URL."""
        matched = re.match(self._VALID_URL, url)
        show_id, year, month = matched.group('show_id', 'year', 'month')
        # The list id keeps the zero-padded text from the URL, while the API
        # query receives the values as integers.
        list_id = '%s-%s' % (year, month)
        month_query = {
            'year': int(year),
            'month': int(month),
        }
        return self._extract_items(url, show_id, list_id, month_query)
419
420
class TVNowShowIE(TVNowListBaseIE):
    """Playlist extractor for a whole show; yields season or month playlists."""

    _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL
    _TESTS = [{
        # annual navigationType
        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669',
        'info_dict': {
            'id': '1669',
        },
        'playlist_mincount': 73,
    }, {
        # season navigationType
        'url': 'https://www.tvnow.de/shows/armes-deutschland-11471',
        'info_dict': {
            'id': '11471',
        },
        'playlist_mincount': 3,
    }]

    @classmethod
    def suitable(cls, url):
        """Claim a URL only when no more specific TVNow extractor accepts it."""
        for more_specific in (TVNowNewIE, TVNowSeasonIE, TVNowAnnualIE):
            if more_specific.suitable(url):
                return False
        return super(TVNowShowIE, cls).suitable(url)

    def _real_extract(self, url):
        """Build a playlist with one entry per month or season of the show.

        The API's 'navigationType' decides the layout: 'annual' produces
        TVNowAnnualIE entries, 'season' produces TVNowSeasonIE entries, and
        any other value raises ExtractorError.
        """
        matched = re.match(self._VALID_URL, url)
        base_url, show_id = matched.group('base_url', 'show_id')

        result = self._call_api(
            'teaserrow/format/navigation/' + show_id, show_id)

        items = result['items']
        navigation = result.get('navigationType')
        if navigation != 'annual' and navigation != 'season':
            raise ExtractorError('Unknown navigationType')

        entries = []
        for item in items:
            if not isinstance(item, dict):
                continue

            if navigation == 'season':
                season_number = int_or_none(item.get('season'))
                if season_number is not None:
                    entries.append(self.url_result(
                        '%s/staffel-%d' % (base_url, season_number),
                        ie=TVNowSeasonIE.ie_key()))
                continue

            # 'annual' navigation: one entry per listed month of each year.
            year = int_or_none(item.get('year'))
            months = item.get('months')
            if year is None or not isinstance(months, list):
                continue
            for month_dict in months:
                if not (isinstance(month_dict, dict) and month_dict):
                    continue
                # Only the first key of each month dict is used as the
                # month number.
                month_number = int_or_none(next(iter(month_dict)))
                if month_number is None:
                    continue
                entries.append(self.url_result(
                    '%s/%04d-%02d' % (base_url, year, month_number),
                    ie=TVNowAnnualIE.ie_key()))

        return self.playlist_result(entries, show_id)
485
486         return self.playlist_result(entries, show_id)