[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / tvnow.py
index e2169f2bce30a3bc42fcb422bd6978dda0175f3b..9c8a8a0dc3944bdf616d7edfde0c4aaa1a9890ef 100644 (file)
@@ -7,48 +7,74 @@ from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
     ExtractorError,
+    int_or_none,
     parse_iso8601,
     parse_duration,
+    str_or_none,
     update_url_query,
+    urljoin,
 )
 
 
 class TVNowBaseIE(InfoExtractor):
     _VIDEO_FIELDS = (
         'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort',
-        'broadcastStartDate', 'isDrm', 'duration', 'manifest.dashclear',
-        'format.defaultImage169Format', 'format.defaultImage169Logo')
+        'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode',
+        'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear',
+        'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo')
 
     def _call_api(self, path, video_id, query):
         return self._download_json(
-            'https://api.tvnow.de/v3/' + path,
-            video_id, query=query)
+            'https://api.tvnow.de/v3/' + path, video_id, query=query)
 
     def _extract_video(self, info, display_id):
         video_id = compat_str(info['id'])
         title = info['title']
 
-        mpd_url = info['manifest']['dashclear']
-        if not mpd_url:
+        paths = []
+        for manifest_url in (info.get('manifest') or {}).values():
+            if not manifest_url:
+                continue
+            manifest_url = update_url_query(manifest_url, {'filter': ''})
+            path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path')
+            if path in paths:
+                continue
+            paths.append(path)
+
+            def url_repl(proto, suffix):
+                return re.sub(
+                    r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
+                        r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
+                        '.ism/' + suffix, manifest_url))
+
+            def make_urls(proto, suffix):
+                urls = [url_repl(proto, suffix)]
+                hd_url = urls[0].replace('/manifest/', '/ngvod/')
+                if hd_url != urls[0]:
+                    urls.append(hd_url)
+                return urls
+
+            for man_url in make_urls('dash', '.mpd'):
+                formats = self._extract_mpd_formats(
+                    man_url, video_id, mpd_id='dash', fatal=False)
+            for man_url in make_urls('hss', 'Manifest'):
+                formats.extend(self._extract_ism_formats(
+                    man_url, video_id, ism_id='mss', fatal=False))
+            for man_url in make_urls('hls', '.m3u8'):
+                formats.extend(self._extract_m3u8_formats(
+                    man_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls',
+                    fatal=False))
+            if formats:
+                break
+        else:
             if info.get('isDrm'):
                 raise ExtractorError(
                     'Video %s is DRM protected' % video_id, expected=True)
             if info.get('geoblocked'):
-                raise ExtractorError(
-                    'Video %s is not available from your location due to geo restriction' % video_id,
-                    expected=True)
+                raise self.raise_geo_restricted()
             if not info.get('free', True):
                 raise ExtractorError(
                     'Video %s is not available for free' % video_id, expected=True)
-
-        mpd_url = update_url_query(mpd_url, {'filter': ''})
-        formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False)
-        formats.extend(self._extract_ism_formats(
-            mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'),
-            video_id, ism_id='mss', fatal=False))
-        formats.extend(self._extract_m3u8_formats(
-            mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'),
-            video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
         self._sort_formats(formats)
 
         description = info.get('articleLong') or info.get('articleShort')
@@ -56,65 +82,93 @@ class TVNowBaseIE(InfoExtractor):
         duration = parse_duration(info.get('duration'))
 
         f = info.get('format', {})
+
+        thumbnails = [{
+            'url': 'https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % video_id,
+        }]
         thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+        if thumbnail:
+            thumbnails.append({
+                'url': thumbnail,
+            })
 
         return {
             'id': video_id,
             'display_id': display_id,
             'title': title,
             'description': description,
-            'thumbnail': thumbnail,
+            'thumbnails': thumbnails,
             'timestamp': timestamp,
             'duration': duration,
+            'series': f.get('title'),
+            'season_number': int_or_none(info.get('season')),
+            'episode_number': int_or_none(info.get('episode')),
+            'episode': title,
             'formats': formats,
         }
 
 
 class TVNowIE(TVNowBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P<id>[^/]+)/(?:player|preview)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/
+                        (?P<show_id>[^/]+)/
+                        (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+)
+                    '''
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) or TVNowShowIE.suitable(url)
+                else super(TVNowIE, cls).suitable(url))
 
     _TESTS = [{
-        # rtl
-        'url': 'https://www.tvnow.de/rtl/alarm-fuer-cobra-11/freier-fall/player?return=/rtl',
+        'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player',
         'info_dict': {
-            'id': '385314',
-            'display_id': 'alarm-fuer-cobra-11/freier-fall',
+            'id': '331082',
+            'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3',
             'ext': 'mp4',
-            'title': 'Freier Fall',
-            'description': 'md5:8c2d8f727261adf7e0dc18366124ca02',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'timestamp': 1512677700,
-            'upload_date': '20171207',
-            'duration': 2862.0,
+            'title': 'Der neue Porsche 911 GT 3',
+            'description': 'md5:6143220c661f9b0aae73b245e5d898bb',
+            'timestamp': 1495994400,
+            'upload_date': '20170528',
+            'duration': 5283,
+            'series': 'GRIP - Das Motormagazin',
+            'season_number': 14,
+            'episode_number': 405,
+            'episode': 'Der neue Porsche 911 GT 3',
         },
     }, {
         # rtl2
         'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player',
-        'only_matching': 'True',
+        'only_matching': True,
     }, {
         # rtlnitro
         'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player',
-        'only_matching': 'True',
+        'only_matching': True,
     }, {
         # superrtl
         'url': 'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player',
-        'only_matching': 'True',
+        'only_matching': True,
     }, {
         # ntv
         'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player',
-        'only_matching': 'True',
+        'only_matching': True,
     }, {
         # vox
         'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player',
-        'only_matching': 'True',
+        'only_matching': True,
     }, {
         # rtlplus
         'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player',
-        'only_matching': 'True',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        display_id = '%s/%s' % re.match(self._VALID_URL, url).groups()
+        mobj = re.match(self._VALID_URL, url)
+        display_id = '%s/%s' % mobj.group(2, 3)
 
         info = self._call_api(
             'movies/' + display_id, display_id, query={
@@ -124,52 +178,309 @@ class TVNowIE(TVNowBaseIE):
         return self._extract_video(info, display_id)
 
 
-class TVNowListIE(TVNowBaseIE):
-    _VALID_URL = r'(?P<base_url>https?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/)list/(?P<id>[^?/#&]+)$'
+class TVNowNewIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    (?P<base_url>https?://
+                        (?:www\.)?tvnow\.(?:de|at|ch)/
+                        (?:shows|serien))/
+                        (?P<show>[^/]+)-\d+/
+                        [^/]+/
+                        episode-\d+-(?P<episode>[^/?\#&]+)-(?P<id>\d+)
+                    '''
+
+    _TESTS = [{
+        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        base_url = re.sub(r'(?:shows|serien)', '_', mobj.group('base_url'))  # '_' fills the station path segment of the old-style URL
+        show, episode = mobj.group('show', 'episode')
+        return self.url_result(
+            # Rewrite new URLs to the old format and use extraction via old API
+            # at api.tvnow.de as a loophole for bypassing premium content checks
+            '%s/%s/%s' % (base_url, show, episode),
+            ie=TVNowIE.ie_key(), video_id=mobj.group('id'))
+
+
+class TVNowNewBaseIE(InfoExtractor):
+    def _call_api(self, path, video_id, query=None):  # None instead of {}: no mutable default shared between calls
+        result = self._download_json(
+            'https://apigw.tvnow.de/module/' + path, video_id, query=query or {})
+        error = result.get('error')
+        if error:  # the API reports failures in-band through an 'error' field
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error), expected=True)
+        return result
+
+
+r"""
+TODO: new apigw.tvnow.de based version of TVNowIE. Replace old TVNowIE with it
+when api.tvnow.de is shut down. This version can't bypass premium checks though.
+class TVNowIE(TVNowNewBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?tvnow\.(?:de|at|ch)/
+                        (?:shows|serien)/[^/]+/
+                        (?:[^/]+/)+
+                        (?P<display_id>[^/?$&]+)-(?P<id>\d+)
+                    '''
+
+    _TESTS = [{
+        # episode with annual navigation
+        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
+        'info_dict': {
+            'id': '331082',
+            'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3',
+            'ext': 'mp4',
+            'title': 'Der neue Porsche 911 GT 3',
+            'description': 'md5:6143220c661f9b0aae73b245e5d898bb',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1495994400,
+            'upload_date': '20170528',
+            'duration': 5283,
+            'series': 'GRIP - Das Motormagazin',
+            'season_number': 14,
+            'episode_number': 405,
+            'episode': 'Der neue Porsche 911 GT 3',
+        },
+    }, {
+        # rtl2, episode with season navigation
+        'url': 'https://www.tvnow.de/shows/armes-deutschland-11471/staffel-3/episode-14-bernd-steht-seit-der-trennung-von-seiner-frau-allein-da-526124',
+        'only_matching': True,
+    }, {
+        # rtlnitro
+        'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13/episode-5-auf-eigene-faust-pilot-366822',
+        'only_matching': True,
+    }, {
+        # superrtl
+        'url': 'https://www.tvnow.de/shows/die-lustigsten-schlamassel-der-welt-1221/staffel-2/episode-14-u-a-ketchup-effekt-364120',
+        'only_matching': True,
+    }, {
+        # ntv
+        'url': 'https://www.tvnow.de/shows/startup-news-10674/staffel-2/episode-39-goetter-in-weiss-387630',
+        'only_matching': True,
+    }, {
+        # vox
+        'url': 'https://www.tvnow.de/shows/auto-mobil-174/2017-11/episode-46-neues-vom-automobilmarkt-2017-11-19-17-00-00-380072',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
+        'only_matching': True,
+    }]
+
+    def _extract_video(self, info, url, display_id):
+        config = info['config']
+        source = config['source']
+
+        video_id = compat_str(info.get('id') or source['videoId'])
+        title = source['title'].strip()
+
+        paths = []
+        for manifest_url in (info.get('manifest') or {}).values():
+            if not manifest_url:
+                continue
+            manifest_url = update_url_query(manifest_url, {'filter': ''})
+            path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path')
+            if path in paths:
+                continue
+            paths.append(path)
+
+            def url_repl(proto, suffix):
+                return re.sub(
+                    r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
+                        r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
+                        '.ism/' + suffix, manifest_url))
+
+            formats = self._extract_mpd_formats(
+                url_repl('dash', '.mpd'), video_id,
+                mpd_id='dash', fatal=False)
+            formats.extend(self._extract_ism_formats(
+                url_repl('hss', 'Manifest'),
+                video_id, ism_id='mss', fatal=False))
+            formats.extend(self._extract_m3u8_formats(
+                url_repl('hls', '.m3u8'), video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False))
+            if formats:
+                break
+        else:
+            if try_get(info, lambda x: x['rights']['isDrm']):
+                raise ExtractorError(
+                    'Video %s is DRM protected' % video_id, expected=True)
+            if try_get(config, lambda x: x['boards']['geoBlocking']['block']):
+                raise self.raise_geo_restricted()
+            if not info.get('free', True):
+                raise ExtractorError(
+                    'Video %s is not available for free' % video_id, expected=True)
+        self._sort_formats(formats)
+
+        description = source.get('description')
+        thumbnail = url_or_none(source.get('poster'))
+        timestamp = unified_timestamp(source.get('previewStart'))
+        duration = parse_duration(source.get('length'))
+
+        series = source.get('format')
+        season_number = int_or_none(self._search_regex(
+            r'staffel-(\d+)', url, 'season number', default=None))
+        episode_number = int_or_none(self._search_regex(
+            r'episode-(\d+)', url, 'episode number', default=None))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'series': series,
+            'season_number': season_number,
+            'episode_number': episode_number,
+            'episode': title,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        display_id, video_id = re.match(self._VALID_URL, url).groups()
+        info = self._call_api('player/' + video_id, video_id)
+        return self._extract_video(info, video_id, display_id)
+"""
+
+
+class TVNowListBaseIE(TVNowNewBaseIE):  # shared base of the season, month and whole-show playlist extractors
+    _SHOW_VALID_URL = r'''(?x)
+                    (?P<base_url>
+                        https?://
+                            (?:www\.)?tvnow\.(?:de|at|ch)/(?:shows|serien)/
+                            [^/?#&]+-(?P<show_id>\d+)
+                    )
+                    '''
 
-    _SHOW_FIELDS = ('title', )
-    _SEASON_FIELDS = ('id', 'headline', 'seoheadline', )
-    _VIDEO_FIELDS = ('id', 'headline', 'seoUrl', )
+    @classmethod
+    def suitable(cls, url):  # episode URLs belong to TVNowNewIE, so list extractors must reject them
+        return (False if TVNowNewIE.suitable(url)
+                else super(TVNowListBaseIE, cls).suitable(url))
 
+    def _extract_items(self, url, show_id, list_id, query):  # builds a playlist from one teaser row of a show
+        items = self._call_api(
+            'teaserrow/format/episode/' + show_id, list_id,
+            query=query)['items']  # 'items' is required: a response without it raises KeyError
+
+        entries = []
+        for item in items:
+            if not isinstance(item, dict):  # ignore malformed entries
+                continue
+            item_url = urljoin(url, item.get('url'))  # item URL is resolved against the playlist page URL
+            if not item_url:
+                continue
+            video_id = str_or_none(item.get('id') or item.get('videoId'))
+            item_title = item.get('subheadline') or item.get('text')
+            entries.append(self.url_result(
+                item_url, ie=TVNowNewIE.ie_key(), video_id=video_id,
+                video_title=item_title))
+
+        return self.playlist_result(entries, '%s/%s' % (show_id, list_id))  # playlist id is '<show_id>/<list_id>'
+
+
+class TVNowSeasonIE(TVNowListBaseIE):  # playlist of one season ('staffel-N') of a show
+    _VALID_URL = r'%s/staffel-(?P<id>\d+)' % TVNowListBaseIE._SHOW_VALID_URL
 _TESTS = [{
-        'url': 'https://www.tvnow.de/rtl/30-minuten-deutschland/list/aktuell',
+        'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13',
         'info_dict': {
-            'id': '28296',
-            'title': '30 Minuten Deutschland - Aktuell',
+            'id': '1815/13',
         },
-        'playlist_mincount': 1,
+        'playlist_mincount': 22,
     }]
 
     def _real_extract(self, url):
-        base_url, show_id, season_id = re.match(self._VALID_URL, url).groups()
-
-        fields = []
-        fields.extend(self._SHOW_FIELDS)
-        fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS)
-        fields.extend(
-            'formatTabs.formatTabPages.container.movies.%s' % field
-            for field in self._VIDEO_FIELDS)
-
-        list_info = self._call_api(
-            'formats/seo', season_id, query={
-                'fields': ','.join(fields),
-                'name': show_id + '.php'
+        _, show_id, season_id = re.match(self._VALID_URL, url).groups()  # groups: base_url (unused), show_id, id
+        return self._extract_items(
+            url, show_id, season_id, {'season': season_id})  # the season number doubles as list id and API filter
+
+
+class TVNowAnnualIE(TVNowListBaseIE):  # playlist of one calendar month ('YYYY-MM') of a show
+    _VALID_URL = r'%s/(?P<year>\d{4})-(?P<month>\d{2})' % TVNowListBaseIE._SHOW_VALID_URL
+    _TESTS = [{
+        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05',
+        'info_dict': {
+            'id': '1669/2017-05',
+        },
+        'playlist_mincount': 2,
+    }]
+
+    def _real_extract(self, url):
+        _, show_id, year, month = re.match(self._VALID_URL, url).groups()  # groups: base_url (unused), show_id, year, month
+        return self._extract_items(
+            url, show_id, '%s-%s' % (year, month), {
+                'year': int(year),  # sent as integers, so a month of '05' becomes 5
+                'month': int(month),
             })
 
-        season = next(
-            season for season in list_info['formatTabs']['items']
-            if season.get('seoheadline') == season_id)
 
-        title = '%s - %s' % (list_info['title'], season['headline'])
+class TVNowShowIE(TVNowListBaseIE):  # whole show: a playlist of its season or month playlists
+    _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL
+    _TESTS = [{
+        # annual navigationType
+        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669',
+        'info_dict': {
+            'id': '1669',
+        },
+        'playlist_mincount': 73,
+    }, {
+        # season navigationType
+        'url': 'https://www.tvnow.de/shows/armes-deutschland-11471',
+        'info_dict': {
+            'id': '11471',
+        },
+        'playlist_mincount': 3,
+    }]
+
+    @classmethod
+    def suitable(cls, url):  # the bare show pattern is a prefix of the more specific URLs, so let those extractors win
+        return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url)
+                else super(TVNowShowIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        base_url, show_id = re.match(self._VALID_URL, url).groups()
+
+        result = self._call_api(
+            'teaserrow/format/navigation/' + show_id, show_id)
+
+        items = result['items']
 
         entries = []
-        for container in season['formatTabPages']['items']:
-            for info in ((container.get('container') or {}).get('movies') or {}).get('items') or []:
-                seo_url = info.get('seoUrl')
-                if not seo_url:
+        navigation = result.get('navigationType')
+        if navigation == 'annual':
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                year = int_or_none(item.get('year'))
+                if year is None:
+                    continue
+                months = item.get('months')
+                if not isinstance(months, list):
+                    continue
+                for month_dict in months:
+                    if not isinstance(month_dict, dict) or not month_dict:
+                        continue
+                    month_number = int_or_none(next(iter(month_dict)))  # first key of the month entry, without building a list
+                    if month_number is None:
+                        continue
+                    entries.append(self.url_result(
+                        '%s/%04d-%02d' % (base_url, year, month_number),
+                        ie=TVNowAnnualIE.ie_key()))
+        elif navigation == 'season':
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                season_number = int_or_none(item.get('season'))
+                if season_number is None:
                     continue
                 entries.append(self.url_result(
-                    base_url + seo_url + '/player', 'TVNow', info.get('id')))
+                    '%s/staffel-%d' % (base_url, season_number),
+                    ie=TVNowSeasonIE.ie_key()))
+        else:
+            raise ExtractorError('Unknown navigationType')
 
-        return self.playlist_result(
-            entries, compat_str(season.get('id') or season_id), title)
+        return self.playlist_result(entries, show_id)