[nrktv:episode] Improve video id extraction (closes #25594, closes #26369, closes...

[youtube-dl] / youtube_dl / extractor / nrk.py
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py

index 7b98626f23a91324febc19032a908bb625118d69..84aacbcda77e699fd9cca81663c857801f529b76 100644 (file)
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -4,23 +4,40 @@ from __future__ import unicode_literals
  import re
  
  from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
  from ..utils import (
      ExtractorError,
      int_or_none,
+    js_to_json,
+    NO_DEFAULT,
      parse_age_limit,
      parse_duration,
+    try_get,
  )
  
  
  class NRKBaseIE(InfoExtractor):
      _GEO_COUNTRIES = ['NO']
+
+    _api_host = None
+
      def _real_extract(self, url):
          video_id = self._match_id(url)
  
-        data = self._download_json(
-            'http://%s/mediaelement/%s' % (self._API_HOST, video_id),
-            video_id, 'Downloading mediaelement JSON')
+        api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS
+
+        for api_host in api_hosts:
+            data = self._download_json(
+                'http://%s/mediaelement/%s' % (api_host, video_id),
+                video_id, 'Downloading mediaelement JSON',
+                fatal=api_host == api_hosts[-1])
+            if not data:
+                continue
+            self._api_host = api_host
+            break
  
          title = data.get('fullTitle') or data.get('mainTitle') or data['title']
          video_id = data.get('id') or video_id
@@ -28,8 +45,8 @@ class NRKBaseIE(InfoExtractor):
          entries = []
  
          conviva = data.get('convivaStatistics') or {}
-        live = (data.get('mediaElementType') == 'Live' or
-                data.get('isLive') is True or conviva.get('isLive'))
+        live = (data.get('mediaElementType') == 'Live'
+                or data.get('isLive') is True or conviva.get('isLive'))
  
          def make_title(t):
              return self._live_title(t) if live else t
@@ -88,6 +105,7 @@ class NRKBaseIE(InfoExtractor):
              MESSAGES = {
                  'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet',
                  'ProgramRightsHasExpired': 'Programmet har gått ut',
+                'NoProgramRights': 'Ikke tilgjengelig',
                  'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
              }
              message_type = data.get('messageType', '')
@@ -147,13 +165,34 @@ class NRKBaseIE(InfoExtractor):
  
          vcodec = 'none' if data.get('mediaType') == 'Audio' else None
  
-        # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged
-
          for entry in entries:
              entry.update(common_info)
              for f in entry['formats']:
                  f['vcodec'] = vcodec
  
+        points = data.get('shortIndexPoints')
+        if isinstance(points, list):
+            chapters = []
+            for next_num, point in enumerate(points, start=1):
+                if not isinstance(point, dict):
+                    continue
+                start_time = parse_duration(point.get('startPoint'))
+                if start_time is None:
+                    continue
+                end_time = parse_duration(
+                    data.get('duration')
+                    if next_num == len(points)
+                    else points[next_num].get('startPoint'))
+                if end_time is None:
+                    continue
+                chapters.append({
+                    'start_time': start_time,
+                    'end_time': end_time,
+                    'title': point.get('title'),
+                })
+            if chapters and len(entries) == 1:
+                entries[0]['chapters'] = chapters
+
          return self.playlist_result(entries, video_id, title, description)
  
  
@@ -169,17 +208,17 @@ class NRKIE(NRKBaseIE):
                              )
                              (?P<id>[^?#&]+)
                          '''
-    _API_HOST = 'v8-psapi.nrk.no'
+    _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no')
      _TESTS = [{
          # video
          'url': 'http://www.nrk.no/video/PS*150533',
-        'md5': '2f7f6eeb2aacdd99885f355428715cfa',
+        'md5': '706f34cdf1322577589e369e522b50ef',
          'info_dict': {
              'id': '150533',
              'ext': 'mp4',
              'title': 'Dompap og andre fugler i Piip-Show',
              'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
-            'duration': 263,
+            'duration': 262,
          }
      }, {
          # audio
@@ -210,25 +249,36 @@ class NRKTVIE(NRKBaseIE):
      _VALID_URL = r'''(?x)
                          https?://
                              (?:tv|radio)\.nrk(?:super)?\.no/
-                            (?:serie/[^/]+|program)/
+                            (?:serie(?:/[^/]+){1,2}|program)/
                              (?![Ee]pisodes)%s
                              (?:/\d{2}-\d{2}-\d{4})?
                              (?:\#del=(?P<part_id>\d+))?
                      ''' % _EPISODE_RE
-    _API_HOST = 'psapi-we.nrk.no'
-
+    _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no')
      _TESTS = [{
+        'url': 'https://tv.nrk.no/program/MDDP12000117',
+        'md5': '8270824df46ec629b66aeaa5796b36fb',
+        'info_dict': {
+            'id': 'MDDP12000117AA',
+            'ext': 'mp4',
+            'title': 'Alarm Trolltunga',
+            'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce',
+            'duration': 2223,
+            'age_limit': 6,
+        },
+    }, {
          'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
-        'md5': '4e9ca6629f09e588ed240fb11619922a',
+        'md5': '9a167e54d04671eb6317a37b7bc8a280',
          'info_dict': {
              'id': 'MUHH48000314AA',
              'ext': 'mp4',
              'title': '20 spørsmål 23.05.2014',
              'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
              'duration': 1741,
-            'series': '20 spørsmål - TV',
+            'series': '20 spørsmål',
              'episode': '23.05.2014',
          },
+        'skip': 'NoProgramRights',
      }, {
          'url': 'https://tv.nrk.no/program/mdfp15000514',
          'info_dict': {
@@ -264,7 +314,7 @@ class NRKTVIE(NRKBaseIE):
                  'id': 'MSPO40010515AH',
                  'ext': 'mp4',
                  'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)',
-                'description': 'md5:c03aba1e917561eface5214020551b7a',
+                'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
                  'duration': 772,
                  'series': 'Tour de Ski',
                  'episode': '06.01.2015',
@@ -277,7 +327,7 @@ class NRKTVIE(NRKBaseIE):
                  'id': 'MSPO40010515BH',
                  'ext': 'mp4',
                  'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)',
-                'description': 'md5:c03aba1e917561eface5214020551b7a',
+                'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
                  'duration': 6175,
                  'series': 'Tour de Ski',
                  'episode': '06.01.2015',
@@ -289,7 +339,7 @@ class NRKTVIE(NRKBaseIE):
          'info_dict': {
              'id': 'MSPO40010515',
              'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015',
-            'description': 'md5:c03aba1e917561eface5214020551b7a',
+            'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
          },
          'expected_warnings': ['Video is geo restricted'],
      }, {
@@ -325,9 +375,240 @@ class NRKTVIE(NRKBaseIE):
      }, {
          'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
          'only_matching': True,
+    }, {
+        'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller',
+        'only_matching': True,
      }]
  
  
+class NRKTVEpisodeIE(InfoExtractor):
+    _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)'
+    _TESTS = [{
+        'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2',
+        'info_dict': {
+            'id': 'MUHH36005220BA',
+            'ext': 'mp4',
+            'title': 'Kro, krig og kjærlighet 2:6',
+            'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350',
+            'duration': 1563,
+            'series': 'Hellums kro',
+            'season_number': 1,
+            'episode_number': 2,
+            'episode': '2:6',
+            'age_limit': 6,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8',
+        'info_dict': {
+            'id': 'MSUI14000816AA',
+            'ext': 'mp4',
+            'title': 'Backstage 8:30',
+            'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4',
+            'duration': 1320,
+            'series': 'Backstage',
+            'season_number': 1,
+            'episode_number': 8,
+            'episode': '8:30',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'ProgramRightsHasExpired',
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        info = self._search_json_ld(webpage, display_id, default={})
+        nrk_id = info.get('@id') or self._html_search_meta(
+            'nrk:program-id', webpage, default=None) or self._search_regex(
+            r'data-program-id=["\'](%s)' % NRKTVIE._EPISODE_RE, webpage,
+            'nrk id')
+        assert re.match(NRKTVIE._EPISODE_RE, nrk_id)
+
+        info.update({
+            '_type': 'url_transparent',
+            'id': nrk_id,
+            'url': 'nrk:%s' % nrk_id,
+            'ie_key': NRKIE.ie_key(),
+        })
+        return info
+
+
+class NRKTVSerieBaseIE(InfoExtractor):
+    def _extract_series(self, webpage, display_id, fatal=True):
+        config = self._parse_json(
+            self._search_regex(
+                (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;',
+                 r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>'),
+                webpage, 'config', default='{}' if not fatal else NO_DEFAULT),
+            display_id, fatal=False, transform_source=js_to_json)
+        if not config:
+            return
+        return try_get(
+            config,
+            (lambda x: x['initialState']['series'], lambda x: x['series']),
+            dict)
+
+    def _extract_seasons(self, seasons):
+        if not isinstance(seasons, list):
+            return []
+        entries = []
+        for season in seasons:
+            entries.extend(self._extract_episodes(season))
+        return entries
+
+    def _extract_episodes(self, season):
+        if not isinstance(season, dict):
+            return []
+        return self._extract_entries(season.get('episodes'))
+
+    def _extract_entries(self, entry_list):
+        if not isinstance(entry_list, list):
+            return []
+        entries = []
+        for episode in entry_list:
+            nrk_id = episode.get('prfId')
+            if not nrk_id or not isinstance(nrk_id, compat_str):
+                continue
+            entries.append(self.url_result(
+                'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id))
+        return entries
+
+
+class NRKTVSeasonIE(NRKTVSerieBaseIE):
+    _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://tv.nrk.no/serie/backstage/sesong/1',
+        'info_dict': {
+            'id': '1',
+            'title': 'Sesong 1',
+        },
+        'playlist_mincount': 30,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url)
+                else super(NRKTVSeasonIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        series = self._extract_series(webpage, display_id)
+
+        season = next(
+            s for s in series['seasons']
+            if int(display_id) == s.get('seasonNumber'))
+
+        title = try_get(season, lambda x: x['titles']['title'], compat_str)
+        return self.playlist_result(
+            self._extract_episodes(season), display_id, title)
+
+
+class NRKTVSeriesIE(NRKTVSerieBaseIE):
+    _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
+    _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://tv.nrk.no/serie/blank',
+        'info_dict': {
+            'id': 'blank',
+            'title': 'Blank',
+            'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e',
+        },
+        'playlist_mincount': 30,
+    }, {
+        # new layout, seasons
+        'url': 'https://tv.nrk.no/serie/backstage',
+        'info_dict': {
+            'id': 'backstage',
+            'title': 'Backstage',
+            'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3',
+        },
+        'playlist_mincount': 60,
+    }, {
+        # new layout, instalments
+        'url': 'https://tv.nrk.no/serie/groenn-glede',
+        'info_dict': {
+            'id': 'groenn-glede',
+            'title': 'Grønn glede',
+            'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
+        },
+        'playlist_mincount': 10,
+    }, {
+        # old layout
+        'url': 'https://tv.nrksuper.no/serie/labyrint',
+        'info_dict': {
+            'id': 'labyrint',
+            'title': 'Labyrint',
+            'description': 'md5:318b597330fdac5959247c9b69fdb1ec',
+        },
+        'playlist_mincount': 3,
+    }, {
+        'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene',
+        'only_matching': True,
+    }, {
+        'url': 'https://tv.nrk.no/serie/saving-the-human-race',
+        'only_matching': True,
+    }, {
+        'url': 'https://tv.nrk.no/serie/postmann-pat',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (
+            False if any(ie.suitable(url)
+                         for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE))
+            else super(NRKTVSeriesIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, series_id)
+
+        # New layout (e.g. https://tv.nrk.no/serie/backstage)
+        series = self._extract_series(webpage, series_id, fatal=False)
+        if series:
+            title = try_get(series, lambda x: x['titles']['title'], compat_str)
+            description = try_get(
+                series, lambda x: x['titles']['subtitle'], compat_str)
+            entries = []
+            entries.extend(self._extract_seasons(series.get('seasons')))
+            entries.extend(self._extract_entries(series.get('instalments')))
+            entries.extend(self._extract_episodes(series.get('extraMaterial')))
+            return self.playlist_result(entries, series_id, title, description)
+
+        # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint)
+        entries = [
+            self.url_result(
+                'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
+                    series=series_id, season=season_id))
+            for season_id in re.findall(self._ITEM_RE, webpage)
+        ]
+
+        title = self._html_search_meta(
+            'seriestitle', webpage,
+            'title', default=None) or self._og_search_title(
+            webpage, fatal=False)
+        if title:
+            title = self._search_regex(
+                r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title)
+
+        description = self._html_search_meta(
+            'series_description', webpage,
+            'description', default=None) or self._og_search_description(webpage)
+
+        return self.playlist_result(entries, series_id, title, description)
+
+
  class NRKTVDirekteIE(NRKTVIE):
      IE_DESC = 'NRK TV Direkte and NRK Radio Direkte'
      _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)'
@@ -380,7 +661,7 @@ class NRKPlaylistIE(NRKPlaylistBaseIE):
              'title': 'Rivertonprisen til Karin Fossum',
              'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.',
          },
-        'playlist_count': 5,
+        'playlist_count': 2,
      }]
  
      def _extract_title(self, webpage):
@@ -407,71 +688,13 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE):
              r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
  
  
-class NRKTVSeriesIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
-    _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'https://tv.nrk.no/serie/groenn-glede',
-        'info_dict': {
-            'id': 'groenn-glede',
-            'title': 'Grønn glede',
-            'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
-        },
-        'playlist_mincount': 9,
-    }, {
-        'url': 'http://tv.nrksuper.no/serie/labyrint',
-        'info_dict': {
-            'id': 'labyrint',
-            'title': 'Labyrint',
-            'description': 'md5:58afd450974c89e27d5a19212eee7115',
-        },
-        'playlist_mincount': 3,
-    }, {
-        'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene',
-        'only_matching': True,
-    }, {
-        'url': 'https://tv.nrk.no/serie/saving-the-human-race',
-        'only_matching': True,
-    }, {
-        'url': 'https://tv.nrk.no/serie/postmann-pat',
-        'only_matching': True,
-    }]
-
-    @classmethod
-    def suitable(cls, url):
-        return False if NRKTVIE.suitable(url) else super(NRKTVSeriesIE, cls).suitable(url)
-
-    def _real_extract(self, url):
-        series_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, series_id)
-
-        entries = [
-            self.url_result(
-                'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
-                    series=series_id, season=season_id))
-            for season_id in re.findall(self._ITEM_RE, webpage)
-        ]
-
-        title = self._html_search_meta(
-            'seriestitle', webpage,
-            'title', default=None) or self._og_search_title(
-            webpage, fatal=False)
-
-        description = self._html_search_meta(
-            'series_description', webpage,
-            'description', default=None) or self._og_search_description(webpage)
-
-        return self.playlist_result(entries, series_id, title, description)
-
-
  class NRKSkoleIE(InfoExtractor):
      IE_DESC = 'NRK Skole'
      _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)'
  
      _TESTS = [{
          'url': 'https://www.nrk.no/skole/?page=search&q=&mediaId=14099',
-        'md5': '6bc936b01f9dd8ed45bc58b252b2d9b6',
+        'md5': '18c12c3d071953c3bf8d54ef6b2587b7',
          'info_dict': {
              'id': '6021',
              'ext': 'mp4',