_ Git - youtube-dl/blob - youtube_dl/extractor/nrk.py

   1 # encoding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import (
   8     compat_urlparse,
   9     compat_urllib_parse_unquote,
  10 )
  11 from ..utils import (
  12     determine_ext,
  13     ExtractorError,
  14     float_or_none,
  15     parse_duration,
  16     unified_strdate,
  17 )
  18
  19
  20 class NRKIE(InfoExtractor):
  21     _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
  22
  23     _TESTS = [
  24         {
  25             'url': 'http://www.nrk.no/video/PS*150533',
  26             'md5': 'bccd850baebefe23b56d708a113229c2',
  27             'info_dict': {
  28                 'id': '150533',
  29                 'ext': 'flv',
  30                 'title': 'Dompap og andre fugler i Piip-Show',
  31                 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
  32                 'duration': 263,
  33             }
  34         },
  35         {
  36             'url': 'http://www.nrk.no/video/PS*154915',
  37             'md5': '0b1493ba1aae7d9579a5ad5531bc395a',
  38             'info_dict': {
  39                 'id': '154915',
  40                 'ext': 'flv',
  41                 'title': 'Slik høres internett ut når du er blind',
  42                 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
  43                 'duration': 20,
  44             }
  45         },
  46     ]
  47
  48     def _real_extract(self, url):
  49         video_id = self._match_id(url)
  50
  51         data = self._download_json(
  52             'http://v8.psapi.nrk.no/mediaelement/%s' % video_id,
  53             video_id, 'Downloading media JSON')
  54
  55         media_url = data.get('mediaUrl')
  56
  57         if not media_url:
  58             if data['usageRights']['isGeoBlocked']:
  59                 raise ExtractorError(
  60                     'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
  61                     expected=True)
  62
  63         if determine_ext(media_url) == 'f4m':
  64             formats = self._extract_f4m_formats(
  65                 media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds')
  66         else:
  67             formats = [{
  68                 'url': media_url,
  69                 'ext': 'flv',
  70             }]
  71
  72         duration = parse_duration(data.get('duration'))
  73
  74         images = data.get('images')
  75         if images:
  76             thumbnails = images['webImages']
  77             thumbnails.sort(key=lambda image: image['pixelWidth'])
  78             thumbnail = thumbnails[-1]['imageUrl']
  79         else:
  80             thumbnail = None
  81
  82         return {
  83             'id': video_id,
  84             'title': data['title'],
  85             'description': data['description'],
  86             'duration': duration,
  87             'thumbnail': thumbnail,
  88             'formats': formats,
  89         }
  90
  91
  92 class NRKPlaylistIE(InfoExtractor):
  93     _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)'
  94
  95     _TESTS = [{
  96         'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
  97         'info_dict': {
  98             'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763',
  99             'title': 'Gjenopplev den historiske solformørkelsen',
 100             'description': 'md5:c2df8ea3bac5654a26fc2834a542feed',
 101         },
 102         'playlist_count': 2,
 103     }, {
 104         'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449',
 105         'info_dict': {
 106             'id': 'rivertonprisen-til-karin-fossum-1.12266449',
 107             'title': 'Rivertonprisen til Karin Fossum',
 108             'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.',
 109         },
 110         'playlist_count': 5,
 111     }]
 112
 113     def _real_extract(self, url):
 114         playlist_id = self._match_id(url)
 115
 116         webpage = self._download_webpage(url, playlist_id)
 117
 118         entries = [
 119             self.url_result('nrk:%s' % video_id, 'NRK')
 120             for video_id in re.findall(
 121                 r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"',
 122                 webpage)
 123         ]
 124
 125         playlist_title = self._og_search_title(webpage)
 126         playlist_description = self._og_search_description(webpage)
 127
 128         return self.playlist_result(
 129             entries, playlist_id, playlist_title, playlist_description)
 130
 131
 132 class NRKSkoleIE(InfoExtractor):
 133     IE_DESC = 'NRK Skole'
 134     _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/klippdetalj?.*\btopic=(?P<id>[^/?#&]+)'
 135
 136     _TESTS = [{
 137         'url': 'http://nrk.no/skole/klippdetalj?topic=nrk:klipp/616532',
 138         'md5': '04cd85877cc1913bce73c5d28a47e00f',
 139         'info_dict': {
 140             'id': '6021',
 141             'ext': 'flv',
 142             'title': 'Genetikk og eneggede tvillinger',
 143             'description': 'md5:3aca25dcf38ec30f0363428d2b265f8d',
 144             'duration': 399,
 145         },
 146     }, {
 147         'url': 'http://www.nrk.no/skole/klippdetalj?topic=nrk%3Aklipp%2F616532#embed',
 148         'only_matching': True,
 149     }, {
 150         'url': 'http://www.nrk.no/skole/klippdetalj?topic=urn:x-mediadb:21379',
 151         'only_matching': True,
 152     }]
 153
 154     def _real_extract(self, url):
 155         video_id = compat_urllib_parse_unquote(self._match_id(url))
 156
 157         webpage = self._download_webpage(url, video_id)
 158
 159         nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id')
 160         return self.url_result('nrk:%s' % nrk_id)
 161
 162
 163 class NRKTVIE(InfoExtractor):
 164     IE_DESC = 'NRK TV and NRK Radio'
 165     _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
 166
 167     _TESTS = [
 168         {
 169             'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
 170             'info_dict': {
 171                 'id': 'MUHH48000314',
 172                 'ext': 'mp4',
 173                 'title': '20 spørsmål',
 174                 'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
 175                 'upload_date': '20140523',
 176                 'duration': 1741.52,
 177             },
 178             'params': {
 179                 # m3u8 download
 180                 'skip_download': True,
 181             },
 182         },
 183         {
 184             'url': 'https://tv.nrk.no/program/mdfp15000514',
 185             'info_dict': {
 186                 'id': 'mdfp15000514',
 187                 'ext': 'mp4',
 188                 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting',
 189                 'description': 'md5:654c12511f035aed1e42bdf5db3b206a',
 190                 'upload_date': '20140524',
 191                 'duration': 4605.08,
 192             },
 193             'params': {
 194                 # m3u8 download
 195                 'skip_download': True,
 196             },
 197         },
 198         {
 199             # single playlist video
 200             'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
 201             'md5': 'adbd1dbd813edaf532b0a253780719c2',
 202             'info_dict': {
 203                 'id': 'MSPO40010515-part2',
 204                 'ext': 'flv',
 205                 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
 206                 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
 207                 'upload_date': '20150106',
 208             },
 209             'skip': 'Only works from Norway',
 210         },
 211         {
 212             'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
 213             'playlist': [
 214                 {
 215                     'md5': '9480285eff92d64f06e02a5367970a7a',
 216                     'info_dict': {
 217                         'id': 'MSPO40010515-part1',
 218                         'ext': 'flv',
 219                         'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)',
 220                         'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
 221                         'upload_date': '20150106',
 222                     },
 223                 },
 224                 {
 225                     'md5': 'adbd1dbd813edaf532b0a253780719c2',
 226                     'info_dict': {
 227                         'id': 'MSPO40010515-part2',
 228                         'ext': 'flv',
 229                         'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
 230                         'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
 231                         'upload_date': '20150106',
 232                     },
 233                 },
 234             ],
 235             'info_dict': {
 236                 'id': 'MSPO40010515',
 237                 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn',
 238                 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
 239                 'upload_date': '20150106',
 240                 'duration': 6947.5199999999995,
 241             },
 242             'skip': 'Only works from Norway',
 243         },
 244         {
 245             'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
 246             'only_matching': True,
 247         }
 248     ]
 249
 250     def _extract_f4m(self, manifest_url, video_id):
 251         return self._extract_f4m_formats(
 252             manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds')
 253
 254     def _real_extract(self, url):
 255         mobj = re.match(self._VALID_URL, url)
 256         video_id = mobj.group('id')
 257         part_id = mobj.group('part_id')
 258         base_url = mobj.group('baseurl')
 259
 260         webpage = self._download_webpage(url, video_id)
 261
 262         title = self._html_search_meta(
 263             'title', webpage, 'title')
 264         description = self._html_search_meta(
 265             'description', webpage, 'description')
 266
 267         thumbnail = self._html_search_regex(
 268             r'data-posterimage="([^"]+)"',
 269             webpage, 'thumbnail', fatal=False)
 270         upload_date = unified_strdate(self._html_search_meta(
 271             'rightsfrom', webpage, 'upload date', fatal=False))
 272         duration = float_or_none(self._html_search_regex(
 273             r'data-duration="([^"]+)"',
 274             webpage, 'duration', fatal=False))
 275
 276         # playlist
 277         parts = re.findall(
 278             r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage)
 279         if parts:
 280             entries = []
 281             for current_part_id, stream_url, part_title in parts:
 282                 if part_id and current_part_id != part_id:
 283                     continue
 284                 video_part_id = '%s-part%s' % (video_id, current_part_id)
 285                 formats = self._extract_f4m(stream_url, video_part_id)
 286                 entries.append({
 287                     'id': video_part_id,
 288                     'title': part_title,
 289                     'description': description,
 290                     'thumbnail': thumbnail,
 291                     'upload_date': upload_date,
 292                     'formats': formats,
 293                 })
 294             if part_id:
 295                 if entries:
 296                     return entries[0]
 297             else:
 298                 playlist = self.playlist_result(entries, video_id, title, description)
 299                 playlist.update({
 300                     'thumbnail': thumbnail,
 301                     'upload_date': upload_date,
 302                     'duration': duration,
 303                 })
 304                 return playlist
 305
 306         formats = []
 307
 308         f4m_url = re.search(r'data-media="([^"]+)"', webpage)
 309         if f4m_url:
 310             formats.extend(self._extract_f4m(f4m_url.group(1), video_id))
 311
 312         m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
 313         if m3u8_url:
 314             formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls'))
 315         self._sort_formats(formats)
 316
 317         subtitles_url = self._html_search_regex(
 318             r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1',
 319             webpage, 'subtitle URL', default=None, group='url')
 320         subtitles = {}
 321         if subtitles_url:
 322             subtitles['no'] = [{
 323                 'ext': 'ttml',
 324                 'url': compat_urlparse.urljoin(base_url, subtitles_url),
 325             }]
 326
 327         return {
 328             'id': video_id,
 329             'title': title,
 330             'description': description,
 331             'thumbnail': thumbnail,
 332             'upload_date': upload_date,
 333             'duration': duration,
 334             'formats': formats,
 335             'subtitles': subtitles,
 336         }