[extractor/generic] Unescape video_id and title extracted from URL
[youtube-dl] / youtube_dl / extractor / nrk.py
index d17b7ed49771530a713e9b69590c6b56504e7ec9..cc70c295014f95fcb7e74f2f009889b5ca135663 100644 (file)
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     float_or_none,
@@ -25,6 +24,7 @@ class NRKIE(InfoExtractor):
                 'ext': 'flv',
                 'title': 'Dompap og andre fugler i Piip-Show',
                 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
+                'duration': 263,
             }
         },
         {
@@ -35,6 +35,7 @@ class NRKIE(InfoExtractor):
                 'ext': 'flv',
                 'title': 'Slik høres internett ut når du er blind',
                 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
+                'duration': 20,
             }
         },
     ]
@@ -53,6 +54,8 @@ class NRKIE(InfoExtractor):
 
         video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81'
 
+        duration = parse_duration(data.get('duration'))
+
         images = data.get('images')
         if images:
             thumbnails = images['webImages']
@@ -67,10 +70,51 @@ class NRKIE(InfoExtractor):
             'ext': 'flv',
             'title': data['title'],
             'description': data['description'],
+            'duration': duration,
             'thumbnail': thumbnail,
         }
 
 
+class NRKPlaylistIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
+        'info_dict': {
+            'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763',
+            'title': 'Gjenopplev den historiske solformørkelsen',
+            'description': 'md5:c2df8ea3bac5654a26fc2834a542feed',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449',
+        'info_dict': {
+            'id': 'rivertonprisen-til-karin-fossum-1.12266449',
+            'title': 'Rivertonprisen til Karin Fossum',
+            'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.',
+        },
+        'playlist_count': 5,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result('nrk:%s' % video_id, 'NRK')
+            for video_id in re.findall(
+                r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"',
+                webpage)
+        ]
+
+        playlist_title = self._og_search_title(webpage)
+        playlist_description = self._og_search_description(webpage)
+
+        return self.playlist_result(
+            entries, playlist_id, playlist_title, playlist_description)
+
+
 class NRKTVIE(InfoExtractor):
     _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
 
@@ -155,20 +199,10 @@ class NRKTVIE(InfoExtractor):
         url = "%s%s" % (baseurl, subtitlesurl)
         self._debug_print('%s: Subtitle url: %s' % (video_id, url))
         captions = self._download_xml(
-            url, video_id, 'Downloading subtitles',
-            transform_source=lambda s: s.replace(r'<br />', '\r\n'))
+            url, video_id, 'Downloading subtitles')
         lang = captions.get('lang', 'no')
-        ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/ns/ttml}'))
-        srt = ''
-        for pos, p in enumerate(ps):
-            begin = parse_duration(p.get('begin'))
-            duration = parse_duration(p.get('dur'))
-            starttime = self._subtitles_timecode(begin)
-            endtime = self._subtitles_timecode(begin + duration)
-            srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (compat_str(pos), starttime, endtime, p.text)
         return {lang: [
             {'ext': 'ttml', 'url': url},
-            {'ext': 'srt', 'data': srt},
         ]}
 
     def _extract_f4m(self, manifest_url, video_id):