[nfb] Modernize and extract subtitles
authorSergey M․ <dstftw@gmail.com>
Tue, 17 May 2016 18:25:15 +0000 (00:25 +0600)
committerSergey M․ <dstftw@gmail.com>
Tue, 17 May 2016 18:25:15 +0000 (00:25 +0600)
youtube_dl/extractor/nfb.py

index 234e490476941822f6499bae7dc4c93aaf865614..adcc636bc32c062fec74074044783e452a3725d9 100644 (file)
@@ -2,8 +2,12 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
-    sanitized_Request,
+    clean_html,
+    determine_ext,
+    int_or_none,
+    qualities,
     urlencode_postdata,
+    xpath_text,
 )
 
 
@@ -16,12 +20,12 @@ class NFBIE(InfoExtractor):
         'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
         'info_dict': {
             'id': 'qallunaat_why_white_people_are_funny',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Qallunaat! Why White People Are Funny ',
-            'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
+            'description': 'md5:6b8e32dde3abf91e58857b174916620c',
             'duration': 3128,
+            'creator': 'Mark Sandiford',
             'uploader': 'Mark Sandiford',
-            'uploader_id': 'mark-sandiford',
         },
         'params': {
             # rtmp download
@@ -31,64 +35,78 @@ class NFBIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        page = self._download_webpage(
-            'https://www.nfb.ca/film/%s' % video_id, video_id,
-            'Downloading film page')
 
-        uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
-                                              page, 'director id', fatal=False)
-        uploader = self._og_search_property('video:director', page, 'director name')
-
-        request = sanitized_Request(
+        config = self._download_xml(
             'https://www.nfb.ca/film/%s/player_config' % video_id,
-            urlencode_postdata({'getConfig': 'true'}))
-        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
-
-        config = self._download_xml(request, video_id, 'Downloading player config XML')
+            video_id, 'Downloading player config XML',
+            data=urlencode_postdata({'getConfig': 'true'}),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf'
+            })
 
-        title = None
-        description = None
-        thumbnail = None
-        duration = None
-        formats = []
-
-        def extract_thumbnail(media):
-            thumbnails = {}
-            for asset in media.findall('assets/asset'):
-                thumbnails[asset.get('quality')] = asset.find('default/url').text
-            if not thumbnails:
-                return None
-            if 'high' in thumbnails:
-                return thumbnails['high']
-            return list(thumbnails.values())[0]
+        title, description, thumbnail, duration, uploader, author = [None] * 6
+        thumbnails, formats = [[]] * 2
+        subtitles = {}
 
         for media in config.findall('./player/stream/media'):
             if media.get('type') == 'posterImage':
-                thumbnail = extract_thumbnail(media)
+                quality_key = qualities(('low', 'high'))
+                thumbnails = []
+                for asset in media.findall('assets/asset'):
+                    asset_url = xpath_text(asset, 'default/url', default=None)
+                    if not asset_url:
+                        continue
+                    quality = asset.get('quality')
+                    thumbnails.append({
+                        'url': asset_url,
+                        'id': quality,
+                        'preference': quality_key(quality),
+                    })
             elif media.get('type') == 'video':
-                duration = int(media.get('duration'))
-                title = media.find('title').text
-                description = media.find('description').text
-                # It seems assets always go from lower to better quality, so no need to sort
+                title = xpath_text(media, 'title', fatal=True)
                 for asset in media.findall('assets/asset'):
-                    for x in asset:
+                    quality = asset.get('quality')
+                    height = int_or_none(self._search_regex(
+                        r'^(\d+)[pP]$', quality or '', 'height', default=None))
+                    for node in asset:
+                        streamer = xpath_text(node, 'streamerURI', default=None)
+                        if not streamer:
+                            continue
+                        play_path = xpath_text(node, 'url', default=None)
+                        if not play_path:
+                            continue
                         formats.append({
-                            'url': x.find('streamerURI').text,
-                            'app': x.find('streamerURI').text.split('/', 3)[3],
-                            'play_path': x.find('url').text,
+                            'url': streamer,
+                            'app': streamer.split('/', 3)[3],
+                            'play_path': play_path,
                             'rtmp_live': False,
-                            'ext': 'mp4',
-                            'format_id': '%s-%s' % (x.tag, asset.get('quality')),
+                            'ext': 'flv',
+                            'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag,
+                            'height': height,
                         })
+                self._sort_formats(formats)
+                description = clean_html(xpath_text(media, 'description'))
+                uploader = xpath_text(media, 'author')
+                duration = int_or_none(media.get('duration'))
+                for subtitle in media.findall('./subtitles/subtitle'):
+                    subtitle_url = xpath_text(subtitle, 'url', default=None)
+                    if not subtitle_url:
+                        continue
+                    lang = xpath_text(subtitle, 'lang', default='en')
+                    subtitles.setdefault(lang, []).append({
+                        'url': subtitle_url,
+                        'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(),
+                    })
 
         return {
             'id': video_id,
             'title': title,
             'description': description,
-            'thumbnail': thumbnail,
+            'thumbnails': thumbnails,
             'duration': duration,
+            'creator': uploader,
             'uploader': uploader,
-            'uploader_id': uploader_id,
             'formats': formats,
+            'subtitles': subtitles,
         }