[nfb] Modernize and extract subtitles

author Sergey M․ <dstftw@gmail.com>

Tue, 17 May 2016 18:25:15 +0000 (00:25 +0600)

committer Sergey M․ <dstftw@gmail.com>

Tue, 17 May 2016 18:25:15 +0000 (00:25 +0600)
author Sergey M․ <dstftw@gmail.com>
Tue, 17 May 2016 18:25:15 +0000 (00:25 +0600)
committer Sergey M․ <dstftw@gmail.com>
Tue, 17 May 2016 18:25:15 +0000 (00:25 +0600)
diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py

index 234e490476941822f6499bae7dc4c93aaf865614..adcc636bc32c062fec74074044783e452a3725d9 100644 (file)
--- a/youtube_dl/extractor/nfb.py
+++ b/youtube_dl/extractor/nfb.py
@@ -2,8 +2,12 @@ from __future__ import unicode_literals
  
  from .common import InfoExtractor
  from ..utils import (
-    sanitized_Request,
+    clean_html,
+    determine_ext,
+    int_or_none,
+    qualities,
      urlencode_postdata,
+    xpath_text,
  )
  
  
@@ -16,12 +20,12 @@ class NFBIE(InfoExtractor):
          'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
          'info_dict': {
              'id': 'qallunaat_why_white_people_are_funny',
-            'ext': 'mp4',
+            'ext': 'flv',
              'title': 'Qallunaat! Why White People Are Funny ',
-            'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
+            'description': 'md5:6b8e32dde3abf91e58857b174916620c',
              'duration': 3128,
+            'creator': 'Mark Sandiford',
              'uploader': 'Mark Sandiford',
-            'uploader_id': 'mark-sandiford',
          },
          'params': {
              # rtmp download
@@ -31,64 +35,78 @@ class NFBIE(InfoExtractor):
  
      def _real_extract(self, url):
          video_id = self._match_id(url)
-        page = self._download_webpage(
-            'https://www.nfb.ca/film/%s' % video_id, video_id,
-            'Downloading film page')
  
-        uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
-                                              page, 'director id', fatal=False)
-        uploader = self._og_search_property('video:director', page, 'director name')
-
-        request = sanitized_Request(
+        config = self._download_xml(
              'https://www.nfb.ca/film/%s/player_config' % video_id,
-            urlencode_postdata({'getConfig': 'true'}))
-        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
-
-        config = self._download_xml(request, video_id, 'Downloading player config XML')
+            video_id, 'Downloading player config XML',
+            data=urlencode_postdata({'getConfig': 'true'}),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf'
+            })
  
-        title = None
-        description = None
-        thumbnail = None
-        duration = None
-        formats = []
-
-        def extract_thumbnail(media):
-            thumbnails = {}
-            for asset in media.findall('assets/asset'):
-                thumbnails[asset.get('quality')] = asset.find('default/url').text
-            if not thumbnails:
-                return None
-            if 'high' in thumbnails:
-                return thumbnails['high']
-            return list(thumbnails.values())[0]
+        title, description, thumbnail, duration, uploader, author = [None] * 6
+        thumbnails, formats = [[]] * 2
+        subtitles = {}
  
          for media in config.findall('./player/stream/media'):
              if media.get('type') == 'posterImage':
-                thumbnail = extract_thumbnail(media)
+                quality_key = qualities(('low', 'high'))
+                thumbnails = []
+                for asset in media.findall('assets/asset'):
+                    asset_url = xpath_text(asset, 'default/url', default=None)
+                    if not asset_url:
+                        continue
+                    quality = asset.get('quality')
+                    thumbnails.append({
+                        'url': asset_url,
+                        'id': quality,
+                        'preference': quality_key(quality),
+                    })
              elif media.get('type') == 'video':
-                duration = int(media.get('duration'))
-                title = media.find('title').text
-                description = media.find('description').text
-                # It seems assets always go from lower to better quality, so no need to sort
+                title = xpath_text(media, 'title', fatal=True)
                  for asset in media.findall('assets/asset'):
-                    for x in asset:
+                    quality = asset.get('quality')
+                    height = int_or_none(self._search_regex(
+                        r'^(\d+)[pP]$', quality or '', 'height', default=None))
+                    for node in asset:
+                        streamer = xpath_text(node, 'streamerURI', default=None)
+                        if not streamer:
+                            continue
+                        play_path = xpath_text(node, 'url', default=None)
+                        if not play_path:
+                            continue
                          formats.append({
-                            'url': x.find('streamerURI').text,
-                            'app': x.find('streamerURI').text.split('/', 3)[3],
-                            'play_path': x.find('url').text,
+                            'url': streamer,
+                            'app': streamer.split('/', 3)[3],
+                            'play_path': play_path,
                              'rtmp_live': False,
-                            'ext': 'mp4',
-                            'format_id': '%s-%s' % (x.tag, asset.get('quality')),
+                            'ext': 'flv',
+                            'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag,
+                            'height': height,
                          })
+                self._sort_formats(formats)
+                description = clean_html(xpath_text(media, 'description'))
+                uploader = xpath_text(media, 'author')
+                duration = int_or_none(media.get('duration'))
+                for subtitle in media.findall('./subtitles/subtitle'):
+                    subtitle_url = xpath_text(subtitle, 'url', default=None)
+                    if not subtitle_url:
+                        continue
+                    lang = xpath_text(subtitle, 'lang', default='en')
+                    subtitles.setdefault(lang, []).append({
+                        'url': subtitle_url,
+                        'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(),
+                    })
  
          return {
              'id': video_id,
              'title': title,
              'description': description,
-            'thumbnail': thumbnail,
+            'thumbnails': thumbnails,
              'duration': duration,
+            'creator': uploader,
              'uploader': uploader,
-            'uploader_id': uploader_id,
              'formats': formats,
+            'subtitles': subtitles,
          }
author	Sergey M․ <dstftw@gmail.com>
	Tue, 17 May 2016 18:25:15 +0000 (00:25 +0600)
committer	Sergey M․ <dstftw@gmail.com>
	Tue, 17 May 2016 18:25:15 +0000 (00:25 +0600)