[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / webofstories.py
index 396cf4e8312ca73f90f45b3e24f3fb3561f54fa8..f2b8d19b439d4279e89b872e41b2eeefd421333e 100644 (file)
@@ -1,8 +1,13 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    orderedSet,
+)
 
 
 class WebOfStoriesIE(InfoExtractor):
@@ -10,54 +15,66 @@ class WebOfStoriesIE(InfoExtractor):
     _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
     _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
     _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
-    _TESTS = [
-        {
-            'url': 'http://www.webofstories.com/play/hans.bethe/71',
-            'md5': '373e4dd915f60cfe3116322642ddf364',
-            'info_dict': {
-                'id': '4536',
-                'ext': 'mp4',
-                'title': 'The temperature of the sun',
-                'thumbnail': 're:^https?://.*\.jpg$',
-                'description': 'Hans Bethe talks about calculating the temperature of the sun',
-                'duration': 238,
-            }
+    _TESTS = [{
+        'url': 'http://www.webofstories.com/play/hans.bethe/71',
+        'md5': '373e4dd915f60cfe3116322642ddf364',
+        'info_dict': {
+            'id': '4536',
+            'ext': 'mp4',
+            'title': 'The temperature of the sun',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'Hans Bethe talks about calculating the temperature of the sun',
+            'duration': 238,
+        }
+    }, {
+        'url': 'http://www.webofstories.com/play/55908',
+        'md5': '2985a698e1fe3211022422c4b5ed962c',
+        'info_dict': {
+            'id': '55908',
+            'ext': 'mp4',
+            'title': 'The story of Gemmata obscuriglobus',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
+            'duration': 169,
         },
-        {
-            'url': 'http://www.webofstories.com/play/55908',
-            'md5': '2985a698e1fe3211022422c4b5ed962c',
-            'info_dict': {
-                'id': '55908',
-                'ext': 'mp4',
-                'title': 'The story of Gemmata obscuriglobus',
-                'thumbnail': 're:^https?://.*\.jpg$',
-                'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
-                'duration': 169,
-            }
+        'skip': 'notfound',
+    }, {
+        # malformed og:title meta
+        'url': 'http://www.webofstories.com/play/54215?o=MS',
+        'info_dict': {
+            'id': '54215',
+            'ext': 'mp4',
+            'title': '"A Leg to Stand On"',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'Oliver Sacks talks about the death and resurrection of a limb',
+            'duration': 97,
         },
-    ]
+        'params': {
+            'skip_download': True,
+        },
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
-        title = self._og_search_title(webpage)
+        # Sometimes og:title meta is malformed
+        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
+            r'(?s)<strong>Title:\s*</strong>(.+?)<', webpage, 'title')
         description = self._html_search_meta('description', webpage)
         thumbnail = self._og_search_thumbnail(webpage)
 
-        story_filename = self._search_regex(
-            r'\.storyFileName\("([^"]+)"\)', webpage, 'story filename')
-        speaker_id = self._search_regex(
-            r'\.speakerId\("([^"]+)"\)', webpage, 'speaker ID')
-        story_id = self._search_regex(
-            r'\.storyId\((\d+)\)', webpage, 'story ID')
-        speaker_type = self._search_regex(
-            r'\.speakerType\("([^"]+)"\)', webpage, 'speaker type')
-        great_life = self._search_regex(
-            r'isGreatLifeStory\s*=\s*(true|false)', webpage, 'great life story')
+        embed_params = [s.strip(" \r\n\t'") for s in self._search_regex(
+            r'(?s)\$\("#embedCode"\).html\(getEmbedCode\((.*?)\)',
+            webpage, 'embed params').split(',')]
+
+        (
+            _, speaker_id, story_id, story_duration,
+            speaker_type, great_life, _thumbnail, _has_subtitles,
+            story_filename, _story_order) = embed_params
+
         is_great_life_series = great_life == 'true'
-        duration = int_or_none(self._search_regex(
-            r'\.duration\((\d+)\)', webpage, 'duration', fatal=False))
+        duration = int_or_none(story_duration)
 
         # URL building, see: http://www.webofstories.com/scripts/player.js
         ms_prefix = ''
@@ -100,3 +117,44 @@ class WebOfStoriesIE(InfoExtractor):
             'description': description,
             'duration': duration,
         }
+
+
+class WebOfStoriesPlaylistIE(InfoExtractor):
+    """Extractor for webofstories.com ``playAll`` pages.
+
+    Produces a playlist whose entries are URL results pointing at the
+    ``/play/<id>`` page of each story id found in the downloaded page.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.webofstories.com/playAll/donald.knuth',
+        'info_dict': {
+            'id': 'donald.knuth',
+            'title': 'Donald Knuth (Scientist)',
+        },
+        'playlist_mincount': 97,
+    }
+
+    def _real_extract(self, url):
+        """Build the playlist result for the ``playAll`` page at *url*."""
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        # Story ids are taken from id="td_<digits>" attributes in the page.
+        # NOTE(review): orderedSet presumably drops repeated ids while
+        # keeping first-seen order -- confirm against ..utils.
+        entries = [
+            self.url_result(
+                'http://www.webofstories.com/play/%s' % video_id,
+                'WebOfStories', video_id=video_id)
+            for video_id in orderedSet(re.findall(r'\bid=["\']td_(\d+)', webpage))
+        ]
+
+        # Preferred title: "<speaker name> (<primary field>)"; the field
+        # suffix is only appended when both pieces are present.
+        title = self._search_regex(
+            r'<div id="speakerName">\s*<span>([^<]+)</span>',
+            webpage, 'speaker', default=None)
+        if title:
+            field = self._search_regex(
+                r'<span id="primaryField">([^<]+)</span>',
+                webpage, 'field', default=None)
+            if field:
+                title += ' (%s)' % field
+
+        if not title:
+            # Fall back to the <title> tag. The group is lazy so that the
+            # whitespace in front of the " - Web of Stories" suffix is not
+            # captured as part of the playlist title.
+            title = self._search_regex(
+                r'<title>Play\s+all\s+stories\s*-\s*([^<]+?)\s*-\s*Web\s+of\s+Stories</title>',
+                webpage, 'title')
+
+        return self.playlist_result(entries, playlist_id, title)