[voicerepublic] Fix extraction.
[youtube-dl] / youtube_dl / extractor / voicerepublic.py
index 7d255d6fad9e892652eb28507d45363c8f372faf..a52e40afa2892a10538251ba40e4d2a44a10a67d 100644 (file)
@@ -1,55 +1,62 @@
-# coding: utf-8
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-
-from ..compat import (
-    compat_urllib_request,
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    int_or_none,
+    urljoin,
 )
 
 
 class VoiceRepublicIE(InfoExtractor):
-    _VALID_URL = r'https?://voicerepublic\.com/talks/(?P<id>[0-9a-z-]+)'
-    _TEST = {
-        'url': 'https://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
-        'md5': '0554a24d1657915aa8e8f84e15dc9353',
+    _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
+    _TESTS = [{
+        'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
+        'md5': 'b9174d651323f17783000876347116e3',
         'info_dict': {
             'id': '2296',
+            'display_id': 'watching-the-watchers-building-a-sousveillance-state',
             'ext': 'm4a',
             'title': 'Watching the Watchers: Building a Sousveillance State',
-            'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png',
-            'description': 'md5:715ba964958afa2398df615809cfecb1',
-            'creator': 'M. C. McGrath',
+            'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.',
+            'duration': 1556,
+            'view_count': int,
         }
-    }
+    }, {
+        'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
-        req = compat_urllib_request.Request(url)
-        # Older versions of Firefox get redirected to an "upgrade browser" page
-        req.add_header('User-Agent', 'youtube-dl')
-        webpage = self._download_webpage(req, display_id)
-        thumbnail = self._og_search_thumbnail(webpage)
-        video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id')
-
-        if '<div class=\'vr-player jp-jplayer\'' in webpage:
-            formats = [{
-                'url': 'https://voicerepublic.com/vrmedia/{}-clean.{}'.format(video_id, ext),
-                'ext': ext,
-                'format_id': ext,
-                'vcodec': 'none',
-            } for ext in ['m4a', 'mp3', 'ogg']]
-            self._sort_formats(formats)
-        else:
-            # Audio is still queued for processing
-            formats = []
+
+        webpage = self._download_webpage(url, display_id)
+
+        if '>Queued for processing, please stand by...<' in webpage:
+            raise ExtractorError(
+                'Audio is still queued for processing', expected=True)
+
+        talk = self._parse_json(self._search_regex(
+            r'initialSnapshot\s*=\s*({.+?});',
+            webpage, 'talk'), display_id)['talk']
+        title = talk['title']
+        formats = [{
+            'url': urljoin(url, talk_url),
+            'format_id': format_id,
+            'ext': determine_ext(talk_url) or format_id,
+            'vcodec': 'none',
+        } for format_id, talk_url in talk['media_links'].items()]
+        self._sort_formats(formats)
 
         return {
-            'id': video_id,
-            'title': self._og_search_title(webpage),
+            'id': compat_str(talk.get('id') or display_id),
+            'display_id': display_id,
+            'title': title,
+            'description': talk.get('teaser'),
+            'thumbnail': talk.get('image_url'),
+            'duration': int_or_none(talk.get('archived_duration')),
+            'view_count': int_or_none(talk.get('play_count')),
             'formats': formats,
-            'url': self._og_search_url(webpage),
-            'thumbnail': thumbnail,
-            'description': self._og_search_description(webpage),
-            'creator': self._html_search_meta('author', webpage),
         }