[nova:embed] Fix extraction (closes #23672)

[youtube-dl] / youtube_dl / extractor / voicerepublic.py
diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py

index a3e40b940139f3a22cc0e9dc4a20af7578c7fc25..a52e40afa2892a10538251ba40e4d2a44a10a67d 100644 (file)
--- a/youtube_dl/extractor/voicerepublic.py
+++ b/youtube_dl/extractor/voicerepublic.py
@@ -1,52 +1,62 @@
-# coding: utf-8
  from __future__ import unicode_literals
  
-import re
-
  from .common import InfoExtractor
-from ..compat import compat_urllib_request
-from ..utils import ExtractorError
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    int_or_none,
+    urljoin,
+)
  
  
  class VoiceRepublicIE(InfoExtractor):
-    _VALID_URL = r'https?://voicerepublic\.com/talks/(?P<id>[0-9a-z-]+)'
-    _TEST = {
-        'url': 'https://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
-        'md5': '0554a24d1657915aa8e8f84e15dc9353',
+    _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
+    _TESTS = [{
+        'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
+        'md5': 'b9174d651323f17783000876347116e3',
          'info_dict': {
              'id': '2296',
+            'display_id': 'watching-the-watchers-building-a-sousveillance-state',
              'ext': 'm4a',
              'title': 'Watching the Watchers: Building a Sousveillance State',
-            'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png',
-            'description': 'md5:715ba964958afa2398df615809cfecb1',
+            'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.',
+            'duration': 1556,
+            'view_count': int,
          }
-    }
+    }, {
+        'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state',
+        'only_matching': True,
+    }]
  
      def _real_extract(self, url):
          display_id = self._match_id(url)
-        req = compat_urllib_request.Request(url)
-        # Older versions of Firefox get redirected to an "upgrade browser" page
-        req.add_header('User-Agent', 'youtube-dl')
-        webpage = self._download_webpage(req, display_id)
-        thumbnail = self._og_search_thumbnail(webpage)
-        video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id')
  
-        if '<a>Queued for processing, please stand by...</a>' in webpage:
-            raise ExtractorError('Audio is still queued for processing')
+        webpage = self._download_webpage(url, display_id)
+
+        if '>Queued for processing, please stand by...<' in webpage:
+            raise ExtractorError(
+                'Audio is still queued for processing', expected=True)
  
+        talk = self._parse_json(self._search_regex(
+            r'initialSnapshot\s*=\s*({.+?});',
+            webpage, 'talk'), display_id)['talk']
+        title = talk['title']
          formats = [{
-            'url': 'https://voicerepublic.com' + path,
-            'ext': ext,
-            'format_id': ext,
+            'url': urljoin(url, talk_url),
+            'format_id': format_id,
+            'ext': determine_ext(talk_url) or format_id,
              'vcodec': 'none',
-        } for ext, path in re.findall(r"data-([^=]+)='(/[^']+\.\1)'", webpage)]
+        } for format_id, talk_url in talk['media_links'].items()]
          self._sort_formats(formats)
  
          return {
-            'id': video_id,
-            'title': self._og_search_title(webpage),
+            'id': compat_str(talk.get('id') or display_id),
+            'display_id': display_id,
+            'title': title,
+            'description': talk.get('teaser'),
+            'thumbnail': talk.get('image_url'),
+            'duration': int_or_none(talk.get('archived_duration')),
+            'view_count': int_or_none(talk.get('play_count')),
              'formats': formats,
-            'url': self._og_search_url(webpage),
-            'thumbnail': thumbnail,
-            'description': self._og_search_description(webpage),
          }