[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / wsj.py
index ec38a2ad8e5d900e2b4143b9ede160072bdb35c2..67236f377d2714b517b1f27e687258547c75ac62 100644 (file)
@@ -11,12 +11,13 @@ from ..utils import (
 
 class WSJIE(InfoExtractor):
     _VALID_URL = r'''(?x)
-        (?:
-            https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
-            https?://(?:www\.)?wsj\.com/video/[^/]+/|
-            wsj:
-        )
-        (?P<id>[a-zA-Z0-9-]+)'''
+                        (?:
+                            https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
+                            https?://(?:www\.)?(?:wsj|barrons)\.com/video/(?:[^/]+/)+|
+                            wsj:
+                        )
+                        (?P<id>[a-fA-F0-9-]{36})
+                    '''
     IE_DESC = 'Wall Street Journal'
     _TESTS = [{
         'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
@@ -34,17 +35,28 @@ class WSJIE(InfoExtractor):
     }, {
         'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html',
         'only_matching': True,
+    }, {
+        'url': 'http://www.barrons.com/video/capitalism-deserves-more-respect-from-millennials/F301217E-6F46-43AE-B8D2-B7180D642EE9.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.wsj.com/video/series/a-brief-history-of/the-modern-cell-carrier-how-we-got-here/980E2187-401D-48A1-B82B-1486CEE06CB9',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        api_url = (
-            'http://video-api.wsj.com/api-video/find_all_videos.asp?'
-            'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
-            'thumbnailList,author,description,name,duration,videoURL,'
-            'titletag,formattedCreationDate,keywords,editor' % video_id)
-        info = self._download_json(api_url, video_id)['items'][0]
+        info = self._download_json(
+            'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id,
+            query={
+                'type': 'guid',
+                'count': 1,
+                'query': video_id,
+                'fields': ','.join((
+                    'type', 'hls', 'videoMP4List', 'thumbnailList', 'author',
+                    'description', 'name', 'duration', 'videoURL', 'titletag',
+                    'formattedCreationDate', 'keywords', 'editor')),
+            })['items'][0]
         title = info.get('name', info.get('titletag'))
 
         formats = []
@@ -91,8 +103,8 @@ class WSJIE(InfoExtractor):
 
 
 class WSJArticleIE(InfoExtractor):
-    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>\w[^/]+)'
-    _TESTS = [{
+    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
+    _TEST = {
         'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
         'info_dict': {
             'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
@@ -101,11 +113,11 @@ class WSJArticleIE(InfoExtractor):
             'uploader_id': 'ralcaraz',
             'title': 'Bao Bao the Panda Leaves for China',
         }
-    }]
+    }
 
     def _real_extract(self, url):
         article_id = self._match_id(url)
         webpage = self._download_webpage(url, article_id)
-        video_id = self._search_regex(r'data-src=["\']([A-Z0-9\-]+)',
-                                      webpage, 'video id')
+        video_id = self._search_regex(
+            r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
         return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)