[pornhub] Extract metadata from JSON-LD (closes #26614)

[youtube-dl] / youtube_dl / extractor / jwplatform.py
diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py

index 63d0dc998cf1cf281dda3c27f3afaae84f4906c9..c34b5f5e6bd9e7d38e762f5d82f3669ac2c438a2 100644 (file)
--- a/youtube_dl/extractor/jwplatform.py
+++ b/youtube_dl/extractor/jwplatform.py
@@ -4,11 +4,12 @@ from __future__ import unicode_literals
  import re
  
  from .common import InfoExtractor
+from ..utils import unsmuggle_url
  
  
  class JWPlatformIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
-    _TEST = {
+    _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
+    _TESTS = [{
          'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
          'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
          'info_dict': {
@@ -19,7 +20,10 @@ class JWPlatformIE(InfoExtractor):
              'upload_date': '20081127',
              'timestamp': 1227796140,
          }
-    }
+    }, {
+        'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js',
+        'only_matching': True,
+    }]
  
      @staticmethod
      def _extract_url(webpage):
@@ -29,10 +33,14 @@ class JWPlatformIE(InfoExtractor):
      @staticmethod
      def _extract_urls(webpage):
          return re.findall(
-            r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//content\.jwplatform\.com/players/[a-zA-Z0-9]{8})',
+            r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})',
              webpage)
  
      def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
          video_id = self._match_id(url)
-        json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id)
+        json_data = self._download_json('https://cdn.jwplayer.com/v2/media/' + video_id, video_id)
          return self._parse_jwplayer_data(json_data, video_id)