[nfl] Support team micro-sites (fixes #3831)
author Naglis Jonaitis <njonaitis@gmail.com>
Sun, 28 Sep 2014 18:48:26 +0000 (21:48 +0300)
committer Naglis Jonaitis <njonaitis@gmail.com>
Sun, 28 Sep 2014 18:48:26 +0000 (21:48 +0300)
youtube_dl/extractor/nfl.py

index 668d99512904f4154bda4c73f8d6698b3ea08e03..4832b3ce4b765d332ea8e827205547bd062c400c 100644 (file)
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    compat_urllib_parse,
     int_or_none,
     remove_end,
 )
@@ -13,76 +14,116 @@ from ..utils import (
 
 class NFLIE(InfoExtractor):
     IE_NAME = 'nfl.com'
-    _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)'
-    _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json'
-    _TEST = {
-        'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
-        'md5': '394ef771ddcd1354f665b471d78ec4c6',
-        'info_dict': {
-            'id': '0ap3000000398478',
-            'ext': 'mp4',
-            'title': 'Week 3: Redskins vs. Eagles highlights',
-            'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
-            'upload_date': '20140921',
-            'timestamp': 1411337580,
-            'thumbnail': 're:^https?://.*\.jpg$',
+    _VALID_URL = r'''(?x)https?://
+        (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
+        (?:.+?/)*
+        (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
+    _TESTS = [
+        {
+            'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
+            'md5': '394ef771ddcd1354f665b471d78ec4c6',
+            'info_dict': {
+                'id': '0ap3000000398478',
+                'ext': 'mp4',
+                'title': 'Week 3: Redskins vs. Eagles highlights',
+                'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
+                'upload_date': '20140921',
+                'timestamp': 1411337580,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            }
+        },
+        {
+            'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
+            'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
+            'info_dict': {
+                'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
+                'ext': 'mp4',
+                'title': 'LIVE: Post Game vs. Browns',
+                'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
+                'upload_date': '20131229',
+                'timestamp': 1388354455,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            }
+        }
+    ]
+
+    @staticmethod
+    def prepend_host(host, url):
+        if not url.startswith('http'):
+            if not url.startswith('/'):
+                url = '/%s' % url
+            url = 'http://{0:}{1:}'.format(host, url)
+        return url
+
+    @staticmethod
+    def format_from_stream(stream, protocol, host, path_prefix='',
+                           preference=0, note=None):
+        url = '{protocol:}://{host:}/{prefix:}{path:}'.format(
+            protocol=protocol,
+            host=host,
+            prefix=path_prefix,
+            path=stream.get('path'),
+        )
+        return {
+            'url': url,
+            'vbr': int_or_none(stream.get('rate', 0), 1000),
+            'preference': preference,
+            'format_note': note,
         }
-    }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id, host = mobj.group('id'), mobj.group('host')
 
-        config = self._download_json(self._PLAYER_CONFIG_URL, video_id,
-                                     note='Downloading player config')
-        url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config)
-        video_data = self._download_json(url_template.format(id=video_id), video_id)
+        webpage = self._download_webpage(url, video_id)
 
-        cdns = config.get('cdns')
-        if not cdns:
-            raise ExtractorError('Failed to get CDN data', expected=True)
+        config_url = NFLIE.prepend_host(host, self._search_regex(
+            r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL'))
+        config = self._download_json(config_url, video_id,
+                                     note='Downloading player config')
+        url_template = NFLIE.prepend_host(
+            host, '{contentURLTemplate:}'.format(**config))
+        video_data = self._download_json(
+            url_template.format(id=video_id), video_id)
 
         formats = []
-        streams = video_data.get('cdnData', {}).get('bitrateInfo', [])
-        for name, cdn in cdns.items():
-            # LimeLight streams don't seem to work
-            if cdn.get('name') == 'LIMELIGHT':
-                continue
-
-            protocol = cdn.get('protocol')
-            host = remove_end(cdn.get('host', ''), '/')
-            if not (protocol and host):
-                continue
-
-            path_prefix = cdn.get('pathprefix', '')
-            if path_prefix and not path_prefix.endswith('/'):
-                path_prefix = '%s/' % path_prefix
-
-            get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format(
-                protocol=protocol,
-                host=host,
-                prefix=path_prefix,
-                path=p,
-            )
-
-            if protocol == 'rtmp':
-                preference = -1
-            elif 'prog' in name.lower():
-                preference = 1
-            else:
-                preference = 0
-
+        cdn_data = video_data.get('cdnData', {})
+        streams = cdn_data.get('bitrateInfo', [])
+        if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':
+            parts = compat_urllib_parse.urlparse(cdn_data.get('uri'))
+            protocol, host = parts.scheme, parts.netloc
             for stream in streams:
-                path = stream.get('path')
-                if not path:
+                formats.append(
+                    NFLIE.format_from_stream(stream, protocol, host))
+        else:
+            cdns = config.get('cdns')
+            if not cdns:
+                raise ExtractorError('Failed to get CDN data', expected=True)
+
+            for name, cdn in cdns.items():
+                # LimeLight streams don't seem to work
+                if cdn.get('name') == 'LIMELIGHT':
+                    continue
+
+                protocol = cdn.get('protocol')
+                host = remove_end(cdn.get('host', ''), '/')
+                if not (protocol and host):
                     continue
 
-                formats.append({
-                    'url': get_url(path),
-                    'vbr': int_or_none(stream.get('rate', 0), 1000),
-                    'preference': preference,
-                    'format_note': name,
-                })
+                prefix = cdn.get('pathprefix', '')
+                if prefix and not prefix.endswith('/'):
+                    prefix = '%s/' % prefix
+
+                preference = 0
+                if protocol == 'rtmp':
+                    preference = -2
+                elif 'prog' in name.lower():
+                    preference = 1
+
+                for stream in streams:
+                    formats.append(
+                        NFLIE.format_from_stream(stream, protocol, host,
+                                                 prefix, preference, name))
 
         self._sort_formats(formats)