[nbcnews] Support embed widgets
[youtube-dl] / youtube_dl / extractor / nbc.py
index 43d75d3cadf7d134fd4688294824ea0b6b6b35a3..6b7da114961c2e355cddfec59a573bcbe3211b71 100644 (file)
@@ -27,6 +27,9 @@ class NBCIE(InfoExtractor):
                 'ext': 'mp4',
                 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
                 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
+                'timestamp': 1424246400,
+                'upload_date': '20150218',
+                'uploader': 'NBCU-COM',
             },
             'params': {
                 # m3u8 download
@@ -50,6 +53,9 @@ class NBCIE(InfoExtractor):
                 'ext': 'mp4',
                 'title': 'Star Wars Teaser',
                 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
+                'timestamp': 1417852800,
+                'upload_date': '20141206',
+                'uploader': 'NBCU-COM',
             },
             'params': {
                 # m3u8 download
@@ -61,6 +67,23 @@ class NBCIE(InfoExtractor):
             # This video has expired but with an escaped embedURL
             'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
             'only_matching': True,
+        },
+        {
+            # HLS streams require the 'hdnea3' cookie
+            'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
+            'info_dict': {
+                'id': 'n1806',
+                'ext': 'mp4',
+                'title': 'Goliath',
+                'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.',
+                'timestamp': 1237100400,
+                'upload_date': '20090315',
+                'uploader': 'NBCU-COM',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'Only works from US',
         }
     ]
 
@@ -78,6 +101,7 @@ class NBCIE(InfoExtractor):
             theplatform_url = 'http:' + theplatform_url
         return {
             '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
             'url': smuggle_url(theplatform_url, {'source_url': url}),
             'id': video_id,
         }
@@ -93,6 +117,9 @@ class NBCSportsVPlayerIE(InfoExtractor):
             'ext': 'flv',
             'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
             'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+            'timestamp': 1426270238,
+            'upload_date': '20150313',
+            'uploader': 'NBCU-SPORTS',
         }
     }, {
         'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z',
@@ -124,6 +151,9 @@ class NBCSportsIE(InfoExtractor):
             'ext': 'flv',
             'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
             'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
+            'uploader': 'NBCU-SPORTS',
+            'upload_date': '20150330',
+            'timestamp': 1427726529,
         }
     }
 
@@ -144,6 +174,9 @@ class CSNNEIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.',
             'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3',
+            'timestamp': 1459369979,
+            'upload_date': '20160330',
+            'uploader': 'NBCU-SPORTS',
         }
     }
 
@@ -159,7 +192,7 @@ class CSNNEIE(InfoExtractor):
 
 
 class NBCNewsIE(ThePlatformIE):
-    _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
+    _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today)\.com/
         (?:video/.+?/(?P<id>\d+)|
         ([^/]+/)*(?P<display_id>[^/?]+))
         '''
@@ -217,10 +250,27 @@ class NBCNewsIE(ThePlatformIE):
             },
             'expected_warnings': ['http-6000 is not available']
         },
+        {
+            'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788',
+            'md5': '118d7ca3f0bea6534f119c68ef539f71',
+            'info_dict': {
+                'id': '669831235788',
+                'ext': 'mp4',
+                'title': 'See the aurora borealis from space in stunning new NASA video',
+                'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',
+                'upload_date': '20160420',
+                'timestamp': 1461152093,
+            },
+        },
         {
             'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952',
             'only_matching': True,
         },
+        {
+            # From http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html
+            'url': 'http://www.nbcnews.com/widget/video-embed/701714499682',
+            'only_matching': True,
+        },
     ]
 
     def _real_extract(self, url):
@@ -244,15 +294,17 @@ class NBCNewsIE(ThePlatformIE):
             webpage = self._download_webpage(url, display_id)
             info = None
             bootstrap_json = self._search_regex(
-                r'(?m)var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$',
+                [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$',
+                 r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'],
                 webpage, 'bootstrap json', default=None)
-            if bootstrap_json:
-                bootstrap = self._parse_json(bootstrap_json, display_id)
+            bootstrap = self._parse_json(
+                bootstrap_json, display_id, transform_source=unescapeHTML)
+            if 'results' in bootstrap:
                 info = bootstrap['results'][0]['video']
+            elif 'video' in bootstrap:
+                info = bootstrap['video']
             else:
-                player_instance_json = self._search_regex(
-                    r'videoObj\s*:\s*({.+})', webpage, 'player instance')
-                info = self._parse_json(player_instance_json, display_id)
+                info = bootstrap
             video_id = info['mpxId']
             title = info['title']
 
@@ -282,7 +334,7 @@ class NBCNewsIE(ThePlatformIE):
                     formats.extend(tp_formats)
                     subtitles = self._merge_subtitles(subtitles, tp_subtitles)
                 else:
-                    tbr = int_or_none(video_asset.get('bitRate'), 1000)
+                    tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000)
                     format_id = 'http%s' % ('-%d' % tbr if tbr else '')
                     video_url = update_url_query(
                         video_url, {'format': 'redirect'})
@@ -308,10 +360,9 @@ class NBCNewsIE(ThePlatformIE):
                 'id': video_id,
                 'title': title,
                 'description': info.get('description'),
-                'thumbnail': info.get('description'),
                 'thumbnail': info.get('thumbnail'),
                 'duration': int_or_none(info.get('duration')),
-                'timestamp': parse_iso8601(info.get('pubDate')),
+                'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')),
                 'formats': formats,
                 'subtitles': subtitles,
             }
@@ -331,6 +382,7 @@ class MSNBCIE(InfoExtractor):
             'thumbnail': 're:^https?://.*\.jpg$',
             'timestamp': 1406937606,
             'upload_date': '20140802',
+            'uploader': 'NBCU-NEWS',
             'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
         },
     }