- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- if video_id is not None:
- all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
- info = all_info.find('video')
-
- return {
- 'id': video_id,
- 'title': info.find('headline').text,
- 'ext': 'flv',
- 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
- 'description': info.find('caption').text,
- 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
- }
- else:
- # "feature" and "nightly-news" pages use theplatform.com
- display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, display_id)
- info = None
- bootstrap_json = self._search_regex(
- r'(?m)var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$',
- webpage, 'bootstrap json', default=None)
- if bootstrap_json:
- bootstrap = self._parse_json(bootstrap_json, display_id)
- info = bootstrap['results'][0]['video']
- else:
- player_instance_json = self._search_regex(
- r'videoObj\s*:\s*({.+})', webpage, 'player instance', default=None)
- if not player_instance_json:
- player_instance_json = self._html_search_regex(
- r'data-video="([^"]+)"', webpage, 'video json')
- info = self._parse_json(player_instance_json, display_id)
- video_id = info['mpxId']
- title = info['title']
-
- subtitles = {}
- caption_links = info.get('captionLinks')
- if caption_links:
- for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')):
- sub_url = caption_links.get(sub_key)
- if sub_url:
- subtitles.setdefault('en', []).append({
- 'url': sub_url,
- 'ext': sub_ext,
- })
-
- formats = []
- for video_asset in info['videoAssets']:
- video_url = video_asset.get('publicUrl')
- if not video_url:
- continue
- container = video_asset.get('format')
- asset_type = video_asset.get('assetType') or ''
- if container == 'ISM' or asset_type == 'FireTV-Once':
- continue
- elif asset_type == 'OnceURL':
- tp_formats, tp_subtitles = self._extract_theplatform_smil(
- video_url, video_id)
- formats.extend(tp_formats)
- subtitles = self._merge_subtitles(subtitles, tp_subtitles)
- else:
- tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000)
- format_id = 'http%s' % ('-%d' % tbr if tbr else '')
- video_url = update_url_query(
- video_url, {'format': 'redirect'})
- # resolve the url so that we can check availability and detect the correct extension
- head = self._request_webpage(
- HEADRequest(video_url), video_id,
- 'Checking %s url' % format_id,
- '%s is not available' % format_id,
- fatal=False)
- if head:
- video_url = head.geturl()
- formats.append({
- 'format_id': format_id,
- 'url': video_url,
- 'width': int_or_none(video_asset.get('width')),
- 'height': int_or_none(video_asset.get('height')),
- 'tbr': tbr,
- 'container': video_asset.get('format'),
- })
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': info.get('description'),
- 'thumbnail': info.get('thumbnail'),
- 'duration': int_or_none(info.get('duration')),
- 'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')),
- 'formats': formats,
- 'subtitles': subtitles,
- }
-
-
-class MSNBCIE(InfoExtractor):
- # https URLs redirect to corresponding http ones
- _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)'
+ video_id = self._match_id(url)
+ if not video_id.isdigit():
+ webpage = self._download_webpage(url, video_id)
+
+ data = self._parse_json(self._search_regex(
+ r'window\.__data\s*=\s*({.+});', webpage,
+ 'bootstrap json'), video_id)
+ video_id = data['article']['content'][0]['primaryMedia']['video']['mpxMetadata']['id']
+
+ return {
+ '_type': 'url_transparent',
+ 'id': video_id,
+ # http://feed.theplatform.com/f/2E2eJC/nbcnews also works
+ 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {'byId': video_id}),
+ 'ie_key': 'ThePlatformFeed',
+ }
+
+
+class NBCOlympicsIE(InfoExtractor):
+ IE_NAME = 'nbcolympics'
+ _VALID_URL = r'https?://www\.nbcolympics\.com/video/(?P<id>[a-z-]+)'
+