- list_id = self._match_id(url)
- webpage = self._download_webpage(url, list_id)
-
- list_title = self._html_search_regex(r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'list title')
-
- pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None)
- if pubdate:
- pubdate = pubdate.replace('-','')
-
- ret = []
- jsent = []
-
- # works with bbc.com/news/something-something-123456 articles
- jsent = map(
- lambda m: self._parse_json(m,list_id),
- re.findall(r"data-media-meta='({[^']+})'", webpage)
- )
-
- if len(jsent) == 0:
- # http://www.bbc.com/news/video_and_audio/international
- # and single-video articles
- masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None)
- if masset:
- jmasset = self._parse_json(masset,list_id)
- for key, val in jmasset.get('videos',{}).items():
- for skey, sval in val.items():
- sval['id'] = skey
- jsent.append(sval)
-
- if len(jsent) == 0:
- # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc}
- # in http://www.bbc.com/news/video_and_audio/international
- # prone to breaking if entries have sourceFiles list
- jsent = map(
- lambda m: self._parse_json(m,list_id),
- re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage)
- )
-
- if len(jsent) == 0:
- raise ExtractorError('No video found', expected=True)
-
- for jent in jsent:
- programme_id = jent.get('externalId')
- xml_url = jent.get('href')
-
- title = jent.get('caption',list_title)
-
- duration = parse_duration(jent.get('duration'))
- description = list_title
- if jent.get('caption'):
- description += ' - ' + jent.get('caption')
- thumbnail = None
- if jent.has_key('image'):
- thumbnail=jent['image'].get('href')
-
- formats = []
- subtitles = []
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ timestamp = None
+ playlist_title = None
+ playlist_description = None
+
+ ld = self._parse_json(
+ self._search_regex(
+ r'(?s)<script type="application/ld\+json">(.+?)</script>',
+ webpage, 'ld json', default='{}'),
+ playlist_id, fatal=False)
+ if ld:
+ timestamp = parse_iso8601(ld.get('datePublished'))
+ playlist_title = ld.get('headline')
+ playlist_description = ld.get('articleBody')
+
+ if not timestamp:
+ timestamp = parse_iso8601(self._search_regex(
+ [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
+ r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
+ r'"datePublished":\s*"([^"]+)'],
+ webpage, 'date', default=None))
+
+ entries = []
+
+ # article with multiple videos embedded with playlist.sxml (e.g.
+ # http://www.bbc.com/sport/0/football/34475836)
+ playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
+ if playlists:
+ entries = [
+ self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
+ for playlist_url in playlists]
+
+ # news article with multiple videos embedded with data-playable
+ data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
+ if data_playables:
+ for _, data_playable_json in data_playables:
+ data_playable = self._parse_json(
+ unescapeHTML(data_playable_json), playlist_id, fatal=False)
+ if not data_playable:
+ continue
+ settings = data_playable.get('settings', {})
+ if settings:
+ # data-playable with video vpid in settings.playlistObject.items (e.g.
+ # http://www.bbc.com/news/world-us-canada-34473351)
+ playlist_object = settings.get('playlistObject', {})
+ if playlist_object:
+ items = playlist_object.get('items')
+ if items and isinstance(items, list):
+ title = playlist_object['title']
+ description = playlist_object.get('summary')
+ duration = int_or_none(items[0].get('duration'))
+ programme_id = items[0].get('vpid')
+ formats, subtitles = self._download_media_selector(programme_id)
+ self._sort_formats(formats)
+ entries.append({
+ 'id': programme_id,
+ 'title': title,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
+ else:
+ # data-playable without vpid but with playlist.sxml URLs
+ # in otherSettings.playlist (e.g.
+ # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
+ playlist = data_playable.get('otherSettings', {}).get('playlist', {})
+ if playlist:
+ entries.append(self._extract_from_playlist_sxml(
+ playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
+
+ if entries:
+ playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News')
+ playlist_description = playlist_description or self._og_search_description(webpage, default=None)
+ return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
+
+ # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+ programme_id = self._search_regex(
+ [r'data-video-player-vpid="(%s)"' % self._ID_REGEX,
+ r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
+ r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
+ webpage, 'vpid', default=None)