X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fbbc.py;h=1b3a33e4e3493b4e6874581d3ca3adb05eb16884;hb=05a3879f1c142cc2bf0287cde4690d8ccadcdc8f;hp=a15e6711487154f8fa6d4a57d561469947d79e38;hpb=78f9d843186977c614c5a0f6004732f5d410cd0c;p=youtube-dl diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index a15e67114..1b3a33e4e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -11,6 +11,7 @@ from ..utils import ( int_or_none, parse_duration, parse_iso8601, + remove_end, unescapeHTML, ) from ..compat import compat_HTTPError @@ -420,7 +421,7 @@ class BBCCoUkIE(InfoExtractor): continue title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) - description = description_el.text if description_el else None + description = description_el.text if description_el is not None else None def get_programme_id(item): def get_from_attributes(item): @@ -492,6 +493,9 @@ class BBCIE(BBCCoUkIE): _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' _MEDIASELECTOR_URLS = [ + # Provides HQ HLS streams but fails with geolocation in some cases when it's + # even not geo restricted at all + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', # Provides more formats, namely direct mp4 links, but fails on some videos with # notukerror for non UK (?) users (e.g. # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) @@ -501,8 +505,7 @@ class BBCIE(BBCCoUkIE): ] _TESTS = [{ - # article with multiple videos embedded with data-media-meta containing - # playlist.sxml, externalId and no direct video links + # article with multiple videos embedded with data-playable containing vpids 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', @@ -511,7 +514,7 @@ class BBCIE(BBCCoUkIE): }, 'playlist_count': 2, }, { - # article with multiple videos embedded with data-media-meta (more videos) + # article with multiple videos embedded with data-playable (more videos) 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', @@ -522,6 +525,7 @@ class BBCIE(BBCCoUkIE): 'skip': 'Save time', }, { # article with multiple videos embedded with `new SMP()` + # broken 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', 'info_dict': { 'id': '3662a707-0af9-3149-963f-47bea720b460', @@ -529,12 +533,13 @@ class BBCIE(BBCCoUkIE): }, 'playlist_count': 18, }, { - # single video embedded with mediaAssetPage.init() + # single video embedded with data-playable containing vpid 'url': 'http://www.bbc.com/news/world-europe-32041533', 'info_dict': { 'id': 'p02mprgb', 'ext': 'mp4', 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'md5:2868290467291b37feda7863f7a83f54', 'duration': 47, 'timestamp': 1427219242, 'upload_date': '20150324', @@ -544,15 +549,14 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { - # article with single video embedded with data-media-meta containing - # direct video links (for now these are extracted) and playlist.xml (with - # media items as f4m and m3u8 - currently unsupported) + # article with single video embedded with data-playable containing XML playlist + # with direct video links as progressiveDownloadUrl (for now these are extracted) + # and playlist with f4m and m3u8 as streamingUrl 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', 'info_dict': { 'id': '150615_telabyad_kentin_cogu', 'ext': 'mp4', 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", - 'duration': 47, 'timestamp': 1434397334, 'upload_date': '20150615', }, @@ -560,26 +564,12 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { - # single video embedded with playlist.sxml in data-playable - 'url': 'http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani', - 'info_dict': { - 'id': '151010_vid_ankara_patlama_ani', - 'ext': 'mp4', - 'title': "Ankara'da patlama anı", - 'timestamp': 1444480325, - 'upload_date': '20151010', - }, - 'params': { - 'skip_download': True, - } - }, { - # single video embedded with mediaAssetPage.init() (regional section) + # single video embedded with data-playable containing XML playlists (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'info_dict': { 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', - 'duration': 87, 'timestamp': 1434713142, 'upload_date': '20150619', }, @@ -621,21 +611,20 @@ class BBCIE(BBCCoUkIE): 'ext': 'mp4', 'title': 'Hyundai Santa Fe Sport: Rock star', 'description': 'md5:b042a26142c4154a6e472933cf20793d', - 'timestamp': 1368473503, - 'upload_date': '20130513', + 'timestamp': 1415867444, + 'upload_date': '20141113', }, 'params': { # rtmp download 'skip_download': True, } }, { - # single video with playlist.sxml URL + # single video with playlist.sxml URL in playlist param 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', 'ext': 'mp4', 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:398fca0e2e701c609d726e034fa1fc89', 'duration': 140, }, 'params': { @@ -643,7 +632,7 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { - # article with multiple videos embedded with playlist.sxml + # article with multiple videos embedded with playlist.sxml in playlist param 'url': 'http://www.bbc.com/sport/0/football/34475836', 'info_dict': { 'id': '34475836', @@ -710,11 +699,26 @@ class BBCIE(BBCCoUkIE): webpage = self._download_webpage(url, playlist_id) - timestamp = parse_iso8601(self._search_regex( - [r'"datePublished":\s*"([^"]+)', - r']+property="article:published_time"[^>]+content="([^"]+)"', - r'itemprop="datePublished"[^>]+datetime="([^"]+)"'], - webpage, 'date', default=None)) + timestamp = None + playlist_title = None + playlist_description = None + + ld = self._parse_json( + self._search_regex( + r'(?s)', + webpage, 'ld json', default='{}'), + playlist_id, fatal=False) + if ld: + timestamp = parse_iso8601(ld.get('datePublished')) + playlist_title = ld.get('headline') + playlist_description = ld.get('articleBody') + + if not timestamp: + timestamp = parse_iso8601(self._search_regex( + [r']+property="article:published_time"[^>]+content="([^"]+)"', + r'itemprop="datePublished"[^>]+datetime="([^"]+)"', + r'"datePublished":\s*"([^"]+)'], + webpage, 'date', default=None)) entries = [] @@ -767,8 +771,8 @@ class BBCIE(BBCCoUkIE): playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) if entries: - playlist_title = self._og_search_title(webpage) - playlist_description = self._og_search_description(webpage, default=None) + playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News') + playlist_description = playlist_description or self._og_search_description(webpage, default=None) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) @@ -794,7 +798,6 @@ class BBCIE(BBCCoUkIE): 'title': title, 'description': description, 'timestamp': timestamp, - 'duration': duration, 'formats': formats, 'subtitles': subtitles, }