X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fbbc.py;h=e62b3860e99b106d08ef79cf593e180fe8c9496c;hb=2e7e561c1d9dedf1a8e5a206e1ef86cfa4599956;hp=f3ded3f9cf3d958991efb2adb9d7643eb5dab4b0;hpb=55ebae26f937a6a3a1fcb78d2f797972080f8b88;p=youtube-dl diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index f3ded3f9c..e62b3860e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -11,21 +10,34 @@ from ..utils import ( int_or_none, parse_duration, parse_iso8601, - remove_end, unescapeHTML, ) -from ..compat import compat_HTTPError +from ..compat import ( + compat_etree_fromstring, + compat_HTTPError, +) class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' + _ID_REGEX = r'[pb][\da-z]{7}' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?bbc\.co\.uk/ + (?: + programmes/(?!articles/)| + iplayer(?:/[^/]+)?/(?:episode/|playlist/)| + music/clips[/#]| + radio/player/ + ) + (?P%s) + ''' % _ID_REGEX _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails # with geolocation in some cases when it's even not geo restricted at all (e.g. - # http://www.bbc.co.uk/programmes/b06bp7lf) + # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', ] @@ -44,9 +56,8 @@ class BBCCoUkIE(InfoExtractor): 'info_dict': { 'id': 'b039d07m', 'ext': 'flv', - 'title': 'Kaleidoscope, Leonard Cohen', + 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4', 'description': 'The Canadian poet and songwriter reflects on his musical career.', - 'duration': 1740, }, 'params': { # rtmp download @@ -74,7 +85,7 @@ class BBCCoUkIE(InfoExtractor): 'id': 'b00yng1d', 'ext': 'flv', 'title': 'The Voice UK: Series 3: Blind Auditions 5', - 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.", + 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.', 'duration': 5100, }, 'params': { @@ -109,16 +120,17 @@ class BBCCoUkIE(InfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', }, { - 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', + 'url': 'http://www.bbc.co.uk/music/clips/p022h44b', 'note': 'Audio', 'info_dict': { - 'id': 'p02frcch', + 'id': 'p022h44j', 'ext': 'flv', - 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', - 'description': 'French house superstar Madeon takes us out of the club and onto the after party.', - 'duration': 3507, + 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances', + 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.", + 'duration': 227, }, 'params': { # rtmp download @@ -169,13 +181,25 @@ class BBCCoUkIE(InfoExtractor): }, { # iptv-all mediaset fails with geolocation however there is no geo restriction # for this programme at all - 'url': 'http://www.bbc.co.uk/programmes/b06bp7lf', + 'url': 'http://www.bbc.co.uk/programmes/b06rkn85', 'info_dict': { - 'id': 'b06bp7kf', + 'id': 'b06rkms3', 'ext': 'flv', - 'title': "Annie Mac's Friday Night, B.Traits sits in for Annie", - 'description': 'B.Traits sits in for Annie Mac with a Mini-Mix from Disclosure.', - 'duration': 10800, + 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1", + 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!", + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + # compact player (https://github.com/rg3/youtube-dl/issues/8147) + 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', + 'info_dict': { + 'id': 'p028bfkj', + 'ext': 'flv', + 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', }, 'params': { # rtmp download @@ -190,6 +214,9 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf', + 'only_matching': True, } ] @@ -220,11 +247,9 @@ class BBCCoUkIE(InfoExtractor): elif transfer_format == 'dash': pass elif transfer_format == 'hls': - m3u8_formats = self._extract_m3u8_formats( + formats.extend(self._extract_m3u8_formats( href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=supplier, fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + m3u8_id=supplier, fatal=False)) # Direct link else: formats.append({ @@ -332,7 +357,7 @@ class BBCCoUkIE(InfoExtractor): return self._download_media_selector_url( mediaselector_url % programme_id, programme_id) except BBCCoUkIE.MediaSelectionError as e: - if e.id in ('notukerror', 'geolocation'): + if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): last_exception = e continue self._raise_extractor_error(e) @@ -343,8 +368,8 @@ class BBCCoUkIE(InfoExtractor): media_selection = self._download_xml( url, programme_id, 'Downloading media selection XML') except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404): + media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) else: raise return self._process_media_selector(media_selection, programme_id) @@ -421,7 +446,7 @@ class BBCCoUkIE(InfoExtractor): continue title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) - description = description_el.text if description_el else None + description = description_el.text if description_el is not None else None def get_programme_id(item): def get_from_attributes(item): @@ -451,6 +476,7 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') programme_id = None + duration = None tviplayer = self._search_regex( r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', @@ -463,14 +489,19 @@ class BBCCoUkIE(InfoExtractor): if not programme_id: programme_id = self._search_regex( - r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) + r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None) if programme_id: formats, subtitles = self._download_media_selector(programme_id) - title = self._og_search_title(webpage) + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + (r']+id="parent-title"[^>]*>(.+?)', + r']+class="info"[^>]*>\s*

(.+?)

'), webpage, 'title') description = self._search_regex( - r'

([^<]+)

', - webpage, 'description', fatal=False) + (r'

([^<]+)

', + r']+class="info_+synopsis"[^>]*>([^<]+)'), + webpage, 'description', default=None) + if not description: + description = self._html_search_meta('description', webpage) else: programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) @@ -529,7 +560,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', 'info_dict': { 'id': '3662a707-0af9-3149-963f-47bea720b460', - 'title': 'BBC Blogs - Adam Curtis - BUGGER', + 'title': 'BUGGER', }, 'playlist_count': 18, }, { @@ -584,6 +615,7 @@ class BBCIE(BBCCoUkIE): 'ext': 'mp4', 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', 'duration': 56, + 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', }, 'params': { 'skip_download': True, @@ -625,6 +657,7 @@ class BBCIE(BBCCoUkIE): 'id': 'p02xycnp', 'ext': 'mp4', 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', 'duration': 140, }, 'params': { @@ -636,9 +669,17 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/sport/0/football/34475836', 'info_dict': { 'id': '34475836', - 'title': 'What Liverpool can expect from Klopp', + 'title': 'Jurgen Klopp: Furious football from a witty and winning coach', }, 'playlist_count': 3, + }, { + # school report article with single video + 'url': 'http://www.bbc.co.uk/schoolreport/35744779', + 'info_dict': { + 'id': '35744779', + 'title': 'School which breaks down barriers in Jerusalem', + }, + 'playlist_count': 1, }, { # single video with playlist URL from weather section 'url': 'http://www.bbc.com/weather/features/33601775', @@ -651,7 +692,7 @@ class BBCIE(BBCCoUkIE): @classmethod def suitable(cls, url): - return False if BBCCoUkIE.suitable(url) else super(BBCIE, cls).suitable(url) + return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url) def _extract_from_media_meta(self, media_meta, video_id): # Direct links to media in media metadata (e.g. @@ -699,25 +740,25 @@ class BBCIE(BBCCoUkIE): webpage = self._download_webpage(url, playlist_id) - timestamp = None - playlist_title = None - playlist_description = None + json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) + timestamp = json_ld_info.get('timestamp') + + playlist_title = json_ld_info.get('title') + if not playlist_title: + playlist_title = self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'(.+?)', webpage, 'playlist title', default=None) + if playlist_title: + playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() - ld = self._parse_json( - self._search_regex( - r'(?s)', - webpage, 'ld json', default='{}'), - playlist_id, fatal=False) - if ld: - timestamp = parse_iso8601(ld.get('datePublished')) - playlist_title = ld.get('headline') - playlist_description = ld.get('articleBody') + playlist_description = json_ld_info.get( + 'description') or self._og_search_description(webpage, default=None) if not timestamp: timestamp = parse_iso8601(self._search_regex( [r']+property="article:published_time"[^>]+content="([^"]+)"', r'itemprop="datePublished"[^>]+datetime="([^"]+)"', - r'"datePublished":\s*"([^"]+)',], + r'"datePublished":\s*"([^"]+)'], webpage, 'date', default=None)) entries = [] @@ -725,6 +766,7 @@ class BBCIE(BBCCoUkIE): # article with multiple videos embedded with playlist.sxml (e.g. # http://www.bbc.com/sport/0/football/34475836) playlists = re.findall(r']+name="playlist"[^>]+value="([^"]+)"', webpage) + playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage)) if playlists: entries = [ self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) @@ -771,14 +813,13 @@ class BBCIE(BBCCoUkIE): playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) if entries: - playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News') - playlist_description = playlist_description or self._og_search_description(webpage, default=None) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( - [r'data-video-player-vpid="([\da-z]{8})"', - r']+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], + [r'data-video-player-vpid="(%s)"' % self._ID_REGEX, + r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, + r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], webpage, 'vpid', default=None) if programme_id: @@ -802,10 +843,6 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } - playlist_title = self._html_search_regex( - r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title') - playlist_description = self._og_search_description(webpage, default=None) - def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), @@ -813,7 +850,7 @@ class BBCIE(BBCCoUkIE): # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) - EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?' + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX entries = [] for match in extract_all(r'new\s+SMP\(({.+?})\)'): embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') @@ -902,3 +939,33 @@ class BBCIE(BBCCoUkIE): }) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + +class BBCCoUkArticleIE(InfoExtractor): + _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + IE_NAME = 'bbc.co.uk:article' + IE_DESC = 'BBC articles' + + _TEST = { + 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer', + 'info_dict': { + 'id': '3jNQLTMrPlYGTBn0WV6M2MS', + 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four', + 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.', + }, + 'playlist_count': 4, + 'add_ie': ['BBCCoUk'], + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage).strip() + + entries = [self.url_result(programme_url) for programme_url in re.findall( + r']+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] + + return self.playlist_result(entries, playlist_id, title, description)