X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fbbc.py;h=9d0dfb9611687b15075d0e6fc7d57dfa0244c60a;hb=e7d20845686e0a35fc386238713ea4bdde89bdcf;hp=b98db95b90e0c22d987c62f85c2771bb4851a7c9;hpb=ae8bdfd1a1548c83ab7df378096da927b5374a29;p=youtube-dl diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index b98db95b9..9d0dfb961 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -14,18 +13,32 @@ from ..utils import ( remove_end, unescapeHTML, ) -from ..compat import compat_HTTPError +from ..compat import ( + compat_etree_fromstring, + compat_HTTPError, +) class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' + _ID_REGEX = r'[pb][\da-z]{7}' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?bbc\.co\.uk/ + (?: + programmes/(?!articles/)| + iplayer(?:/[^/]+)?/(?:episode/|playlist/)| + music/clips[/#]| + radio/player/ + ) + (?P%s) + ''' % _ID_REGEX _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails # with geolocation in some cases when it's even not geo restricted at all (e.g. - # http://www.bbc.co.uk/programmes/b06bp7lf) + # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', ] @@ -44,9 +57,8 @@ class BBCCoUkIE(InfoExtractor): 'info_dict': { 'id': 'b039d07m', 'ext': 'flv', - 'title': 'Kaleidoscope, Leonard Cohen', + 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4', 'description': 'The Canadian poet and songwriter reflects on his musical career.', - 'duration': 1740, }, 'params': { # rtmp download @@ -74,7 +86,7 @@ class BBCCoUkIE(InfoExtractor): 'id': 'b00yng1d', 'ext': 'flv', 'title': 'The Voice UK: Series 3: Blind Auditions 5', - 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.", + 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.', 'duration': 5100, }, 'params': { @@ -109,16 +121,17 @@ class BBCCoUkIE(InfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', }, { - 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', + 'url': 'http://www.bbc.co.uk/music/clips/p022h44b', 'note': 'Audio', 'info_dict': { - 'id': 'p02frcch', + 'id': 'p022h44j', 'ext': 'flv', - 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', - 'description': 'French house superstar Madeon takes us out of the club and onto the after party.', - 'duration': 3507, + 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances', + 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.", + 'duration': 227, }, 'params': { # rtmp download @@ -169,13 +182,25 @@ class BBCCoUkIE(InfoExtractor): }, { # iptv-all mediaset fails with geolocation however there is no geo restriction # for this programme at all - 'url': 'http://www.bbc.co.uk/programmes/b06bp7lf', + 'url': 'http://www.bbc.co.uk/programmes/b06rkn85', 'info_dict': { - 'id': 'b06bp7kf', + 'id': 'b06rkms3', 'ext': 'flv', - 'title': "Annie Mac's Friday Night, B.Traits sits in for Annie", - 'description': 'B.Traits sits in for Annie Mac with a Mini-Mix from Disclosure.', - 'duration': 10800, + 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1", + 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!", + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + # compact player (https://github.com/rg3/youtube-dl/issues/8147) + 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', + 'info_dict': { + 'id': 'p028bfkj', + 'ext': 'flv', + 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', }, 'params': { # rtmp download @@ -190,6 +215,9 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf', + 'only_matching': True, } ] @@ -220,11 +248,9 @@ class BBCCoUkIE(InfoExtractor): elif transfer_format == 'dash': pass elif transfer_format == 'hls': - m3u8_formats = self._extract_m3u8_formats( + formats.extend(self._extract_m3u8_formats( href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=supplier, fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + m3u8_id=supplier, fatal=False)) # Direct link else: formats.append({ @@ -332,7 +358,7 @@ class BBCCoUkIE(InfoExtractor): return self._download_media_selector_url( mediaselector_url % programme_id, programme_id) except BBCCoUkIE.MediaSelectionError as e: - if e.id in ('notukerror', 'geolocation'): + if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): last_exception = e continue self._raise_extractor_error(e) @@ -343,8 +369,8 @@ class BBCCoUkIE(InfoExtractor): media_selection = self._download_xml( url, programme_id, 'Downloading media selection XML') except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404): + media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) else: raise return self._process_media_selector(media_selection, programme_id) @@ -421,7 +447,7 @@ class BBCCoUkIE(InfoExtractor): continue title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) - description = description_el.text if description_el else None + description = description_el.text if description_el is not None else None def get_programme_id(item): def get_from_attributes(item): @@ -451,6 +477,7 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') programme_id = None + duration = None tviplayer = self._search_regex( r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', @@ -463,14 +490,19 @@ class BBCCoUkIE(InfoExtractor): if not programme_id: programme_id = self._search_regex( - r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) + r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None) if programme_id: formats, subtitles = self._download_media_selector(programme_id) - title = self._og_search_title(webpage) + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + (r']+id="parent-title"[^>]*>(.+?)', + r']+class="info"[^>]*>\s*

(.+?)

'), webpage, 'title') description = self._search_regex( - r'

([^<]+)

', - webpage, 'description', fatal=False) + (r'

([^<]+)

', + r']+class="info_+synopsis"[^>]*>([^<]+)'), + webpage, 'description', default=None) + if not description: + description = self._html_search_meta('description', webpage) else: programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) @@ -493,6 +525,9 @@ class BBCIE(BBCCoUkIE): _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' _MEDIASELECTOR_URLS = [ + # Provides HQ HLS streams but fails with geolocation in some cases when it's + # even not geo restricted at all + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', # Provides more formats, namely direct mp4 links, but fails on some videos with # notukerror for non UK (?) users (e.g. # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) @@ -534,8 +569,9 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/news/world-europe-32041533', 'info_dict': { 'id': 'p02mprgb', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'md5:2868290467291b37feda7863f7a83f54', 'duration': 47, 'timestamp': 1427219242, 'upload_date': '20150324', @@ -577,9 +613,10 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', 'info_dict': { 'id': 'p02w6qjc', - 'ext': 'flv', + 'ext': 'mp4', 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', 'duration': 56, + 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', }, 'params': { 'skip_download': True, @@ -604,7 +641,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', 'info_dict': { 'id': 'p018zqqg', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Hyundai Santa Fe Sport: Rock star', 'description': 'md5:b042a26142c4154a6e472933cf20793d', 'timestamp': 1415867444, @@ -619,8 +656,9 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', 'duration': 140, }, 'params': { @@ -647,7 +685,7 @@ class BBCIE(BBCCoUkIE): @classmethod def suitable(cls, url): - return False if BBCCoUkIE.suitable(url) else super(BBCIE, cls).suitable(url) + return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url) def _extract_from_media_meta(self, media_meta, video_id): # Direct links to media in media metadata (e.g. @@ -695,25 +733,16 @@ class BBCIE(BBCCoUkIE): webpage = self._download_webpage(url, playlist_id) - timestamp = None - playlist_title = None - playlist_description = None - - ld = self._parse_json( - self._search_regex( - r'(?s)', - webpage, 'ld json', default='{}'), - playlist_id, fatal=False) - if ld: - timestamp = parse_iso8601(ld.get('datePublished')) - playlist_title = ld.get('headline') - playlist_description = ld.get('articleBody') + json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) + timestamp = json_ld_info.get('timestamp') + playlist_title = json_ld_info.get('title') + playlist_description = json_ld_info.get('description') if not timestamp: timestamp = parse_iso8601(self._search_regex( [r']+property="article:published_time"[^>]+content="([^"]+)"', r'itemprop="datePublished"[^>]+datetime="([^"]+)"', - r'"datePublished":\s*"([^"]+)',], + r'"datePublished":\s*"([^"]+)'], webpage, 'date', default=None)) entries = [] @@ -721,6 +750,7 @@ class BBCIE(BBCCoUkIE): # article with multiple videos embedded with playlist.sxml (e.g. # http://www.bbc.com/sport/0/football/34475836) playlists = re.findall(r']+name="playlist"[^>]+value="([^"]+)"', webpage) + playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage)) if playlists: entries = [ self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) @@ -773,8 +803,9 @@ class BBCIE(BBCCoUkIE): # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( - [r'data-video-player-vpid="([\da-z]{8})"', - r']+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], + [r'data-video-player-vpid="(%s)"' % self._ID_REGEX, + r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, + r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], webpage, 'vpid', default=None) if programme_id: @@ -809,7 +840,7 @@ class BBCIE(BBCCoUkIE): # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) - EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?' + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX entries = [] for match in extract_all(r'new\s+SMP\(({.+?})\)'): embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') @@ -898,3 +929,33 @@ class BBCIE(BBCCoUkIE): }) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + +class BBCCoUkArticleIE(InfoExtractor): + _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + IE_NAME = 'bbc.co.uk:article' + IE_DESC = 'BBC articles' + + _TEST = { + 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer', + 'info_dict': { + 'id': '3jNQLTMrPlYGTBn0WV6M2MS', + 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four', + 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.', + }, + 'playlist_count': 4, + 'add_ie': ['BBCCoUk'], + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage).strip() + + entries = [self.url_result(programme_url) for programme_url in re.findall( + r']+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] + + return self.playlist_result(entries, playlist_id, title, description)