X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2Fextractor%2Fbbc.py;h=9a1b6e3dce7dd3247b0076b36280e7e4e0550c90;hb=c3124c3085e6a9a83ee31ace3a7d528a324c42da;hp=0f0ea7cfdd382e16d667012e4244403c52701b27;hpb=cb23bcba294563857561914a19e7d06990c71829;p=youtube-dl diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 0f0ea7cfd..9a1b6e3dc 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -14,12 +14,15 @@ from ..utils import ( ) from ..compat import compat_HTTPError + class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' - _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' + _MEDIASELECTOR_URLS = [ + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + ] _TESTS = [ { @@ -161,6 +164,10 @@ class BBCCoUkIE(InfoExtractor): } ] + class MediaSelectionError(Exception): + def __init__(self, id): + self.id = id + def _extract_asx_playlist(self, connection, programme_id): asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') return [ref.get('href') for ref in asx.findall('./Entry/ref')] @@ -211,8 +218,7 @@ class BBCCoUkIE(InfoExtractor): def _extract_medias(self, media_selection): error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error') if error is not None: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True) + raise BBCCoUkIE.MediaSelectionError(error.get('id')) return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media') def _extract_connections(self, media): @@ -269,17 +275,23 @@ class BBCCoUkIE(InfoExtractor): ] return subtitles + def _raise_extractor_error(self, media_selection_error): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, media_selection_error.id), + expected=True) + def _download_media_selector(self, programme_id): - try: - return self._download_media_selector_url( - self._MEDIASELECTOR_URL % programme_id, programme_id) - except ExtractorError as e: - if hasattr(self, '_MEDIASELECTOR_ALT_URL') and str(e) == 'bbc returned error: notukerror': - # notukerror on bbc.com/travel using bbc news mediaselector: fallback to /mediaselector/5/ - return self._download_media_selector_url( - self._MEDIASELECTOR_ALT_URL % programme_id, programme_id) - else: - raise + last_exception = None + for mediaselector_url in self._MEDIASELECTOR_URLS: + try: + return self._download_media_selector_url( + mediaselector_url % programme_id, programme_id) + except BBCCoUkIE.MediaSelectionError as e: + if e.id == 'notukerror': + last_exception = e + continue + self._raise_extractor_error(e) + self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): try: @@ -432,10 +444,14 @@ class BBCIE(BBCCoUkIE): IE_DESC = 'BBC' _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - # fails with notukerror for some videos ( non news sites such as bbc.com/travel ) - _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' - # limited selection of formats but may work where the above does not - _MEDIASELECTOR_ALT_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s' + _MEDIASELECTOR_URLS = [ + # Provides more formats, namely direct mp4 links, but fails on some videos with + # notukerror for non UK (?) users (e.g. + # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', + # Provides fewer formats, but works everywhere for everybody (hopefully) + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + ] _TESTS = [{ # article with multiple videos embedded with data-media-meta containing @@ -457,6 +473,14 @@ class BBCIE(BBCCoUkIE): }, 'playlist_count': 9, 'skip': 'Save time', + }, { + # article with multiple videos embedded with `new SMP()` + 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', + 'info_dict': { + 'id': '3662a707-0af9-3149-963f-47bea720b460', + 'title': 'BBC Blogs - Adam Curtis - BUGGER', + }, + 'playlist_count': 18, }, { # single video embedded with mediaAssetPage.init() 'url': 'http://www.bbc.com/news/world-europe-32041533', @@ -644,12 +668,30 @@ class BBCIE(BBCCoUkIE): playlist_title = self._html_search_regex( r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title') - playlist_description = self._og_search_description(webpage) + playlist_description = self._og_search_description(webpage, default=None) + + def extract_all(pattern): + return list(filter(None, map( + lambda s: self._parse_json(s, playlist_id, fatal=False), + re.findall(pattern, webpage)))) + + # Multiple video article (e.g. + # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?' + entries = [] + for match in extract_all(r'new\s+SMP\(({.+?})\)'): + embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') + if embed_url and re.match(EMBED_URL, embed_url): + entries.append(embed_url) + entries.extend(re.findall( + r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) + if entries: + return self.playlist_result( + [self.url_result(entry, 'BBCCoUk') for entry in entries], + playlist_id, playlist_title, playlist_description) # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) - medias = list(filter(None, map( - lambda s: self._parse_json(s, playlist_id, fatal=False), - re.findall(r"data-media-meta='({[^']+})'", webpage)))) + medias = extract_all(r"data-media-meta='({[^']+})'") if not medias: # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)