X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fbbc.py;h=8b20c03d6e424b95e42b1bea1ac3fb91e24bea11;hb=54fc90aabfb71968f28af68dfe3f7a3544cc2f0b;hp=83e6d024c7fd4c6ee2adde607e4f1cd203910805;hpb=522f6c066da93b6baec3399b7098556e5ec55f43;p=youtube-dl diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 83e6d024c..8b20c03d6 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,40 +2,50 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor from ..utils import ( + clean_html, dict_get, ExtractorError, float_or_none, + get_element_by_class, int_or_none, parse_duration, parse_iso8601, try_get, unescapeHTML, + urlencode_postdata, + urljoin, ) from ..compat import ( compat_etree_fromstring, compat_HTTPError, + compat_urlparse, ) class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'[pb][\da-z]{7}' + _ID_REGEX = r'[pbw][\da-z]{7}' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ (?: programmes/(?!articles/)| iplayer(?:/[^/]+)?/(?:episode/|playlist/)| - music/clips[/#]| - radio/player/ + music/(?:clips|audiovideo/popular)[/#]| + radio/player/| + events/[^/]+/play/[^/]+/ ) (?P%s)(?!/(?:episodes|broadcasts|clips)) ''' % _ID_REGEX + _LOGIN_URL = 'https://account.bbc.com/signin' + _NETRC_MACHINE = 'bbc' + _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails # with geolocation in some cases when it's even not geo restricted at all (e.g. @@ -220,8 +230,48 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf', 'only_matching': True, - } - ] + }, { + 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', + 'only_matching': True, + }] + + _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading signin page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + post_url = urljoin(self._LOGIN_URL, self._search_regex( + r']+action=(["\'])(?P.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url')) + + response, urlh = self._download_webpage_handle( + post_url, None, 'Logging in', data=urlencode_postdata(login_form), + headers={'Referer': self._LOGIN_URL}) + + if self._LOGIN_URL in urlh.geturl(): + error = clean_html(get_element_by_class('form-message', response)) + if error: + raise ExtractorError( + 'Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() class MediaSelectionError(Exception): def __init__(self, id): @@ -334,6 +384,15 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) + if re.search(self._USP_RE, href): + usp_formats = self._extract_m3u8_formats( + re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href), + programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + for f in usp_formats: + if f.get('height') and f['height'] > 720: + continue + formats.append(f) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) @@ -348,7 +407,7 @@ class BBCCoUkIE(InfoExtractor): fmt.update({ 'width': width, 'height': height, - 'vbr': bitrate, + 'tbr': bitrate, 'vcodec': encoding, }) else: @@ -357,7 +416,7 @@ class BBCCoUkIE(InfoExtractor): 'acodec': encoding, 'vcodec': 'none', }) - if protocol == 'http': + if protocol in ('http', 'https'): # Direct link fmt.update({ 'url': href, @@ -376,6 +435,8 @@ class BBCCoUkIE(InfoExtractor): 'rtmp_live': False, 'ext': 'flv', }) + else: + continue formats.append(fmt) elif kind == 'captions': subtitles = self.extract_subtitles(media, programme_id) @@ -394,7 +455,7 @@ class BBCCoUkIE(InfoExtractor): description = smp_config['summary'] for item in smp_config['items']: kind = item['kind'] - if kind != 'programme' and kind != 'radioProgramme': + if kind not in ('programme', 'radioProgramme'): continue programme_id = item.get('vpid') duration = int_or_none(item.get('duration')) @@ -435,7 +496,7 @@ class BBCCoUkIE(InfoExtractor): for item in self._extract_items(playlist): kind = item.get('kind') - if kind != 'programme' and kind != 'radioProgramme': + if kind not in ('programme', 'radioProgramme'): continue title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) @@ -468,6 +529,12 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') + error = self._search_regex( + r']+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + programme_id = None duration = None @@ -1026,7 +1093,7 @@ class BBCIE(BBCCoUkIE): class BBCCoUkArticleIE(InfoExtractor): - _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P[a-zA-Z0-9]+)' IE_NAME = 'bbc.co.uk:article' IE_DESC = 'BBC articles' @@ -1056,19 +1123,35 @@ class BBCCoUkArticleIE(InfoExtractor): class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _entries(self, webpage, url, playlist_id): + single_page = 'page' in compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + for page_num in itertools.count(2): + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): + yield self.url_result( + self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + if single_page: + return + next_page = self._search_regex( + r']+class=(["\'])pagination_+next\1[^>]*>]+href=(["\'])(?P(?:(?!\2).)+)\2', + webpage, 'next page url', default=None, group='url') + if not next_page: + break + webpage = self._download_webpage( + compat_urlparse.urljoin(url, next_page), playlist_id, + 'Downloading page %d' % page_num, page_num) + def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - entries = [ - self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) - for video_id in re.findall( - self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)] - title, description = self._extract_title_and_description(webpage) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + self._entries(webpage, url, playlist_id), + playlist_id, title, description) class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): @@ -1117,6 +1200,24 @@ class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'French thriller serial about a missing teenager.', }, 'playlist_mincount': 7, + }, { + # multipage playlist, explicit page + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 24, + }, { + # multipage playlist, all pages + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 142, }, { 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', 'only_matching': True,