X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fbbc.py;h=b17916137ec51808e8c0c869142d37bf083c90e0;hb=eec45445a8b56219168d57cdc24b54b00a9f389d;hp=0ee096ddaf10b900133a0093aa8ef03bcb440a9c;hpb=9fb64c04cdbe1b58f968fa80489168173ac7e565;p=youtube-dl diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 0ee096dda..b17916137 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor from ..utils import ( @@ -17,6 +18,7 @@ from ..utils import ( from ..compat import ( compat_etree_fromstring, compat_HTTPError, + compat_urlparse, ) @@ -339,7 +341,7 @@ class BBCCoUkIE(InfoExtractor): href, programme_id, f4m_id=format_id, fatal=False)) else: if not service and not supplier and bitrate: - format_id += '-%d' % bitrate + format_id += '-%d' % bitrate fmt = { 'format_id': format_id, 'filesize': file_size, @@ -660,7 +662,6 @@ class BBCIE(BBCCoUkIE): 'params': { # m3u8 download 'skip_download': True, - 'proxy': '5.101.173.158:8080', }, 'skip': 'Georestricted to UK', }, { @@ -760,7 +761,7 @@ class BBCIE(BBCCoUkIE): webpage = self._download_webpage(url, playlist_id) - json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) + json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) timestamp = json_ld_info.get('timestamp') playlist_title = json_ld_info.get('title') @@ -945,7 +946,7 @@ class BBCIE(BBCCoUkIE): r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) if entries: return self.playlist_result( - [self.url_result(entry, 'BBCCoUk') for entry in entries], + [self.url_result(entry_, 'BBCCoUk') for entry_ in entries], playlist_id, playlist_title, playlist_description) # Multiple video article (e.g. 
http://www.bbc.com/news/world-europe-32668511) @@ -1027,7 +1028,7 @@ class BBCIE(BBCCoUkIE): class BBCCoUkArticleIE(InfoExtractor): - _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' IE_NAME = 'bbc.co.uk:article' IE_DESC = 'BBC articles' @@ -1057,19 +1058,35 @@ class BBCCoUkArticleIE(InfoExtractor): class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _entries(self, webpage, url, playlist_id): + single_page = 'page' in compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + for page_num in itertools.count(2): + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): + yield self.url_result( + self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + if single_page: + return + next_page = self._search_regex( + r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage, 'next page url', default=None, group='url') + if not next_page: + break + webpage = self._download_webpage( + compat_urlparse.urljoin(url, next_page), playlist_id, + 'Downloading page %d' % page_num, page_num) + def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - entries = [ - self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) - for video_id in re.findall( - self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)] - title, description = self._extract_title_and_description(webpage) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + self._entries(webpage, url, playlist_id), + playlist_id, title, description) class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): @@ -1118,6 +1135,24 @@ class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'French thriller serial about a missing teenager.', }, 'playlist_mincount': 7, + }, { + # multipage playlist, explicit page + 'url': 
'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 24, + }, { + # multipage playlist, all pages + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 142, }, { 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', 'only_matching': True,