X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fbbc.py;h=e76507951203b2f305a9fb161073e75611e5e919;hb=77d95677b7ab4a9840ef142b14627b07a9a31120;hp=2a0901ee457dd02f051a183434ba174a4593b1fb;hpb=9afa1770d1a6835bc8fee48dc86cd1a702d1f67a;p=youtube-dl diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 2a0901ee4..e76507951 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,26 +1,68 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( + clean_html, + dict_get, ExtractorError, float_or_none, + get_element_by_class, int_or_none, + js_to_json, parse_duration, parse_iso8601, + try_get, + unescapeHTML, + url_or_none, + urlencode_postdata, + urljoin, +) +from ..compat import ( + compat_etree_Element, + compat_HTTPError, + compat_urlparse, ) -from ..compat import compat_HTTPError class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' + _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?bbc\.co\.uk/ + (?: + programmes/(?!articles/)| + iplayer(?:/[^/]+)?/(?:episode/|playlist/)| + music/(?:clips|audiovideo/popular)[/#]| + radio/player/| + events/[^/]+/play/[^/]+/ + ) + (?P%s)(?!/(?:episodes|broadcasts|clips)) + ''' % _ID_REGEX + + _LOGIN_URL = 'https://account.bbc.com/signin' + _NETRC_MACHINE = 'bbc' + + _MEDIASELECTOR_URLS = [ + # Provides HQ HLS streams with even better quality that pc mediaset but fails + # with geolocation in some cases when it's even not geo restricted at all (e.g. + # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + ] - _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' + _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection' + _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' + + _NAMESPACES = ( + _MEDIASELECTION_NS, + _EMP_PLAYLIST_NS, + ) _TESTS = [ { @@ -28,9 +70,8 @@ class BBCCoUkIE(InfoExtractor): 'info_dict': { 'id': 'b039d07m', 'ext': 'flv', - 'title': 'Kaleidoscope, Leonard Cohen', + 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4', 'description': 'The Canadian poet and songwriter reflects on his musical career.', - 'duration': 1740, }, 'params': { # rtmp download @@ -58,7 +99,7 @@ class BBCCoUkIE(InfoExtractor): 'id': 'b00yng1d', 'ext': 'flv', 'title': 'The Voice UK: Series 3: Blind Auditions 5', - 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.", + 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.', 'duration': 5100, }, 'params': { @@ -93,16 +134,17 @@ class BBCCoUkIE(InfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', }, { - 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', + 'url': 'http://www.bbc.co.uk/music/clips/p022h44b', 'note': 'Audio', 'info_dict': { - 'id': 'p02frcch', + 'id': 'p022h44j', 'ext': 'flv', - 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', - 'description': 'French house superstar Madeon takes us out of the club and onto the after party.', - 'duration': 3507, + 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances', + 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.", + 'duration': 227, }, 'params': { # rtmp download @@ -150,6 +192,34 @@ class BBCCoUkIE(InfoExtractor): 'skip_download': True, }, 'skip': 'geolocation', + }, { + # iptv-all mediaset fails with geolocation however there is no geo restriction + # for this programme at all + 'url': 'http://www.bbc.co.uk/programmes/b06rkn85', + 'info_dict': { + 'id': 'b06rkms3', + 'ext': 'flv', + 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1", + 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!", + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Now it\'s really geo-restricted', + }, { + # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147) + 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', + 'info_dict': { + 'id': 'p028bfkj', + 'ext': 'flv', + 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, }, { 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', 'only_matching': True, @@ -159,104 +229,96 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', 'only_matching': True, - } - ] + }, { + 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/m00005xn', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s', + 'only_matching': True, + }] + + _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading signin page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + post_url = urljoin(self._LOGIN_URL, self._search_regex( + r']+action=(["\'])(?P.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url')) + + response, urlh = self._download_webpage_handle( + post_url, None, 'Logging in', data=urlencode_postdata(login_form), + headers={'Referer': self._LOGIN_URL}) + + if self._LOGIN_URL in urlh.geturl(): + error = clean_html(get_element_by_class('form-message', response)) + if error: + raise ExtractorError( + 'Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + class MediaSelectionError(Exception): + def __init__(self, id): + self.id = id def _extract_asx_playlist(self, connection, programme_id): asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') return [ref.get('href') for ref in asx.findall('./Entry/ref')] - def _extract_connection(self, connection, programme_id): - formats = [] - protocol = connection.get('protocol') - supplier = connection.get('supplier') - if protocol == 'http': - href = connection.get('href') - # ASX playlist - if supplier == 'asx': - for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): - formats.append({ - 'url': ref, - 'format_id': 'ref%s_%s' % (i, supplier), - }) - # Direct link - else: - formats.append({ - 'url': href, - 'format_id': supplier, - }) - elif protocol == 'rtmp': - application = connection.get('application', 'ondemand') - auth_string = connection.get('authString') - identifier = connection.get('identifier') - server = connection.get('server') - formats.append({ - 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), - 'play_path': identifier, - 'app': '%s?%s' % (application, auth_string), - 'page_url': 'http://www.bbc.co.uk', - 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', - 'rtmp_live': False, - 'ext': 'flv', - 'format_id': supplier, - }) - return formats - def _extract_items(self, playlist): - return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item') + return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) + + def _findall_ns(self, element, xpath): + elements = [] + for ns in self._NAMESPACES: + elements.extend(element.findall(xpath % ns)) + return elements def _extract_medias(self, media_selection): - error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error') + error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS) + if error is None: + media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS) if error is not None: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True) - return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media') + raise BBCCoUkIE.MediaSelectionError(error.get('id')) + return self._findall_ns(media_selection, './{%s}media') def _extract_connections(self, media): - return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection') - - def _extract_video(self, media, programme_id): - formats = [] - vbr = int_or_none(media.get('bitrate')) - vcodec = media.get('encoding') - service = media.get('service') - width = int_or_none(media.get('width')) - height = int_or_none(media.get('height')) - file_size = int_or_none(media.get('media_file_size')) - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), - 'width': width, - 'height': height, - 'vbr': vbr, - 'vcodec': vcodec, - 'filesize': file_size, - }) - formats.extend(conn_formats) - return formats - - def _extract_audio(self, media, programme_id): - formats = [] - abr = int_or_none(media.get('bitrate')) - acodec = media.get('encoding') - service = media.get('service') - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), - 'abr': abr, - 'acodec': acodec, - }) - formats.extend(conn_formats) - return formats + return self._findall_ns(media, './{%s}connection') def _get_subtitles(self, media, programme_id): subtitles = {} for connection in self._extract_connections(media): - captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') + cc_url = url_or_none(connection.get('href')) + if not cc_url: + continue + captions = self._download_xml( + cc_url, programme_id, 'Downloading captions', fatal=False) + if not isinstance(captions, compat_etree_Element): + continue lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') subtitles[lang] = [ { @@ -266,34 +328,127 @@ class BBCCoUkIE(InfoExtractor): ] return subtitles + def _raise_extractor_error(self, media_selection_error): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, media_selection_error.id), + expected=True) + def _download_media_selector(self, programme_id): - return self._download_media_selector_url( - self._MEDIASELECTOR_URL % programme_id, programme_id) + last_exception = None + for mediaselector_url in self._MEDIASELECTOR_URLS: + try: + return self._download_media_selector_url( + mediaselector_url % programme_id, programme_id) + except BBCCoUkIE.MediaSelectionError as e: + if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): + last_exception = e + continue + self._raise_extractor_error(e) + self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - try: - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) - else: - raise + media_selection = self._download_xml( + url, programme_id, 'Downloading media selection XML', + expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) def _process_media_selector(self, media_selection, programme_id): formats = [] subtitles = None + urls = [] for media in self._extract_medias(media_selection): kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) + if kind in ('video', 'audio'): + bitrate = int_or_none(media.get('bitrate')) + encoding = media.get('encoding') + service = media.get('service') + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + file_size = int_or_none(media.get('media_file_size')) + for connection in self._extract_connections(media): + href = connection.get('href') + if href in urls: + continue + if href: + urls.append(href) + conn_kind = connection.get('kind') + protocol = connection.get('protocol') + supplier = connection.get('supplier') + transfer_format = connection.get('transferFormat') + format_id = supplier or conn_kind or protocol + if service: + format_id = '%s_%s' % (service, format_id) + # ASX playlist + if supplier == 'asx': + for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): + formats.append({ + 'url': ref, + 'format_id': 'ref%s_%s' % (i, format_id), + }) + elif transfer_format == 'dash': + formats.extend(self._extract_mpd_formats( + href, programme_id, mpd_id=format_id, fatal=False)) + elif transfer_format == 'hls': + formats.extend(self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + if re.search(self._USP_RE, href): + usp_formats = self._extract_m3u8_formats( + re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href), + programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + for f in usp_formats: + if f.get('height') and f['height'] > 720: + continue + formats.append(f) + elif transfer_format == 'hds': + formats.extend(self._extract_f4m_formats( + href, programme_id, f4m_id=format_id, fatal=False)) + else: + if not service and not supplier and bitrate: + format_id += '-%d' % bitrate + fmt = { + 'format_id': format_id, + 'filesize': file_size, + } + if kind == 'video': + fmt.update({ + 'width': width, + 'height': height, + 'tbr': bitrate, + 'vcodec': encoding, + }) + else: + fmt.update({ + 'abr': bitrate, + 'acodec': encoding, + 'vcodec': 'none', + }) + if protocol in ('http', 'https'): + # Direct link + fmt.update({ + 'url': href, + }) + elif protocol == 'rtmp': + application = connection.get('application', 'ondemand') + auth_string = connection.get('authString') + identifier = connection.get('identifier') + server = connection.get('server') + fmt.update({ + 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), + 'play_path': identifier, + 'app': '%s?%s' % (application, auth_string), + 'page_url': 'http://www.bbc.co.uk', + 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', + 'rtmp_live': False, + 'ext': 'flv', + }) + else: + continue + formats.append(fmt) elif kind == 'captions': subtitles = self.extract_subtitles(media, programme_id) - return formats, subtitles def _download_playlist(self, playlist_id): @@ -309,7 +464,7 @@ class BBCCoUkIE(InfoExtractor): description = smp_config['summary'] for item in smp_config['items']: kind = item['kind'] - if kind != 'programme' and kind != 'radioProgramme': + if kind not in ('programme', 'radioProgramme'): continue programme_id = item.get('vpid') duration = int_or_none(item.get('duration')) @@ -335,7 +490,7 @@ class BBCCoUkIE(InfoExtractor): url, playlist_id, 'Downloading legacy playlist XML') def _extract_from_legacy_playlist(self, playlist, playlist_id): - no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') + no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS) if no_items is not None: reason = no_items.get('reason') if reason == 'preAvailability': @@ -350,10 +505,11 @@ class BBCCoUkIE(InfoExtractor): for item in self._extract_items(playlist): kind = item.get('kind') - if kind != 'programme' and kind != 'radioProgramme': + if kind not in ('programme', 'radioProgramme'): continue - title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text - description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text + title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text + description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) + description = description_el.text if description_el is not None else None def get_programme_id(item): def get_from_attributes(item): @@ -362,16 +518,18 @@ class BBCCoUkIE(InfoExtractor): if value and re.match(r'^[pb][\da-z]{7}$', value): return value get_from_attributes(item) - mediator = item.find('./{http://bbc.co.uk/2008/emp/playlist}mediator') + mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS) if mediator is not None: return get_from_attributes(mediator) programme_id = get_programme_id(item) duration = int_or_none(item.get('duration')) - # TODO: programme_id can be None and media items can be incorporated right inside - # playlist's item (e.g. http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) - # as f4m and m3u8 - formats, subtitles = self._download_media_selector(programme_id) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + else: + formats, subtitles = self._process_media_selector(item, playlist_id) + programme_id = playlist_id return programme_id, title, description, duration, formats, subtitles @@ -380,7 +538,14 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') + error = self._search_regex( + r']+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + programme_id = None + duration = None tviplayer = self._search_regex( r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', @@ -393,14 +558,19 @@ class BBCCoUkIE(InfoExtractor): if not programme_id: programme_id = self._search_regex( - r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) + r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None) if programme_id: formats, subtitles = self._download_media_selector(programme_id) - title = self._og_search_title(webpage) + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + (r']+id="parent-title"[^>]*>(.+?)', + r']+class="info"[^>]*>\s*

(.+?)

'), webpage, 'title') description = self._search_regex( - r'

([^<]+)

', - webpage, 'description', fatal=False) + (r'

([^<]+)

', + r']+class="info_+synopsis"[^>]*>([^<]+)'), + webpage, 'description', default=None) + if not description: + description = self._html_search_meta('description', webpage) else: programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) @@ -422,13 +592,20 @@ class BBCIE(BBCCoUkIE): IE_DESC = 'BBC' _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - # fails with notukerror for some videos - #_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' - _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s' + _MEDIASELECTOR_URLS = [ + # Provides HQ HLS streams but fails with geolocation in some cases when it's + # even not geo restricted at all + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', + # Provides more formats, namely direct mp4 links, but fails on some videos with + # notukerror for non UK (?) users (e.g. + # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', + # Provides fewer formats, but works everywhere for everybody (hopefully) + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + ] _TESTS = [{ - # article with multiple videos embedded with data-media-meta containing - # playlist.sxml, externalId and no direct video links + # article with multiple videos embedded with data-playable containing vpids 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', @@ -437,7 +614,7 @@ class BBCIE(BBCCoUkIE): }, 'playlist_count': 2, }, { - # article with multiple videos embedded with data-media-meta (more videos) + # article with multiple videos embedded with data-playable (more videos) 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', @@ -447,12 +624,22 @@ class BBCIE(BBCCoUkIE): 'playlist_count': 9, 'skip': 'Save time', }, { - # single video embedded with mediaAssetPage.init() + # article with multiple videos embedded with `new SMP()` + # broken + 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', + 'info_dict': { + 'id': '3662a707-0af9-3149-963f-47bea720b460', + 'title': 'BUGGER', + }, + 'playlist_count': 18, + }, { + # single video embedded with data-playable containing vpid 'url': 'http://www.bbc.com/news/world-europe-32041533', 'info_dict': { 'id': 'p02mprgb', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'md5:2868290467291b37feda7863f7a83f54', 'duration': 47, 'timestamp': 1427219242, 'upload_date': '20150324', @@ -462,15 +649,15 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { - # article with single video embedded with data-media-meta containing - # direct video links (for now these are extracted) and playlist.xml (with - # media items as f4m and m3u8 - currently unsupported) + # article with single video embedded with data-playable containing XML playlist + # with direct video links as progressiveDownloadUrl (for now these are extracted) + # and playlist with f4m and m3u8 as streamingUrl 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', 'info_dict': { 'id': '150615_telabyad_kentin_cogu', 'ext': 'mp4', 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", - 'duration': 47, + 'description': 'md5:33a4805a855c9baf7115fcbde57e7025', 'timestamp': 1434397334, 'upload_date': '20150615', }, @@ -478,19 +665,32 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { - # single video embedded with mediaAssetPage.init() (regional section) + # single video embedded with data-playable containing XML playlists (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'info_dict': { 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', - 'duration': 87, + 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', 'timestamp': 1434713142, 'upload_date': '20150619', }, 'params': { 'skip_download': True, } + }, { + # single video from video playlist embedded with vxp-playlist-data JSON + 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', + 'info_dict': { + 'id': 'p02w6qjc', + 'ext': 'mp4', + 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', + 'duration': 56, + 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', + }, + 'params': { + 'skip_download': True, + } }, { # single video story with digitalData 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', @@ -511,30 +711,64 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', 'info_dict': { 'id': 'p018zqqg', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Hyundai Santa Fe Sport: Rock star', 'description': 'md5:b042a26142c4154a6e472933cf20793d', - 'timestamp': 1368473503, - 'upload_date': '20130513', + 'timestamp': 1415867444, + 'upload_date': '20141113', }, 'params': { # rtmp download 'skip_download': True, } }, { - # single video with playlist.sxml URL + # single video embedded with Morph + 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', + 'info_dict': { + 'id': 'p041vhd0', + 'ext': 'mp4', + 'title': "Nigeria v Japan - Men's First Round", + 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.', + 'duration': 7980, + 'uploader': 'BBC Sport', + 'uploader_id': 'bbc_sport', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Georestricted to UK', + }, { + # single video with playlist.sxml URL in playlist param 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:398fca0e2e701c609d726e034fa1fc89', + 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', 'duration': 140, }, 'params': { # rtmp download 'skip_download': True, } + }, { + # article with multiple videos embedded with playlist.sxml in playlist param + 'url': 'http://www.bbc.com/sport/0/football/34475836', + 'info_dict': { + 'id': '34475836', + 'title': 'Jurgen Klopp: Furious football from a witty and winning coach', + 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', + }, + 'playlist_count': 3, + }, { + # school report article with single video + 'url': 'http://www.bbc.co.uk/schoolreport/35744779', + 'info_dict': { + 'id': '35744779', + 'title': 'School which breaks down barriers in Jerusalem', + }, + 'playlist_count': 1, }, { # single video with playlist URL from weather section 'url': 'http://www.bbc.com/weather/features/33601775', @@ -543,11 +777,48 @@ class BBCIE(BBCCoUkIE): # custom redirection to www.bbc.com 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', 'only_matching': True, + }, { + # single video article embedded with data-media-vpid + 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', + 'info_dict': { + 'id': 'p06556y7', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + }, + 'params': { + 'skip_download': True, + } + }, { + # window.__PRELOADED_STATE__ + 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', + 'info_dict': { + 'id': 'b0b9z4vz', + 'ext': 'mp4', + 'title': 'Prom 6: An American in Paris and Turangalila', + 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8', + 'uploader': 'Radio 3', + 'uploader_id': 'bbc_radio_three', + }, + }, { + 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', + 'info_dict': { + 'id': 'p06w9tws', + 'ext': 'mp4', + 'title': 'md5:2fabf12a726603193a2879a055f72514', + 'description': 'Learn English words and phrases from this story', + }, + 'add_ie': [BBCCoUkIE.ie_key()], }] @classmethod def suitable(cls, url): - return False if BBCCoUkIE.suitable(url) else super(BBCIE, cls).suitable(url) + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE) + return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) + else super(BBCIE, cls).suitable(url)) def _extract_from_media_meta(self, media_meta, video_id): # Direct links to media in media metadata (e.g. @@ -576,40 +847,137 @@ class BBCIE(BBCCoUkIE): return [], [] + def _extract_from_playlist_sxml(self, url, playlist_id, timestamp): + programme_id, title, description, duration, formats, subtitles = \ + self._process_legacy_playlist_url(url, playlist_id) + self._sort_formats(formats) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - timestamp = parse_iso8601(self._search_regex( - [r'"datePublished":\s*"([^"]+)', - r']+property="article:published_time"[^>]+content="([^"]+)"', - r'itemprop="datePublished"[^>]+datetime="([^"]+)"'], - webpage, 'date', default=None)) - - # single video with playlist.sxml URL (e.g. http://www.bbc.com/sport/0/football/3365340ng) - playlist = self._search_regex( - r']+name="playlist"[^>]+value="([^"]+)"', - webpage, 'playlist', default=None) - if playlist: - programme_id, title, description, duration, formats, subtitles = \ - self._process_legacy_playlist_url(playlist, playlist_id) - self._sort_formats(formats) - return { - 'id': programme_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'subtitles': subtitles, - } + json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) + timestamp = json_ld_info.get('timestamp') + + playlist_title = json_ld_info.get('title') + if not playlist_title: + playlist_title = self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'(.+?)', webpage, 'playlist title', default=None) + if playlist_title: + playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + + playlist_description = json_ld_info.get( + 'description') or self._og_search_description(webpage, default=None) + + if not timestamp: + timestamp = parse_iso8601(self._search_regex( + [r']+property="article:published_time"[^>]+content="([^"]+)"', + r'itemprop="datePublished"[^>]+datetime="([^"]+)"', + r'"datePublished":\s*"([^"]+)'], + webpage, 'date', default=None)) + + entries = [] + + # article with multiple videos embedded with playlist.sxml (e.g. + # http://www.bbc.com/sport/0/football/34475836) + playlists = re.findall(r']+name="playlist"[^>]+value="([^"]+)"', webpage) + playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage)) + if playlists: + entries = [ + self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) + for playlist_url in playlists] + + # news article with multiple videos embedded with data-playable + data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage) + if data_playables: + for _, data_playable_json in data_playables: + data_playable = self._parse_json( + unescapeHTML(data_playable_json), playlist_id, fatal=False) + if not data_playable: + continue + settings = data_playable.get('settings', {}) + if settings: + # data-playable with video vpid in settings.playlistObject.items (e.g. + # http://www.bbc.com/news/world-us-canada-34473351) + playlist_object = settings.get('playlistObject', {}) + if playlist_object: + items = playlist_object.get('items') + if items and isinstance(items, list): + title = playlist_object['title'] + description = playlist_object.get('summary') + duration = int_or_none(items[0].get('duration')) + programme_id = items[0].get('vpid') + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + }) + else: + # data-playable without vpid but with a playlist.sxml URLs + # in otherSettings.playlist (e.g. + # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) + playlist = data_playable.get('otherSettings', {}).get('playlist', {}) + if playlist: + entry = None + for key in ('streaming', 'progressiveDownload'): + playlist_url = playlist.get('%sUrl' % key) + if not playlist_url: + continue + try: + info = self._extract_from_playlist_sxml( + playlist_url, playlist_id, timestamp) + if not entry: + entry = info + else: + entry['title'] = info['title'] + entry['formats'].extend(info['formats']) + except Exception as e: + # Some playlist URL may fail with 500, at the same time + # the other one may work fine (e.g. + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: + continue + raise + if entry: + self._sort_formats(entry['formats']) + entries.append(entry) + + if entries: + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227 + group_id = self._search_regex( + r']+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, + webpage, 'group id', default=None) + if playlist_id: + return self.url_result( + 'https://www.bbc.co.uk/programmes/%s' % group_id, + ie=BBCCoUkIE.ie_key()) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( - [r'data-video-player-vpid="([\da-z]{8})"', - r']+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], + [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, + r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, + r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], webpage, 'vpid', default=None) + if programme_id: formats, subtitles = self._download_media_selector(programme_id) self._sort_formats(formats) @@ -631,24 +999,165 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } - playlist_title = self._html_search_regex( - r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title') - playlist_description = self._og_search_description(webpage) + # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) + # There are several setPayload calls may be present but the video + # seems to be always related to the first one + morph_payload = self._parse_json( + self._search_regex( + r'Morph\.setPayload\([^,]+,\s*({.+?})\);', + webpage, 'morph payload', default='{}'), + playlist_id, fatal=False) + if morph_payload: + components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] + for component in components: + if not isinstance(component, dict): + continue + lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) + if not lead_media: + continue + identifiers = lead_media.get('identifiers') + if not identifiers or not isinstance(identifiers, dict): + continue + programme_id = identifiers.get('vpid') or identifiers.get('playablePid') + if not programme_id: + continue + title = lead_media.get('title') or self._og_search_title(webpage) + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + description = lead_media.get('summary') + uploader = lead_media.get('masterBrand') + uploader_id = lead_media.get('mid') + duration = None + duration_d = lead_media.get('duration') + if isinstance(duration_d, dict): + duration = parse_duration(dict_get( + duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'formats': formats, + 'subtitles': subtitles, + } + + preload_state = self._parse_json(self._search_regex( + r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if preload_state: + current_programme = preload_state.get('programmes', {}).get('current') or {} + programme_id = current_programme.get('id') + if current_programme and programme_id and current_programme.get('type') == 'playable_item': + title = current_programme.get('titles', {}).get('tertiary') or playlist_title + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + synopses = current_programme.get('synopses') or {} + network = current_programme.get('network') or {} + duration = int_or_none( + current_programme.get('duration', {}).get('value')) + thumbnail = None + image_url = current_programme.get('image_url') + if image_url: + thumbnail = image_url.replace('{recipe}', '1920x1920') + return { + 'id': programme_id, + 'title': title, + 'description': dict_get(synopses, ('long', 'medium', 'short')), + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': network.get('short_title'), + 'uploader_id': network.get('id'), + 'formats': formats, + 'subtitles': subtitles, + } + + bbc3_config = self._parse_json( + self._search_regex( + r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, + 'bbcthree config', default='{}'), + playlist_id, transform_source=js_to_json, fatal=False) + if bbc3_config: + bbc3_playlist = try_get( + bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + dict) + if bbc3_playlist: + playlist_title = bbc3_playlist.get('title') or playlist_title + thumbnail = bbc3_playlist.get('holdingImageURL') + entries = [] + for bbc3_item in bbc3_playlist['items']: + programme_id = bbc3_item.get('versionID') + if not programme_id: + continue + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': playlist_title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + + def extract_all(pattern): + return list(filter(None, map( + lambda s: self._parse_json(s, playlist_id, fatal=False), + re.findall(pattern, webpage)))) + + # Multiple video article (e.g. + # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX + entries = [] + for match in extract_all(r'new\s+SMP\(({.+?})\)'): + embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') + if embed_url and re.match(EMBED_URL, embed_url): + entries.append(embed_url) + entries.extend(re.findall( + r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) + if entries: + return self.playlist_result( + [self.url_result(entry_, 'BBCCoUk') for entry_ in entries], + playlist_id, playlist_title, playlist_description) # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) - medias = list(filter(None, map( - lambda s: self._parse_json(s, playlist_id, fatal=False), - re.findall(r"data-media-meta='({[^']+})'", webpage)))) + medias = extract_all(r"data-media-meta='({[^']+})'") if not medias: # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international) - media_asset_page = self._parse_json( + media_asset = self._search_regex( + r'mediaAssetPage\.init\(\s*({.+?}), "/', + webpage, 'media asset', default=None) + if media_asset: + media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) + medias = [] + for video in media_asset_page.get('videos', {}).values(): + medias.extend(video.values()) + + if not medias: + # Multiple video playlist with single `now playing` entry (e.g. + # http://www.bbc.com/news/video_and_audio/must_see/33767813) + vxp_playlist = self._parse_json( self._search_regex( - r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset'), + r']+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)', + webpage, 'playlist data'), playlist_id) - medias = [] - for video in media_asset_page.get('videos', {}).values(): - medias.extend(video.values()) + playlist_medias = [] + for item in vxp_playlist: + media = item.get('media') + if not media: + continue + playlist_medias.append(media) + # Download single video if found media with asset id matching the video id from URL + if item.get('advert', {}).get('assetId') == playlist_id: + medias = [media] + break + # Fallback to the whole playlist + if not medias: + medias = playlist_medias entries = [] for num, media_meta in enumerate(medias, start=1): @@ -690,3 +1199,146 @@ class BBCIE(BBCCoUkIE): }) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + +class BBCCoUkArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + IE_NAME = 'bbc.co.uk:article' + IE_DESC = 'BBC articles' + + _TEST = { + 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer', + 'info_dict': { + 'id': '3jNQLTMrPlYGTBn0WV6M2MS', + 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four', + 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.', + }, + 'playlist_count': 4, + 'add_ie': ['BBCCoUk'], + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage).strip() + + entries = [self.url_result(programme_url) for programme_url in re.findall( + r']+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] + + return self.playlist_result(entries, playlist_id, title, description) + + +class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _entries(self, webpage, url, playlist_id): + single_page = 'page' in compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + for page_num in itertools.count(2): + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): + yield self.url_result( + self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + if single_page: + return + next_page = self._search_regex( + r']+class=(["\'])pagination_+next\1[^>]*>]+href=(["\'])(?P(?:(?!\2).)+)\2', + webpage, 'next page url', default=None, group='url') + if not next_page: + break + webpage = self._download_webpage( + compat_urlparse.urljoin(url, next_page), playlist_id, + 'Downloading page %d' % page_num, page_num) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title, description = self._extract_title_and_description(webpage) + + return self.playlist_result( + self._entries(webpage, url, playlist_id), + playlist_id, title, description) + + +class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:playlist' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P%s)' % BBCCoUkIE._ID_REGEX + _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' + _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' + _TESTS = [{ + 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance', + 'description': 'French thriller serial about a missing teenager.', + }, + 'playlist_mincount': 6, + 'skip': 'This programme is not currently available on BBC iPlayer', + }, { + # Available for over a year unlike 30 days for most other programmes + 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', + 'info_dict': { + 'id': 'p02tcc32', + 'title': 'Bohemian Icons', + 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', + }, + 'playlist_mincount': 10, + }] + + def _extract_title_and_description(self, webpage): + title = self._search_regex(r'

([^<]+)

', webpage, 'title', fatal=False) + description = self._search_regex( + r']+class=(["\'])subtitle\1[^>]*>(?P[^<]+)

', + webpage, 'description', fatal=False, group='value') + return title, description + + +class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:playlist' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX + _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s' + _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)' + _TESTS = [{ + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance - Clips - BBC Four', + 'description': 'French thriller serial about a missing teenager.', + }, + 'playlist_mincount': 7, + }, { + # multipage playlist, explicit page + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 24, + }, { + # multipage playlist, all pages + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 142, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player', + 'only_matching': True, + }] + + def _extract_title_and_description(self, webpage): + title = self._og_search_title(webpage, fatal=False) + description = self._og_search_description(webpage) + return title, description