X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fnpo.py;h=eb12fb8102deaf438621d303589488b0e831d774;hb=50e989e2636fc59ed896cc021b1b594bd10e9e17;hp=a5162c0c6cb366b0b8bfbc07f6b841541d9d2f3e;hpb=525daedd5a092b0f5329952eee99a7dac5537433;p=youtube-dl diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index a5162c0c6..eb12fb810 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -3,17 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_request, - compat_urllib_parse, -) from ..utils import ( fix_xml_ampersands, parse_duration, qualities, strip_jsonp, unified_strdate, - url_basename, ) @@ -46,12 +41,15 @@ class NPOIE(NPOBaseIE): IE_NAME = 'npo' IE_DESC = 'npo.nl and ntr.nl' _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?: - npo\.nl/(?!live|radio)(?:[^/]+/){2}| - ntr\.nl/(?:[^/]+/){2,}| - omroepwnl\.nl/video/fragment/[^/]+__ + (?: + npo:| + https?:// + (?:www\.)? + (?: + npo\.nl/(?!live|radio)(?:[^/]+/){2}| + ntr\.nl/(?:[^/]+/){2,}| + omroepwnl\.nl/video/fragment/[^/]+__ + ) ) (?P[^/?#]+) ''' @@ -74,7 +72,7 @@ class NPOIE(NPOBaseIE): 'info_dict': { 'id': 'VARA_101191800', 'ext': 'm4v', - 'title': 'De Mega Mike & Mega Thomas show', + 'title': 'De Mega Mike & Mega Thomas show: The best of.', 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', 'upload_date': '20090227', 'duration': 2400, @@ -86,8 +84,8 @@ class NPOIE(NPOBaseIE): 'info_dict': { 'id': 'VPWON_1169289', 'ext': 'm4v', - 'title': 'Tegenlicht', - 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1', + 'title': 'Tegenlicht: De toekomst komt uit Afrika', + 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', 'duration': 3000, }, @@ -159,6 +157,13 @@ class NPOIE(NPOBaseIE): # video id is POMS_WNL_853698 but prid is POW_00996502) video_id = metadata.get('prid') or video_id + # titel is too generic in some cases so utilize aflevering_titel as well + # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html) + title = metadata['titel'] + sub_title = metadata.get('aflevering_titel') + if sub_title and sub_title != title: + title += ': %s' % sub_title + token = self._get_token(video_id) formats = [] @@ -231,8 +236,8 @@ class NPOIE(NPOBaseIE): return { 'id': video_id, - 'title': metadata['titel'], - 'description': metadata['info'], + 'title': title, + 'description': metadata.get('info'), 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')), 'duration': parse_duration(metadata.get('tijdsduur')), @@ -401,9 +406,9 @@ class NPORadioFragmentIE(InfoExtractor): } -class TegenlichtVproIE(NPOIE): - IE_NAME = 'tegenlicht.vpro.nl' - _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?' +class VPROIE(NPOIE): + IE_NAME = 'vpro' + _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P[^/]+)\.html' _TESTS = [ { @@ -412,17 +417,72 @@ class TegenlichtVproIE(NPOIE): 'info_dict': { 'id': 'VPWON_1169289', 'ext': 'm4v', - 'title': 'Tegenlicht', - 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1', + 'title': 'De toekomst komt uit Afrika', + 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', }, }, + { + 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html', + 'info_dict': { + 'id': 'sergio-herman', + 'title': 'Sergio Herman: Fucking perfect', + }, + 'playlist_count': 2, + }, + { + # playlist with youtube embed + 'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html', + 'info_dict': { + 'id': 'education-education', + 'title': '2Doc', + }, + 'playlist_count': 2, + } ] def _real_extract(self, url): - name = url_basename(url) - webpage = self._download_webpage(url, name) - urn = self._html_search_meta('mediaurn', webpage) - info_page = self._download_json( - 'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name) - return self._get_info(info_page['mid']) + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id) + for video_id in re.findall(r'data-media-id="([^"]+)"', webpage) + ] + + playlist_title = self._search_regex( + r'\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*', + webpage, 'playlist title', default=None) or self._og_search_title(webpage) + + return self.playlist_result(entries, playlist_id, playlist_title) + + +class WNLIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P[^/]+)__\d+' + + _TEST = { + 'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515', + 'info_dict': { + 'id': 'vandaag-de-dag-6-mei', + 'title': 'Vandaag de Dag 6 mei', + }, + 'playlist_count': 4, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('npo:%s' % video_id, 'NPO') + for video_id, part in re.findall( + r']+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', webpage) + ] + + playlist_title = self._html_search_regex( + r'(?s)]+class="subject"[^>]*>(.+?)', + webpage, 'playlist title') + + return self.playlist_result(entries, playlist_id, playlist_title)