X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fpolskieradio.py;h=978d6f813b6d0a88aafb707aba40de092c926a44;hb=a6211d237b4e7051ca018cc09440502561fedaa7;hp=c51d3d9be0fd45398d903e39b99e817d7f61ccdc;hpb=f009fcac0d22222803c05fa9e31477343e77988c;p=youtube-dl diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py index c51d3d9be..978d6f813 100644 --- a/youtube_dl/extractor/polskieradio.py +++ b/youtube_dl/extractor/polskieradio.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor @@ -10,90 +11,13 @@ from ..compat import ( compat_urlparse ) from ..utils import ( + extract_attributes, int_or_none, strip_or_none, unified_timestamp, ) -class PolskieRadioProgrammeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(,[^/]+)?/(?P\d+)' - _TESTS = [{ - 'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA', - 'info_dict': { - 'id': '5102', - 'title': 'HISTORIA ŻYWA', - }, - 'playlist_mincount': 34, - }, { - 'url': 'http://www.polskieradio.pl/7/4807', - 'info_dict': { - 'id': '4807', - 'title': 'Vademecum 1050. rocznicy Chrztu Polski' - }, - 'playlist_mincount': 5 - }, { - 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', - 'only_matching': True - }, { - 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', - 'info_dict': { - 'id': '4143', - 'title': 'Kierunek Kraków', - }, - 'playlist_mincount': 61 - }, { - 'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA', - 'only_matching': True - }] - - def _get_entries_from_page_content(self, base_url, content): - entries = [] - - articles = re.findall( - r'
\s+', - content) - for article_id, article_url, _, article_title in articles: - resolved_article_url = compat_urlparse.urljoin(base_url, article_url) - entries.append(self.url_result( - resolved_article_url, - ie='PolskieRadio', - video_id=article_id, - video_title=article_title)) - - return entries - - @classmethod - def suitable(cls, url): - return False if PolskieRadioIE.suitable(url) else super(PolskieRadioProgrammeIE, cls).suitable(url) - - def _real_extract(self, url): - programme_id = self._match_id(url) - webpage = self._download_webpage(url, programme_id) - - title = self._html_search_regex( - r'(.+?)', - webpage, 'title', fatal=False) - description = None - - entries = self._get_entries_from_page_content(url, webpage) - - pages = re.findall(r' 1: - page_url_root = next(url for _, url, _ in pages if len(url) > 0) - for page_number in range(2, page_count + 1): - page_url = page_url_root + str(page_number) - resolved_page_url = compat_urlparse.urljoin(url, page_url) - page_content = self._download_webpage( - resolved_page_url, programme_id, - note="Downloading page number %d" % page_number) - entries.extend(self._get_entries_from_page_content(url, page_content)) - - return self.playlist_result(entries, programme_id, title, description) - - class PolskieRadioIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P[0-9]+)' _TESTS = [{ @@ -112,7 +36,7 @@ class PolskieRadioIE(InfoExtractor): 'timestamp': 1456594200, 'upload_date': '20160227', 'duration': 2364, - 'thumbnail': 're:^https?://static\.prsa\.pl/images/.*\.jpg$' + 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' }, }], }, { @@ -141,7 +65,7 @@ class PolskieRadioIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) content = self._search_regex( - r'(?s)]+class="audio atarticle"[^>]*>(.+?)