[polskieradio] Add support for downloading whole programmes.
authorJakub Adam Wieczorek <ja.wieczorek@student.uw.edu.pl>
Thu, 25 Aug 2016 21:04:59 +0000 (23:04 +0200)
committerJakub Adam Wieczorek <ja.wieczorek@student.uw.edu.pl>
Tue, 6 Sep 2016 19:34:44 +0000 (21:34 +0200)
This extends the Polskie Radio (the Polish national radio) extractor to
enable the user to download all the broadcasts of a single programme.

youtube_dl/extractor/extractors.py
youtube_dl/extractor/polskieradio.py

index f72faa8a7b1ce583c2618ad42a086337e5180d7e..f8f91f13dcf63e5c680818fe9b8e3eef47f90606 100644 (file)
@@ -667,7 +667,7 @@ from .pluralsight import (
 )
 from .podomatic import PodomaticIE
 from .pokemon import PokemonIE
 )
 from .podomatic import PodomaticIE
 from .pokemon import PokemonIE
-from .polskieradio import PolskieRadioIE
+from .polskieradio import PolskieRadioIE, PolskieRadioProgrammeIE
 from .porn91 import Porn91IE
 from .porncom import PornComIE
 from .pornhd import PornHdIE
 from .porn91 import Porn91IE
 from .porncom import PornComIE
 from .pornhd import PornHdIE
index f559b899f44d216d0f59a5e282de0658cbf4af48..c51d3d9be0fd45398d903e39b99e817d7f61ccdc 100644 (file)
@@ -7,6 +7,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_str,
     compat_urllib_parse_unquote,
 from ..compat import (
     compat_str,
     compat_urllib_parse_unquote,
+    compat_urlparse
 )
 from ..utils import (
     int_or_none,
 )
 from ..utils import (
     int_or_none,
@@ -15,6 +16,84 @@ from ..utils import (
 )
 
 
 )
 
 
+class PolskieRadioProgrammeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(,[^/]+)?/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA',
+        'info_dict': {
+            'id': '5102',
+            'title': 'HISTORIA ŻYWA',
+        },
+        'playlist_mincount': 34,
+    }, {
+        'url': 'http://www.polskieradio.pl/7/4807',
+        'info_dict': {
+            'id': '4807',
+            'title': 'Vademecum 1050. rocznicy Chrztu Polski'
+        },
+        'playlist_mincount': 5
+    }, {
+        'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
+        'only_matching': True
+    }, {
+        'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
+        'info_dict': {
+            'id': '4143',
+            'title': 'Kierunek Kraków',
+        },
+        'playlist_mincount': 61
+    }, {
+        'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA',
+        'only_matching': True
+    }]
+
+    def _get_entries_from_page_content(self, base_url, content):
+        entries = []
+
+        articles = re.findall(
+            r'<article class="ID-(\d+) article">\s+<a href="([^"]+)"( data-layer="[^"]*")? class="[^"]*" title="([^"]+)">',
+            content)
+        for article_id, article_url, _, article_title in articles:
+            resolved_article_url = compat_urlparse.urljoin(base_url, article_url)
+            entries.append(self.url_result(
+                resolved_article_url,
+                ie='PolskieRadio',
+                video_id=article_id,
+                video_title=article_title))
+
+        return entries
+
+    @classmethod
+    def suitable(cls, url):
+        return False if PolskieRadioIE.suitable(url) else super(PolskieRadioProgrammeIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        programme_id = self._match_id(url)
+        webpage = self._download_webpage(url, programme_id)
+
+        title = self._html_search_regex(
+            r'<a href="[^"]+" id=".*_linkCategory" title="[^"]+">(.+?)</a>',
+            webpage, 'title', fatal=False)
+        description = None
+
+        entries = self._get_entries_from_page_content(url, webpage)
+
+        pages = re.findall(r'<a( href="([^"]+/Strona/)\d+")? id="[^"]+" title="strona&#32;(\d+)"', webpage)
+        page_count = max(int(page_number) for _, _, page_number in pages) if pages else 1
+
+        if page_count > 1:
+            page_url_root = next(url for _, url, _ in pages if len(url) > 0)
+            for page_number in range(2, page_count + 1):
+                page_url = page_url_root + str(page_number)
+                resolved_page_url = compat_urlparse.urljoin(url, page_url)
+                page_content = self._download_webpage(
+                    resolved_page_url, programme_id,
+                    note="Downloading page number %d" % page_number)
+                entries.extend(self._get_entries_from_page_content(url, page_content))
+
+        return self.playlist_result(entries, programme_id, title, description)
+
+
 class PolskieRadioIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
     _TESTS = [{
 class PolskieRadioIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
     _TESTS = [{