Fix some regexes

[youtube-dl] / youtube_dl / extractor / npo.py
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py

index be10fc48613a7646fe13a572565b9b3f0ff7c013..b8fe244071d05e1daac7514b932be148802c21a7 100644 (file)
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -28,17 +28,17 @@ class NPOBaseIE(InfoExtractor):
  
  class NPOIE(NPOBaseIE):
      IE_NAME = 'npo'
-    IE_DESC = 'npo.nl and ntr.nl'
+    IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl'
      _VALID_URL = r'''(?x)
                      (?:
                          npo:|
                          https?://
                              (?:www\.)?
                              (?:
-                                npo\.nl/(?!live|radio)(?:[^/]+/){2}|
+                                npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}|
                                  ntr\.nl/(?:[^/]+/){2,}|
                                  omroepwnl\.nl/video/fragment/[^/]+__|
-                                zapp\.nl/[^/]+/[^/]+/
+                                (?:zapp|npo3)\.nl/(?:[^/]+/){2}
                              )
                          )
                          (?P<id>[^/?#]+)
@@ -125,6 +125,18 @@ class NPOIE(NPOBaseIE):
          'params': {
              'skip_download': True,
          }
+    }, {
+        # audio
+        'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437',
+        'info_dict': {
+            'id': 'RBX_FUNX_6683215',
+            'ext': 'mp3',
+            'title': 'Jouw Stad Rotterdam',
+            'description': 'md5:db251505244f097717ec59fabc372d9f',
+        },
+        'params': {
+            'skip_download': True,
+        }
      }, {
          'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547',
          'only_matching': True,
@@ -134,10 +146,16 @@ class NPOIE(NPOBaseIE):
      }, {
          'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',
          'only_matching': True,
+    }, {
+        'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870',
+        'only_matching': True,
      }, {
          # live stream
          'url': 'npo:LI_NL1_4188102',
          'only_matching': True,
+    }, {
+        'url': 'http://www.npo.nl/radio-gaga/13-06-2017/BNN_101383373',
+        'only_matching': True,
      }]
  
      def _real_extract(self, url):
@@ -193,7 +211,7 @@ class NPOIE(NPOBaseIE):
                  })
  
              # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706
-            if item.get('contentType') == 'url':
+            if item.get('contentType') in ('url', 'audio'):
                  add_format_url(item_url)
                  continue
  
@@ -201,7 +219,7 @@ class NPOIE(NPOBaseIE):
                  stream_info = self._download_json(
                      item_url + '&type=json', video_id,
                      'Downloading %s stream JSON'
-                    % item.get('label') or format_id or num)
+                    % (item.get('label') or item.get('format') or format_id or num))
              except ExtractorError as ee:
                  if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
                      error = (self._parse_json(
@@ -301,9 +319,9 @@ class NPOIE(NPOBaseIE):
  
  class NPOLiveIE(NPOBaseIE):
      IE_NAME = 'npo.nl:live'
-    _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?npo\.nl/live(?:/(?P<id>[^/?#&]+))?'
  
-    _TEST = {
+    _TESTS = [{
          'url': 'http://www.npo.nl/live/npo-1',
          'info_dict': {
              'id': 'LI_NL1_4188102',
@@ -315,15 +333,18 @@ class NPOLiveIE(NPOBaseIE):
          'params': {
              'skip_download': True,
          }
-    }
+    }, {
+        'url': 'http://www.npo.nl/live',
+        'only_matching': True,
+    }]
  
      def _real_extract(self, url):
-        display_id = self._match_id(url)
+        display_id = self._match_id(url) or 'npo-1'
  
          webpage = self._download_webpage(url, display_id)
  
          live_id = self._search_regex(
-            r'data-prid="([^"]+)"', webpage, 'live id')
+            [r'media-id="([^"]+)"', r'data-prid="([^"]+)"'], webpage, 'live id')
  
          return {
              '_type': 'url_transparent',
@@ -448,7 +469,7 @@ class SchoolTVIE(NPODataMidEmbedIE):
  
  class HetKlokhuisIE(NPODataMidEmbedIE):
      IE_NAME = 'hetklokhuis'
-    _VALID_URL = r'https?://(?:www\.)?hetklokhuis.nl/[^/]+/\d+/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?hetklokhuis\.nl/[^/]+/\d+/(?P<id>[^/?#&]+)'
  
      _TEST = {
          'url': 'http://hetklokhuis.nl/tv-uitzending/3471/Zwaartekrachtsgolven',