[bbc] Extend vpid regex (Closes #9003)

[youtube-dl] / youtube_dl / extractor / bbc.py
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py

index 6ddee686ce3e4dafb0e02ab3cb697a95f6089ac3..dedf721bde2724cb7f5a06186584069b3f5221a1 100644 (file)
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -10,7 +10,6 @@ from ..utils import (
      int_or_none,
      parse_duration,
      parse_iso8601,
-    remove_end,
      unescapeHTML,
  )
  from ..compat import (
@@ -86,7 +85,7 @@ class BBCCoUkIE(InfoExtractor):
                  'id': 'b00yng1d',
                  'ext': 'flv',
                  'title': 'The Voice UK: Series 3: Blind Auditions 5',
-                'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
+                'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
                  'duration': 5100,
              },
              'params': {
@@ -561,7 +560,7 @@ class BBCIE(BBCCoUkIE):
          'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
          'info_dict': {
              'id': '3662a707-0af9-3149-963f-47bea720b460',
-            'title': 'BBC Blogs - Adam Curtis - BUGGER',
+            'title': 'BUGGER',
          },
          'playlist_count': 18,
      }, {
@@ -670,9 +669,17 @@ class BBCIE(BBCCoUkIE):
          'url': 'http://www.bbc.com/sport/0/football/34475836',
          'info_dict': {
              'id': '34475836',
-            'title': 'What Liverpool can expect from Klopp',
+            'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
          },
          'playlist_count': 3,
+    }, {
+        # school report article with single video
+        'url': 'http://www.bbc.co.uk/schoolreport/35744779',
+        'info_dict': {
+            'id': '35744779',
+            'title': 'School which breaks down barriers in Jerusalem',
+        },
+        'playlist_count': 1,
      }, {
          # single video with playlist URL from weather section
          'url': 'http://www.bbc.com/weather/features/33601775',
@@ -681,6 +688,10 @@ class BBCIE(BBCCoUkIE):
          # custom redirection to www.bbc.com
          'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
          'only_matching': True,
+    }, {
+        # single video article embedded with data-media-vpid
+        'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
+        'only_matching': True,
      }]
  
      @classmethod
@@ -735,8 +746,17 @@ class BBCIE(BBCCoUkIE):
  
          json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
          timestamp = json_ld_info.get('timestamp')
+
          playlist_title = json_ld_info.get('title')
-        playlist_description = json_ld_info.get('description')
+        if not playlist_title:
+            playlist_title = self._og_search_title(
+                webpage, default=None) or self._html_search_regex(
+                r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+            if playlist_title:
+                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+
+        playlist_description = json_ld_info.get(
+            'description') or self._og_search_description(webpage, default=None)
  
          if not timestamp:
              timestamp = parse_iso8601(self._search_regex(
@@ -797,13 +817,11 @@ class BBCIE(BBCCoUkIE):
                                  playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
  
          if entries:
-            playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News')
-            playlist_description = playlist_description or self._og_search_description(webpage, default=None)
              return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
  
          # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
          programme_id = self._search_regex(
-            [r'data-video-player-vpid="(%s)"' % self._ID_REGEX,
+            [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
               r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
               r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
              webpage, 'vpid', default=None)
@@ -829,10 +847,6 @@ class BBCIE(BBCCoUkIE):
                  'subtitles': subtitles,
              }
  
-        playlist_title = self._html_search_regex(
-            r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
-        playlist_description = self._og_search_description(webpage, default=None)
-
          def extract_all(pattern):
              return list(filter(None, map(
                  lambda s: self._parse_json(s, playlist_id, fatal=False),
@@ -932,7 +946,7 @@ class BBCIE(BBCCoUkIE):
  
  
  class BBCCoUkArticleIE(InfoExtractor):
-    _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
+    _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
      IE_NAME = 'bbc.co.uk:article'
      IE_DESC = 'BBC articles'