[bbc] Add support for direct bbc.co.uk embeds

[youtube-dl] / youtube_dl / extractor / bbc.py
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py

index 2a0901ee457dd02f051a183434ba174a4593b1fb..c0433eabd5af2130c0e13d83370784f3cab4b036 100644 (file)
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -172,6 +172,7 @@ class BBCCoUkIE(InfoExtractor):
          supplier = connection.get('supplier')
          if protocol == 'http':
              href = connection.get('href')
+            transfer_format = connection.get('transferFormat')
              # ASX playlist
              if supplier == 'asx':
                  for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
@@ -179,6 +180,9 @@ class BBCCoUkIE(InfoExtractor):
                          'url': ref,
                          'format_id': 'ref%s_%s' % (i, supplier),
                      })
+            # Skip DASH until supported
+            elif transfer_format == 'dash':
+                pass
              # Direct link
              else:
                  formats.append({
@@ -423,7 +427,7 @@ class BBCIE(BBCCoUkIE):
      _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
  
      # fails with notukerror for some videos
-    #_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
+    # _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
      _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s'
  
      _TESTS = [{
@@ -446,6 +450,14 @@ class BBCIE(BBCCoUkIE):
          },
          'playlist_count': 9,
          'skip': 'Save time',
+    }, {
+        # article with multiple videos embedded with `new SMP()`
+        'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
+        'info_dict': {
+            'id': '3662a707-0af9-3149-963f-47bea720b460',
+            'title': 'BBC Blogs - Adam Curtis - BUGGER',
+        },
+        'playlist_count': 18,
      }, {
          # single video embedded with mediaAssetPage.init()
          'url': 'http://www.bbc.com/news/world-europe-32041533',
@@ -633,12 +645,30 @@ class BBCIE(BBCCoUkIE):
  
          playlist_title = self._html_search_regex(
              r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
-        playlist_description = self._og_search_description(webpage)
+        playlist_description = self._og_search_description(webpage, default=None)
+
+        def extract_all(pattern):
+            return list(filter(None, map(
+                lambda s: self._parse_json(s, playlist_id, fatal=False),
+                re.findall(pattern, webpage))))
+
+        # Multiple video article (e.g.
+        # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
+        EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]*)?'
+        entries = []
+        for match in extract_all(r'new\s+SMP\(({.+?})\)'):
+            embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
+            if embed_url and re.match(EMBED_URL, embed_url):
+                entries.append(embed_url)
+        entries.extend(re.findall(
+            r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
+        if entries:
+            return self.playlist_result(
+                [self.url_result(entry, 'BBCCoUk') for entry in entries],
+                playlist_id, playlist_title, playlist_description)
  
          # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
-        medias = list(filter(None, map(
-            lambda s: self._parse_json(s, playlist_id, fatal=False),
-            re.findall(r"data-media-meta='({[^']+})'", webpage))))
+        medias = extract_all(r"data-media-meta='({[^']+})'")
  
          if not medias:
              # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)