Merge branch 'vlive' of https://github.com/ping/youtube-dl into ping-vlive

[youtube-dl] / youtube_dl / extractor / bbc.py
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py

index 2a0901ee457dd02f051a183434ba174a4593b1fb..abc5a44a1b97567dc3153896d572dc5e86716542 100644 (file)
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -20,7 +20,9 @@ class BBCCoUkIE(InfoExtractor):
      IE_DESC = 'BBC iPlayer'
      _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
  
-    _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'
+    _MEDIASELECTOR_URLS = [
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+    ]
  
      _TESTS = [
          {
@@ -162,6 +164,10 @@ class BBCCoUkIE(InfoExtractor):
          }
      ]
  
+    class MediaSelectionError(Exception):
+        def __init__(self, id):
+            self.id = id
+
      def _extract_asx_playlist(self, connection, programme_id):
          asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
          return [ref.get('href') for ref in asx.findall('./Entry/ref')]
@@ -172,6 +178,7 @@ class BBCCoUkIE(InfoExtractor):
          supplier = connection.get('supplier')
          if protocol == 'http':
              href = connection.get('href')
+            transfer_format = connection.get('transferFormat')
              # ASX playlist
              if supplier == 'asx':
                  for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
@@ -179,6 +186,9 @@ class BBCCoUkIE(InfoExtractor):
                          'url': ref,
                          'format_id': 'ref%s_%s' % (i, supplier),
                      })
+            # Skip DASH until supported
+            elif transfer_format == 'dash':
+                pass
              # Direct link
              else:
                  formats.append({
@@ -208,8 +218,7 @@ class BBCCoUkIE(InfoExtractor):
      def _extract_medias(self, media_selection):
          error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
          if error is not None:
-            raise ExtractorError(
-                '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
+            raise BBCCoUkIE.MediaSelectionError(error.get('id'))
          return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
  
      def _extract_connections(self, media):
@@ -266,9 +275,23 @@ class BBCCoUkIE(InfoExtractor):
              ]
          return subtitles
  
+    def _raise_extractor_error(self, media_selection_error):
+        raise ExtractorError(
+            '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
+            expected=True)
+
      def _download_media_selector(self, programme_id):
-        return self._download_media_selector_url(
-            self._MEDIASELECTOR_URL % programme_id, programme_id)
+        last_exception = None
+        for mediaselector_url in self._MEDIASELECTOR_URLS:
+            try:
+                return self._download_media_selector_url(
+                    mediaselector_url % programme_id, programme_id)
+            except BBCCoUkIE.MediaSelectionError as e:
+                if e.id == 'notukerror':
+                    last_exception = e
+                    continue
+                self._raise_extractor_error(e)
+        self._raise_extractor_error(last_exception)
  
      def _download_media_selector_url(self, url, programme_id=None):
          try:
@@ -293,7 +316,6 @@ class BBCCoUkIE(InfoExtractor):
                  formats.extend(self._extract_video(media, programme_id))
              elif kind == 'captions':
                  subtitles = self.extract_subtitles(media, programme_id)
-
          return formats, subtitles
  
      def _download_playlist(self, playlist_id):
@@ -422,9 +444,14 @@ class BBCIE(BBCCoUkIE):
      IE_DESC = 'BBC'
      _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
  
-    # fails with notukerror for some videos
-    #_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
-    _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s'
+    _MEDIASELECTOR_URLS = [
+        # Provides more formats, namely direct mp4 links, but fails on some videos with
+        # notukerror for non-UK (?) users (e.g.
+        # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+        'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
+        # Provides fewer formats, but works everywhere for everybody (hopefully)
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+    ]
  
      _TESTS = [{
          # article with multiple videos embedded with data-media-meta containing
@@ -446,12 +473,20 @@ class BBCIE(BBCCoUkIE):
          },
          'playlist_count': 9,
          'skip': 'Save time',
+    }, {
+        # article with multiple videos embedded with `new SMP()`
+        'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
+        'info_dict': {
+            'id': '3662a707-0af9-3149-963f-47bea720b460',
+            'title': 'BBC Blogs - Adam Curtis - BUGGER',
+        },
+        'playlist_count': 18,
      }, {
          # single video embedded with mediaAssetPage.init()
          'url': 'http://www.bbc.com/news/world-europe-32041533',
          'info_dict': {
              'id': 'p02mprgb',
-            'ext': 'flv',
+            'ext': 'mp4',
              'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
              'duration': 47,
              'timestamp': 1427219242,
@@ -491,6 +526,18 @@ class BBCIE(BBCCoUkIE):
          'params': {
              'skip_download': True,
          }
+    }, {
+        # single video from video playlist embedded with vxp-playlist-data JSON
+        'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
+        'info_dict': {
+            'id': 'p02w6qjc',
+            'ext': 'mp4',
+            'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
+            'duration': 56,
+        },
+        'params': {
+            'skip_download': True,
+        }
      }, {
          # single video story with digitalData
          'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
@@ -511,7 +558,7 @@ class BBCIE(BBCCoUkIE):
          'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
          'info_dict': {
              'id': 'p018zqqg',
-            'ext': 'flv',
+            'ext': 'mp4',
              'title': 'Hyundai Santa Fe Sport: Rock star',
              'description': 'md5:b042a26142c4154a6e472933cf20793d',
              'timestamp': 1368473503,
@@ -526,7 +573,7 @@ class BBCIE(BBCCoUkIE):
          'url': 'http://www.bbc.com/sport/0/football/33653409',
          'info_dict': {
              'id': 'p02xycnp',
-            'ext': 'flv',
+            'ext': 'mp4',
              'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
              'description': 'md5:398fca0e2e701c609d726e034fa1fc89',
              'duration': 140,
@@ -633,22 +680,63 @@ class BBCIE(BBCCoUkIE):
  
          playlist_title = self._html_search_regex(
              r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
-        playlist_description = self._og_search_description(webpage)
+        playlist_description = self._og_search_description(webpage, default=None)
+
+        def extract_all(pattern):
+            return list(filter(None, map(
+                lambda s: self._parse_json(s, playlist_id, fatal=False),
+                re.findall(pattern, webpage))))
+
+        # Multiple video article (e.g.
+        # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
+        EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?'
+        entries = []
+        for match in extract_all(r'new\s+SMP\(({.+?})\)'):
+            embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
+            if embed_url and re.match(EMBED_URL, embed_url):
+                entries.append(embed_url)
+        entries.extend(re.findall(
+            r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
+        if entries:
+            return self.playlist_result(
+                [self.url_result(entry, 'BBCCoUk') for entry in entries],
+                playlist_id, playlist_title, playlist_description)
  
          # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
-        medias = list(filter(None, map(
-            lambda s: self._parse_json(s, playlist_id, fatal=False),
-            re.findall(r"data-media-meta='({[^']+})'", webpage))))
+        medias = extract_all(r"data-media-meta='({[^']+})'")
  
          if not medias:
              # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
-            media_asset_page = self._parse_json(
+            media_asset = self._search_regex(
+                r'mediaAssetPage\.init\(\s*({.+?}), "/',
+                webpage, 'media asset', default=None)
+            if media_asset:
+                media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
+                medias = []
+                for video in media_asset_page.get('videos', {}).values():
+                    medias.extend(video.values())
+
+        if not medias:
+            # Multiple video playlist with single `now playing` entry (e.g.
+            # http://www.bbc.com/news/video_and_audio/must_see/33767813)
+            vxp_playlist = self._parse_json(
                  self._search_regex(
-                    r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset'),
+                    r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
+                    webpage, 'playlist data'),
                  playlist_id)
-            medias = []
-            for video in media_asset_page.get('videos', {}).values():
-                medias.extend(video.values())
+            playlist_medias = []
+            for item in vxp_playlist:
+                media = item.get('media')
+                if not media:
+                    continue
+                playlist_medias.append(media)
+                # Download only this video if a media item's asset id matches the video id from the URL
+                if item.get('advert', {}).get('assetId') == playlist_id:
+                    medias = [media]
+                    break
+            # Fallback to the whole playlist
+            if not medias:
+                medias = playlist_medias
  
          entries = []
          for num, media_meta in enumerate(medias, start=1):