[downloader/http] Simplify
[youtube-dl] / youtube_dl / extractor / bbc.py
index 0f0ea7cfdd382e16d667012e4244403c52701b27..9a1b6e3dce7dd3247b0076b36280e7e4e0550c90 100644 (file)
@@ -14,12 +14,15 @@ from ..utils import (
 )
 from ..compat import compat_HTTPError
 
+
 class BBCCoUkIE(InfoExtractor):
     IE_NAME = 'bbc.co.uk'
     IE_DESC = 'BBC iPlayer'
     _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
 
-    _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'
+    _MEDIASELECTOR_URLS = [
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+    ]
 
     _TESTS = [
         {
@@ -161,6 +164,10 @@ class BBCCoUkIE(InfoExtractor):
         }
     ]
 
+    class MediaSelectionError(Exception):  # raised when a media selection document contains an <error> element
+        def __init__(self, id):  # id: value of the error element's 'id' attribute (compared against 'notukerror' by the caller)
+            self.id = id
+
     def _extract_asx_playlist(self, connection, programme_id):
         asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
         return [ref.get('href') for ref in asx.findall('./Entry/ref')]
@@ -211,8 +218,7 @@ class BBCCoUkIE(InfoExtractor):
     def _extract_medias(self, media_selection):
         error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
         if error is not None:
-            raise ExtractorError(
-                '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
+            raise BBCCoUkIE.MediaSelectionError(error.get('id'))  # typed error carrying the id; _download_media_selector catches this type
         return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
 
     def _extract_connections(self, media):
@@ -269,17 +275,23 @@ class BBCCoUkIE(InfoExtractor):
             ]
         return subtitles
 
+    def _raise_extractor_error(self, media_selection_error):  # always raises: wraps the error's id in an ExtractorError message
+        raise ExtractorError(
+            '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
+            expected=True)
+
     def _download_media_selector(self, programme_id):
-        try:
-            return self._download_media_selector_url(
-                self._MEDIASELECTOR_URL % programme_id, programme_id)
-        except ExtractorError as e:
-            if hasattr(self, '_MEDIASELECTOR_ALT_URL') and str(e) == 'bbc returned error: notukerror':
-                 # notukerror on bbc.com/travel using bbc news mediaselector: fallback to /mediaselector/5/
-                 return self._download_media_selector_url(
-                     self._MEDIASELECTOR_ALT_URL % programme_id, programme_id)
-            else:
-                 raise
+        last_exception = None  # most recent 'notukerror'; reported if every URL fails that way
+        for mediaselector_url in self._MEDIASELECTOR_URLS:  # tried in list order; the first successful download is returned
+            try:
+                return self._download_media_selector_url(
+                    mediaselector_url % programme_id, programme_id)
+            except BBCCoUkIE.MediaSelectionError as e:
+                if e.id == 'notukerror':  # only this error id falls through to the next URL
+                    last_exception = e
+                    continue
+                self._raise_extractor_error(e)  # any other selector error aborts immediately
+        self._raise_extractor_error(last_exception)  # NOTE(review): last_exception is still None here if _MEDIASELECTOR_URLS is empty
 
     def _download_media_selector_url(self, url, programme_id=None):
         try:
@@ -432,10 +444,14 @@ class BBCIE(BBCCoUkIE):
     IE_DESC = 'BBC'
     _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
 
-    # fails with notukerror for some videos ( non news sites such as bbc.com/travel )
-    _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
-    # limited selection of formats but may work where the above does not
-    _MEDIASELECTOR_ALT_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s'
+    _MEDIASELECTOR_URLS = [
+        # Provides more formats, namely direct mp4 links, but fails on some videos with
+        # notukerror for non-UK (?) users (e.g.
+        # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+        'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
+        # Provides fewer formats, but works everywhere for everybody (hopefully)
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+    ]
 
     _TESTS = [{
         # article with multiple videos embedded with data-media-meta containing
@@ -457,6 +473,14 @@ class BBCIE(BBCCoUkIE):
         },
         'playlist_count': 9,
         'skip': 'Save time',
+    }, {
+        # article with multiple videos embedded with `new SMP()`
+        'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
+        'info_dict': {
+            'id': '3662a707-0af9-3149-963f-47bea720b460',
+            'title': 'BBC Blogs - Adam Curtis - BUGGER',
+        },
+        'playlist_count': 18,
     }, {
         # single video embedded with mediaAssetPage.init()
         'url': 'http://www.bbc.com/news/world-europe-32041533',
@@ -644,12 +668,30 @@ class BBCIE(BBCCoUkIE):
 
         playlist_title = self._html_search_regex(
             r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
-        playlist_description = self._og_search_description(webpage)
+        playlist_description = self._og_search_description(webpage, default=None)
+
+        def extract_all(pattern):  # run self._parse_json on every re.findall match of pattern in webpage
+            return list(filter(None, map(  # filter(None, ...) drops falsy results (presumably failed parses, given fatal=False - confirm)
+                lambda s: self._parse_json(s, playlist_id, fatal=False),
+                re.findall(pattern, webpage))))
+
+        # Multiple video article (e.g.
+        # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
+        EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?'
+        entries = []
+        for match in extract_all(r'new\s+SMP\(({.+?})\)'):
+            embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
+            if embed_url and re.match(EMBED_URL, embed_url):
+                entries.append(embed_url)
+        entries.extend(re.findall(
+            r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
+        if entries:
+            return self.playlist_result(
+                [self.url_result(entry, 'BBCCoUk') for entry in entries],
+                playlist_id, playlist_title, playlist_description)
 
         # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
-        medias = list(filter(None, map(
-            lambda s: self._parse_json(s, playlist_id, fatal=False),
-            re.findall(r"data-media-meta='({[^']+})'", webpage))))
+        medias = extract_all(r"data-media-meta='({[^']+})'")
 
         if not medias:
             # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)