[bbc] extract more and better qualities from Unified Streaming Platform m3u8 manifests
[youtube-dl] / youtube_dl / extractor / bbc.py
index e62b3860e99b106d08ef79cf593e180fe8c9496c..50c1da185b74695f9197826e7e217fbaffe495c8 100644 (file)
@@ -31,7 +31,7 @@ class BBCCoUkIE(InfoExtractor):
                             music/clips[/#]|
                             radio/player/
                         )
-                        (?P<id>%s)
+                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                     ''' % _ID_REGEX
 
     _MEDIASELECTOR_URLS = [
@@ -44,6 +44,8 @@ class BBCCoUkIE(InfoExtractor):
 
     _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
     _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
+    # Unified Streaming Platform
+    _USP_RE = r'/([^/]+)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
 
     _NAMESPACES = (
         _MEDIASELECTION_NS,
@@ -55,12 +57,11 @@ class BBCCoUkIE(InfoExtractor):
             'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
             'info_dict': {
                 'id': 'b039d07m',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
                 'description': 'The Canadian poet and songwriter reflects on his musical career.',
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             }
         },
@@ -92,7 +93,7 @@ class BBCCoUkIE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
-            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+            'skip': 'this episode is not currently available',
         },
         {
             'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
@@ -107,7 +108,7 @@ class BBCCoUkIE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
-            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+            'skip': 'this episode is not currently available',
         }, {
             'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
             'info_dict': {
@@ -127,13 +128,12 @@ class BBCCoUkIE(InfoExtractor):
             'note': 'Audio',
             'info_dict': {
                 'id': 'p022h44j',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
                 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
                 'duration': 227,
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             }
         }, {
@@ -141,13 +141,12 @@ class BBCCoUkIE(InfoExtractor):
             'note': 'Video',
             'info_dict': {
                 'id': 'p025c103',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
                 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
                 'duration': 226,
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             }
         }, {
@@ -163,7 +162,7 @@ class BBCCoUkIE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
-            'skip': 'geolocation',
+            'skip': 'this episode is not currently available',
         }, {
             'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
             'info_dict': {
@@ -177,7 +176,7 @@ class BBCCoUkIE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
-            'skip': 'geolocation',
+            'skip': 'this episode is not currently available',
         }, {
             # iptv-all mediaset fails with geolocation however there is no geo restriction
             # for this programme at all
@@ -192,17 +191,17 @@ class BBCCoUkIE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'this episode is not currently available on BBC iPlayer Radio',
         }, {
             # compact player (https://github.com/rg3/youtube-dl/issues/8147)
             'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
             'info_dict': {
                 'id': 'p028bfkj',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
                 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             },
         }, {
@@ -247,9 +246,15 @@ class BBCCoUkIE(InfoExtractor):
             elif transfer_format == 'dash':
                 pass
             elif transfer_format == 'hls':
-                formats.extend(self._extract_m3u8_formats(
+                is_unified_streaming = re.search(self._USP_RE, href)
+                if is_unified_streaming:
+                    href = re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href)
+                m3u8_formats = self._extract_m3u8_formats(
                     href, programme_id, ext='mp4', entry_protocol='m3u8_native',
-                    m3u8_id=supplier, fatal=False))
+                    m3u8_id=supplier, fatal=False)
+                if is_unified_streaming:
+                    self._check_formats(m3u8_formats, programme_id)
+                formats.extend(m3u8_formats)
             # Direct link
             else:
                 formats.append({
@@ -304,13 +309,14 @@ class BBCCoUkIE(InfoExtractor):
         for connection in self._extract_connections(media):
             conn_formats = self._extract_connection(connection, programme_id)
             for format in conn_formats:
-                format.update({
-                    'width': width,
-                    'height': height,
-                    'vbr': vbr,
-                    'vcodec': vcodec,
-                    'filesize': file_size,
-                })
+                if format.get('protocol') != 'm3u8_native':
+                    format.update({
+                        'width': width,
+                        'height': height,
+                        'vbr': vbr,
+                        'vcodec': vcodec,
+                        'filesize': file_size,
+                    })
                 if service:
                     format['format_id'] = '%s_%s' % (service, format['format_id'])
             formats.extend(conn_formats)
@@ -328,6 +334,7 @@ class BBCCoUkIE(InfoExtractor):
                     'format_id': '%s_%s' % (service, format['format_id']),
                     'abr': abr,
                     'acodec': acodec,
+                    'vcodec': 'none',
                 })
             formats.extend(conn_formats)
         return formats
@@ -670,6 +677,7 @@ class BBCIE(BBCCoUkIE):
         'info_dict': {
             'id': '34475836',
             'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
+            'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
         },
         'playlist_count': 3,
     }, {
@@ -688,11 +696,17 @@ class BBCIE(BBCCoUkIE):
         # custom redirection to www.bbc.com
         'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
         'only_matching': True,
+    }, {
+        # single video article embedded with data-media-vpid
+        'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
+        'only_matching': True,
     }]
 
     @classmethod
     def suitable(cls, url):
-        return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url)
+        EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
+        return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
+                else super(BBCIE, cls).suitable(url))
 
     def _extract_from_media_meta(self, media_meta, video_id):
         # Direct links to media in media metadata (e.g.
@@ -817,7 +831,7 @@ class BBCIE(BBCCoUkIE):
 
         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
         programme_id = self._search_regex(
-            [r'data-video-player-vpid="(%s)"' % self._ID_REGEX,
+            [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
              r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
              r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
             webpage, 'vpid', default=None)
@@ -942,7 +956,7 @@ class BBCIE(BBCCoUkIE):
 
 
 class BBCCoUkArticleIE(InfoExtractor):
-    _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
+    _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
     IE_NAME = 'bbc.co.uk:article'
     IE_DESC = 'BBC articles'
 
@@ -969,3 +983,72 @@ class BBCCoUkArticleIE(InfoExtractor):
             r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]
 
         return self.playlist_result(entries, playlist_id, title, description)
+
+
+class BBCCoUkPlaylistBaseIE(InfoExtractor):  # Shared playlist extraction; subclasses provide _URL_TEMPLATE, _VIDEO_ID_TEMPLATE and _extract_title_and_description()
+    def _real_extract(self, url):  # Build a playlist result from the video ids matched on the page at `url`
+        playlist_id = self._match_id(url)  # playlist id is derived from the URL itself
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [  # one url_result per regex match, in re.findall order; no de-duplication happens here
+            self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())  # each entry is tagged with BBCCoUkIE's key
+            for video_id in re.findall(
+                self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)]  # subclass template filled in with the shared id regex
+
+        title, description = self._extract_title_and_description(webpage)  # subclass hook returning a (title, description) pair
+
+        return self.playlist_result(entries, playlist_id, title, description)
+
+
+class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):  # Playlist extractor for bbc.co.uk/iplayer/episodes/<id> pages
+    IE_NAME = 'bbc.co.uk:iplayer:playlist'
+    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/episodes/(?P<id>%s)' % BBCCoUkIE._ID_REGEX  # id group reuses BBCCoUkIE._ID_REGEX
+    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'  # per-entry URL; %s receives each matched video id
+    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'  # ids are read from data-ip-id attributes in the page markup
+    _TEST = {
+        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
+        'info_dict': {
+            'id': 'b05rcz9v',
+            'title': 'The Disappearance',
+            'description': 'French thriller serial about a missing teenager.',
+        },
+        'playlist_mincount': 6,
+    }
+
+    def _extract_title_and_description(self, webpage):  # Return (title, description) scraped from the page HTML
+        title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)  # text of an attribute-less <h1>; fatal=False presumably tolerates a miss - confirm against _search_regex
+        description = self._search_regex(
+            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
+            webpage, 'description', fatal=False, group='value')  # takes the named 'value' group of the <p class="subtitle"> match
+        return title, description
+
+
+class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):  # Playlist extractor for bbc.co.uk/programmes/<id>/(episodes|broadcasts|clips) pages
+    IE_NAME = 'bbc.co.uk:playlist'
+    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX  # pattern is not end-anchored, so longer paths (see the only_matching tests) also match
+    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'  # per-entry URL; %s receives each matched pid
+    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'  # ids are read from data-pid attributes in the page markup
+    _TESTS = [{
+        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
+        'info_dict': {
+            'id': 'b05rcz9v',
+            'title': 'The Disappearance - Clips - BBC Four',
+            'description': 'French thriller serial about a missing teenager.',
+        },
+        'playlist_mincount': 7,
+    }, {
+        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',  # NOTE(review): same URL as the first test entry above - looks redundant, confirm
+        'only_matching': True,
+    }, {
+        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
+        'only_matching': True,
+    }]
+
+    def _extract_title_and_description(self, webpage):  # Return (title, description) via the og-search helpers
+        title = self._og_search_title(webpage, fatal=False)
+        description = self._og_search_description(webpage)  # NOTE(review): no fatal argument here, unlike the title lookup - confirm the helper's default
+        return title, description