[brightcove] Extract more formats (#8862)

[youtube-dl] / youtube_dl / extractor / brightcove.py
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py

index c6ad1d065512a138b6f4535cce577eb81238debf..c9e43a2751f1f94a622895cd7dfbdabfa3d0355f 100644 (file)
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -3,35 +3,36 @@ from __future__ import unicode_literals
  
  import re
  import json
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..compat import (
+    compat_etree_fromstring,
      compat_parse_qs,
      compat_str,
-    compat_urllib_parse,
      compat_urllib_parse_urlparse,
-    compat_urllib_request,
      compat_urlparse,
      compat_xml_parse_error,
+    compat_HTTPError,
  )
  from ..utils import (
      determine_ext,
      ExtractorError,
      find_xpath_attr,
      fix_xml_ampersands,
-    unescapeHTML,
-    unsmuggle_url,
+    float_or_none,
      js_to_json,
      int_or_none,
      parse_iso8601,
-    extract_attributes,
+    unescapeHTML,
+    unsmuggle_url,
+    update_url_query,
  )
  
  
-class BrightcoveIE(InfoExtractor):
+class BrightcoveLegacyIE(InfoExtractor):
+    IE_NAME = 'brightcove:legacy'
      _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
-    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
+    _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated'
  
      _TESTS = [
          {
@@ -123,7 +124,7 @@ class BrightcoveIE(InfoExtractor):
          object_str = fix_xml_ampersands(object_str)
  
          try:
-            object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
+            object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
          except compat_xml_parse_error:
              return
  
@@ -135,13 +136,16 @@ class BrightcoveIE(InfoExtractor):
          else:
              flashvars = {}
  
+        data_url = object_doc.attrib.get('data', '')
+        data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query)
+
          def find_param(name):
              if name in flashvars:
                  return flashvars[name]
              node = find_xpath_attr(object_doc, './param', 'name', name)
              if node is not None:
                  return node.attrib['value']
-            return None
+            return data_url_params.get(name)
  
          params = {}
  
@@ -154,8 +158,8 @@ class BrightcoveIE(InfoExtractor):
          # Not all pages define this value
          if playerKey is not None:
              params['playerKey'] = playerKey
-        # The three fields hold the id of the video
-        videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')
+        # These fields hold the id of the video
+        videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList')
          if videoPlayer is not None:
              params['@videoPlayer'] = videoPlayer
          linkBase = find_param('linkBaseURL')
@@ -183,8 +187,7 @@ class BrightcoveIE(InfoExtractor):
  
      @classmethod
      def _make_brightcove_url(cls, params):
-        data = compat_urllib_parse.urlencode(params)
-        return cls._FEDERATED_URL_TEMPLATE % data
+        return update_url_query(cls._FEDERATED_URL, params)  # replaces the manual urlencode + '?%s' template; presumably merges params into the URL's query string (confirm in utils.update_url_query)
  
      @classmethod
      def _extract_brightcove_url(cls, webpage):
@@ -238,7 +241,7 @@ class BrightcoveIE(InfoExtractor):
              # We set the original url as the default 'Referer' header
              referer = smuggled_data.get('Referer', url)
              return self._get_video_info(
-                videoPlayer[0], query_str, query, referer=referer)
+                videoPlayer[0], query, referer=referer)
          elif 'playerKey' in query:
              player_key = query['playerKey']
              return self._get_playlist_info(player_key[0])
@@ -247,15 +250,14 @@ class BrightcoveIE(InfoExtractor):
                  'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?',
                  expected=True)
  
-    def _get_video_info(self, video_id, query_str, query, referer=None):
-        request_url = self._FEDERATED_URL_TEMPLATE % query_str
-        req = compat_urllib_request.Request(request_url)
+    def _get_video_info(self, video_id, query, referer=None):
+        headers = {}
          linkBase = query.get('linkBaseURL')
          if linkBase is not None:
              referer = linkBase[0]
          if referer is not None:
-            req.add_header('Referer', referer)
-        webpage = self._download_webpage(req, video_id)
+            headers['Referer'] = referer
+        webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query)
  
          error_msg = self._html_search_regex(
              r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage,
@@ -295,7 +297,7 @@ class BrightcoveIE(InfoExtractor):
              'uploader': video_info.get('publisherName'),
          }
  
-        renditions = video_info.get('renditions')
+        renditions = video_info.get('renditions', []) + video_info.get('IOSRenditions', [])
          if renditions:
              formats = []
              for rend in renditions:
@@ -317,13 +319,23 @@ class BrightcoveIE(InfoExtractor):
                  if ext is None:
                      ext = determine_ext(url)
                  size = rend.get('size')
-                formats.append({
+                a_format = {
                      'url': url,
                      'ext': ext,
                      'height': rend.get('frameHeight'),
                      'width': rend.get('frameWidth'),
                      'filesize': size if size != 0 else None,
-                })
+                }
+
+                # m3u8 manifests with remote == false are media playlists
+                # Not calling _extract_m3u8_formats here to save network traffic
+                if ext == 'm3u8':
+                    a_format.update({
+                        'ext': 'mp4',
+                        'protocol': 'm3u8',
+                    })
+
+                formats.append(a_format)
              self._sort_formats(formats)
              info['formats'] = formats
          elif video_info.get('FLVFullLengthURL') is not None:
@@ -352,9 +364,10 @@ class BrightcoveIE(InfoExtractor):
          return info
  
  
-class BrightcoveInPageEmbedIE(InfoExtractor):
-    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P<video_id>\d+)'
-    _TEST = {
+class BrightcoveNewIE(InfoExtractor):
+    IE_NAME = 'brightcove:new'
+    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)'
+    _TESTS = [{
          'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
          'md5': 'c8100925723840d4b0d243f7025703be',
          'info_dict': {
@@ -362,82 +375,189 @@ class BrightcoveInPageEmbedIE(InfoExtractor):
              'ext': 'mp4',
              'title': 'Meet the man behind Popcorn Time',
              'description': 'md5:eac376a4fe366edc70279bfb681aea16',
+            'duration': 165.768,
              'timestamp': 1441391203,
              'upload_date': '20150904',
-            'duration': 165768,
              'uploader_id': '929656772001',
+            'formats': 'mincount:22',
+        },
+    }, {
+        # with rtmp streams
+        'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001',
+        'info_dict': {
+            'id': '4279049078001',
+            'ext': 'mp4',
+            'title': 'Titansgrave: Chapter 0',
+            'description': 'Titansgrave: Chapter 0',
+            'duration': 1242.058,
+            'timestamp': 1433556729,
+            'upload_date': '20150606',
+            'uploader_id': '4036320279001',
+            'formats': 'mincount:41',
+        },
+        'params': {
+            'skip_download': True,
          }
-    }
+    }, {
+        # ref: prefixed video id
+        'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442',
+        'only_matching': True,
+    }, {
+        # non-numeric ref: prefixed video id
+        'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356',
+        'only_matching': True,
+    }]
  
      @staticmethod
      def _extract_url(webpage):
-        video_attributes = re.search(r'(?s)<video([^>]*)>.*?</(?:video|audio)>', webpage)
-        if video_attributes:
-            video_attributes = extract_attributes(video_attributes.group(), r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']')
-            account_id = video_attributes.get('account')
-            player_id = video_attributes.get('player')
-            embed = video_attributes.get('embed')
-            video_id = video_attributes.get('video-id')
-            if account_id and player_id and embed and video_id:
-                return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id)
-        return None
+        urls = BrightcoveNewIE._extract_urls(webpage)  # single-URL variant: delegate to the multi-URL scanner
+        return urls[0] if urls else None  # first embed found, or None when the page contains none
+
+    @staticmethod
+    def _extract_urls(webpage):
+        # References:
+        # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
+        # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
+        # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html
+        # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
+
+        entries = []  # accumulated player URLs: iframe embeds first, then in-page embeds
+
+        # Look for iframe embeds [1]
+        for _, url in re.findall(  # first group is only the quote character matched by the \1 backreference
+                r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
+            entries.append(url if url.startswith('http') else 'http:' + url)  # protocol-relative src (//players...) gets an explicit http: scheme
+
+        # Look for embed_in_page embeds [2]
+        for video_id, account_id, player_id, embed in re.findall(  # tuple order follows the capture groups: video id from <video>, the rest from the <script> src
+                # According to examples from [3] it's unclear whether video id
+                # may be optional and what to do when it is
+                # According to [4] data-video-id may be prefixed with ref:
+                r'''(?sx)
+                    <video[^>]+
+                        data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*?
+                    </video>.*?
+                    <script[^>]+
+                        src=["\'](?:https?:)?//players\.brightcove\.net/
+                        (\d+)/([\da-f-]+)_([^/]+)/index(?:\.min)?\.js
+                ''', webpage):
+            entries.append(  # rebuild the index.html?videoId=... form that _VALID_URL parses
+                'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
+                % (account_id, player_id, embed, video_id))
+
+        return entries  # empty list when no embed was found
  
      def _real_extract(self, url):
          account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups()
  
-        webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id)
-
-        catalog = self._parse_json(
-            js_to_json(
-                self._search_regex(
-                    r'catalog\(({[^}]+})\);',
-                    webpage,
-                    'catalog'
-                )
-            ),
-            video_id
-        )
-        policy_key = catalog['policyKey']
-
-        req = compat_urllib_request.Request(
-            'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id),
-            headers={'Accept': 'application/json;pk=%s' % policy_key})
-        json_data = self._download_json(req, video_id)
+        webpage = self._download_webpage(
+            'http://players.brightcove.net/%s/%s_%s/index.min.js'
+            % (account_id, player_id, embed), video_id)
+
+        policy_key = None
+
+        catalog = self._search_regex(
+            r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
+        if catalog:
+            catalog = self._parse_json(
+                js_to_json(catalog), video_id, fatal=False)
+            if catalog:
+                policy_key = catalog.get('policyKey')
+
+        if not policy_key:
+            policy_key = self._search_regex(
+                r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
+                webpage, 'policy key', group='pk')
+
+        api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id)
+        try:
+            json_data = self._download_json(api_url, video_id, headers={
+                'Accept': 'application/json;pk=%s' % policy_key
+            })
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                json_data = self._parse_json(e.cause.read().decode(), video_id)
+                raise ExtractorError(json_data[0]['message'], expected=True)
+            raise
  
          title = json_data['name']
-        description = json_data.get('description')
-        thumbnail = json_data.get('thumbnail')
-        timestamp = parse_iso8601(json_data.get('published_at'))
-        duration = int_or_none(json_data.get('duration'))
  
          formats = []
-        for source in json_data.get('sources'):
+        for source in json_data.get('sources', []):
+            container = source.get('container')
              source_type = source.get('type')
-            if source_type == 'application/x-mpegURL':
-                formats.extend(self._extract_m3u8_formats(source.get('src'), video_id))
+            src = source.get('src')
+            if source_type == 'application/x-mpegURL' or container == 'M2TS':
+                if not src:
+                    continue
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', m3u8_id='hls', fatal=False))
+            elif source_type == 'application/dash+xml':
+                if not src:
+                    continue
+                formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False))
              else:
-                src = source.get('src') or source.get('streaming_src')
-                if src:
-                    formats.append({
-                        'url': src,
-                        'tbr': source.get('avg_bitrate'),
-                        'width': int_or_none(source.get('width')),
-                        'height': int_or_none(source.get('height')),
-                        'filesize': source.get('size'),
-                        'container': source.get('container'),
+                streaming_src = source.get('streaming_src')
+                stream_name, app_name = source.get('stream_name'), source.get('app_name')
+                if not src and not streaming_src and (not stream_name or not app_name):
+                    continue
+                tbr = float_or_none(source.get('avg_bitrate'), 1000)
+                height = int_or_none(source.get('height'))
+                width = int_or_none(source.get('width'))
+                f = {
+                    'tbr': tbr,
+                    'filesize': int_or_none(source.get('size')),
+                    'container': container,
+                    'ext': container.lower(),
+                }
+                if width == 0 and height == 0:
+                    f.update({
+                        'vcodec': 'none',
+                    })
+                else:
+                    f.update({
+                        'width': width,
+                        'height': height,
                          'vcodec': source.get('codec'),
-                        'ext': source.get('container').lower(),
                      })
  
+                def build_format_id(kind):
+                    format_id = kind
+                    if tbr:
+                        format_id += '-%dk' % int(tbr)
+                    if height:
+                        format_id += '-%dp' % height
+                    return format_id
+
+                if src or streaming_src:
+                    f.update({
+                        'url': src or streaming_src,
+                        'format_id': build_format_id('http' if src else 'http-streaming'),
+                        'preference': 2 if src else 1,
+                    })
+                else:
+                    f.update({
+                        'url': app_name,
+                        'play_path': stream_name,
+                        'format_id': build_format_id('rtmp'),
+                    })
+                formats.append(f)
          self._sort_formats(formats)
  
+        description = json_data.get('description')
+        thumbnail = json_data.get('thumbnail')
+        timestamp = parse_iso8601(json_data.get('published_at'))
+        duration = float_or_none(json_data.get('duration'), 1000)
+        tags = json_data.get('tags', [])
+
          return {
              'id': video_id,
              'title': title,
              'description': description,
              'thumbnail': thumbnail,
-            'timestamp': timestamp,
              'duration': duration,
-            'formats': formats,
+            'timestamp': timestamp,
              'uploader_id': account_id,
+            'formats': formats,
+            'tags': tags,
          }