X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fbrightcove.py;h=2c7d968a848a2c9d4dbad8960aeac2e6f0b55cfd;hb=a2973eb59733c5f86a249c627d654b789020bc7d;hp=0733bece7c45880ab5c20b916d5bd8c9700da548;hpb=d47c26e16803abc1d15a677d88bbee78f7680db6;p=youtube-dl diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0733bece7..2c7d968a8 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -3,16 +3,17 @@ from __future__ import unicode_literals import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_parse_qs, compat_str, compat_urllib_parse, compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, ) from ..utils import ( determine_ext, @@ -21,6 +22,10 @@ from ..utils import ( fix_xml_ampersands, unescapeHTML, unsmuggle_url, + js_to_json, + int_or_none, + parse_iso8601, + extract_attributes, ) @@ -117,7 +122,10 @@ class BrightcoveIE(InfoExtractor): object_str = re.sub(r'(]*)(xmlns=".*?")', r'\1', object_str) object_str = fix_xml_ampersands(object_str) - object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) + try: + object_doc = compat_etree_fromstring(object_str.encode('utf-8')) + except compat_xml_parse_error: + return fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') if fv_el is not None: @@ -153,6 +161,28 @@ class BrightcoveIE(InfoExtractor): linkBase = find_param('linkBaseURL') if linkBase is not None: params['linkBaseURL'] = linkBase + return cls._make_brightcove_url(params) + + @classmethod + def _build_brighcove_url_from_js(cls, object_js): + # The layout of JS is as follows: + # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { + # // build Brightcove XML + # } + m = re.search( + r'''(?x)customBC.\createVideo\( + .*? # skipping width and height + ["\'](?P\d+)["\']\s*,\s* # playerID + ["\'](?PAQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters + # in length, however it's appended to itself + # in places, so truncate + ["\'](?P\d+)["\'] # @videoPlayer + ''', object_js) + if m: + return cls._make_brightcove_url(m.groupdict()) + + @classmethod + def _make_brightcove_url(cls, params): data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data @@ -169,7 +199,7 @@ class BrightcoveIE(InfoExtractor): """Return a list of all Brightcove URLs from the webpage """ url_m = re.search( - r']+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] | [^>]*?>\s*''', + ).+?>\s*''', webpage) - return [cls._build_brighcove_url(m) for m in matches] + if matches: + return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + + return list(filter(None, [ + cls._build_brighcove_url_from_js(custom_bc) + for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)])) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -315,3 +350,94 @@ class BrightcoveIE(InfoExtractor): if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info + + +class BrightcoveInPageEmbedIE(InfoExtractor): + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P\d+)' + _TEST = { + 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'md5': 'c8100925723840d4b0d243f7025703be', + 'info_dict': { + 'id': '4463358922001', + 'ext': 'mp4', + 'title': 'Meet the man behind Popcorn Time', + 'description': 'md5:eac376a4fe366edc70279bfb681aea16', + 'timestamp': 1441391203, + 'upload_date': '20150904', + 'duration': 165768, + 'uploader_id': '929656772001', + } + } + + @staticmethod + def _extract_url(webpage): + video_attributes = re.search(r'(?s)]*)>.*?', webpage) + if video_attributes: + video_attributes = extract_attributes(video_attributes.group(), r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']') + account_id = video_attributes.get('account') + player_id = video_attributes.get('player') + embed = video_attributes.get('embed') + video_id = video_attributes.get('video-id') + if account_id and player_id and embed and video_id: + return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id) + return None + + def _real_extract(self, url): + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) + + catalog = self._parse_json( + js_to_json( + self._search_regex( + r'catalog\(({[^}]+})\);', + webpage, + 'catalog' + ) + ), + video_id + ) + policy_key = catalog['policyKey'] + + req = compat_urllib_request.Request( + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id), + headers={'Accept': 'application/json;pk=%s' % policy_key}) + json_data = self._download_json(req, video_id) + + title = json_data['name'] + description = json_data.get('description') + thumbnail = json_data.get('thumbnail') + timestamp = parse_iso8601(json_data.get('published_at')) + duration = int_or_none(json_data.get('duration')) + + formats = [] + for source in json_data.get('sources'): + source_type = source.get('type') + if source_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) + else: + src = source.get('src') or source.get('streaming_src') + if src: + formats.append({ + 'url': src, + 'tbr': source.get('avg_bitrate'), + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'filesize': source.get('size'), + 'container': source.get('container'), + 'vcodec': source.get('codec'), + 'ext': source.get('container').lower(), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'uploader_id': account_id, + }