[brightcove] Support videos that only provide flv versions (fixes #1504)
[youtube-dl] / youtube_dl / extractor / brightcove.py
1 # encoding: utf-8
2
3 import re
4 import json
5 import xml.etree.ElementTree
6
7 from .common import InfoExtractor
8 from ..utils import (
9     compat_urllib_parse,
10     find_xpath_attr,
11     compat_urlparse,
12
13     ExtractorError,
14 )
15
class BrightcoveIE(InfoExtractor):
    """Information extractor for Brightcove players.

    The query string of a matching url decides what is extracted: a
    ``@videoPlayer`` parameter selects a single video, otherwise the
    ``playerKey`` parameter is used to fetch a playlist.
    """
    _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
    _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s'

    _TESTS = [
        {
            u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/',
            u'file': u'2371591881001.mp4',
            u'md5': u'9e80619e0a94663f0bdc849b4566af19',
            u'note': u'Test Brightcove downloads and detection in GenericIE',
            u'info_dict': {
                u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
                u'uploader': u'8TV',
                u'description': u'md5:a950cc4285c43e44d763d036710cd9cd',
            }
        },
        {
            u'url': u'http://medianetwork.oracle.com/video/player/1785452137001',
            u'file': u'1785452137001.flv',
            u'info_dict': {
                u'title': u'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
                u'description': u'John Rose speaks at the JVM Language Summit, August 1, 2012.',
                u'uploader': u'Oracle',
            },
        },
    ]

    @classmethod
    def _build_brighcove_url(cls, object_str):
        """
        Build a Brightcove url from a xml string containing
        <object class="BrightcoveExperience">{params}</object>

        The object's ``id`` attribute and its ``playerID`` param are
        required; the ``playerKey`` and ``@videoPlayer`` params are copied
        into the url only when the object defines them.

        Raises ExtractorError if the element is not a BrightcoveExperience
        object or if one of the required values is missing.

        The method name keeps its original spelling so that code calling it
        by that name keeps working.
        """
        object_doc = xml.etree.ElementTree.fromstring(object_str)
        # Checked explicitly rather than with ``assert``: assert statements
        # are removed when Python runs with -O, which would skip the check.
        if u'BrightcoveExperience' not in object_doc.attrib.get('class', u''):
            raise ExtractorError(u'The object element is not a BrightcoveExperience')

        def param_value(name):
            # Value of the <param name="{name}"> child, None if there is none
            node = find_xpath_attr(object_doc, './param', 'name', name)
            if node is None:
                return None
            return node.attrib['value']

        flash_id = object_doc.attrib.get('id')
        if flash_id is None:
            raise ExtractorError(u'The Brightcove object has no id attribute')
        player_id = param_value('playerID')
        if player_id is None:
            raise ExtractorError(u'The Brightcove object has no playerID param')

        params = {
            'flashID': flash_id,
            'playerID': player_id,
        }
        # Not all pages define these values
        for optional_name in ('playerKey', '@videoPlayer'):
            optional_value = param_value(optional_name)
            if optional_value is not None:
                params[optional_name] = optional_value
        data = compat_urllib_parse.urlencode(params)
        return cls._FEDERATED_URL_TEMPLATE % data

    def _real_extract(self, url):
        """
        Extract a single video or a playlist from a Brightcove url.

        Returns the result of _get_video_info when the query contains
        ``@videoPlayer`` and the result of _get_playlist_info when it only
        contains ``playerKey``.

        Raises ExtractorError if the query contains neither parameter.
        """
        mobj = re.match(self._VALID_URL, url)
        query_str = mobj.group('query')
        query = compat_urlparse.parse_qs(query_str)

        videoPlayer = query.get('@videoPlayer')
        if videoPlayer:
            # The raw query string is forwarded untouched to the federated url
            return self._get_video_info(videoPlayer[0], query_str)

        player_key = query.get('playerKey')
        if player_key:
            return self._get_playlist_info(player_key[0])

        # Without this the missing key surfaced as a bare KeyError
        raise ExtractorError(
            u'Unable to find a @videoPlayer or playerKey parameter in the url')

    def _get_video_info(self, video_id, query):
        """
        Download the federated player page and extract its video.

        video_id is used to identify the download and the extraction,
        query is the raw query string appended to _FEDERATED_URL_TEMPLATE.
        Returns the dictionary built by _extract_video_info from the
        ``mediaDTO`` entry of the page's ``experienceJSON`` variable.
        """
        request_url = self._FEDERATED_URL_TEMPLATE % query
        webpage = self._download_webpage(request_url, video_id)

        self.report_extraction(video_id)
        experience_json = self._search_regex(
            r'var experienceJSON = ({.*?});', webpage, 'json')
        experience = json.loads(experience_json)['data']
        video_info = experience['programmedContent']['videoPlayer']['mediaDTO']

        return self._extract_video_info(video_info)

    def _get_playlist_info(self, player_key):
        """
        Download the playlist of the player identified by player_key.

        Every entry of the playlist's ``videoDTOs`` list is converted with
        _extract_video_info; the entries are passed to self.playlist_result
        together with the playlist id and its display name.
        """
        playlist_json = self._download_webpage(
            self._PLAYLIST_URL_TEMPLATE % player_key,
            player_key, u'Downloading playlist information')

        playlist_info = json.loads(playlist_json)['videoList']
        collection = playlist_info['mediaCollectionDTO']
        videos = [self._extract_video_info(video_info)
                  for video_info in collection['videoDTOs']]

        return self.playlist_result(videos, playlist_id=playlist_info['id'],
                                    playlist_title=collection['displayName'])

    def _extract_video_info(self, video_info):
        """
        Build the info dictionary for one Brightcove video description.

        video_info must contain ``id`` and ``displayName``; the description,
        thumbnail and uploader are optional and default to None.

        The url comes from the rendition with the biggest ``size`` (reported
        with the ``mp4`` extension); videos without renditions fall back to
        ``FLVFullLengthURL`` (reported with the ``flv`` extension).

        Raises ExtractorError if no video url is available.
        """
        info = {
            'id': video_info['id'],
            'title': video_info['displayName'],
            'description': video_info.get('shortDescription'),
            'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
            'uploader': video_info.get('publisherName'),
        }

        renditions = video_info.get('renditions')
        flv_url = video_info.get('FLVFullLengthURL')
        if renditions:
            # Sorted ascending by size, so the last entry is the biggest one
            best_format = sorted(renditions, key=lambda r: r['size'])[-1]
            info.update({
                'url': best_format['defaultURL'],
                'ext': 'mp4',
            })
        elif flv_url:
            # An empty url is treated like a missing one
            info.update({
                'url': flv_url,
                'ext': 'flv',
            })
        else:
            raise ExtractorError(u'Unable to extract video url for %s' % info['id'])
        return info