[viu] New extractor for viu.com
[youtube-dl] / youtube_dl / extractor / viu.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..utils import (
8     ExtractorError,
9     int_or_none,
10     clean_html,
11 )
12
13
class ViuBaseIE(InfoExtractor):
    """Common base for the Viu extractors; provides the auth-token lookup."""

    def _get_viu_auth(self, video_id):
        """Return the ``X-VIU-AUTH`` header sent back by the authenticate API.

        The endpoint is queried with the fixed guest/desktop parameters
        below; ``video_id`` is only forwarded to the request helper.
        """
        auth_query = {
            'acct': 'test',
            'appid': 'viu_desktop',
            'fmt': 'json',
            'iid': 'guest',
            'languageid': 'default',
            'platform': 'desktop',
            'userid': 'guest',
            'useridtype': 'guest',
            'ver': '1.0'
        }
        auth_response = self._request_webpage(
            'https://www.viu.com/api/apps/v2/authenticate', video_id,
            note='Requesting Viu auth', query=auth_query)
        # The token is carried in a response header, not in the body.
        response_headers = auth_response.info()
        return response_headers.get('X-VIU-AUTH')
26
27
class ViuIE(ViuBaseIE):
    """Extractor for single Viu episodes (``/vod/<id>`` and ``/media/<id>``).

    Two back ends are supported, selected by whether the video page
    references a ``config.js`` script:

    * no ``config.js`` (ID, IN, MY content): metadata comes from the
      authenticated ``clip/load`` JSON API and the stream is HLS;
    * ``config.js`` present (HK, SG content): the API endpoints are read
      from that script and the streams are direct URLs.
    """
    IE_NAME = 'viu:show'
    _VALID_URL = r'https?://www\.viu\.com/.+/(?:vod|media)/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.viu.com/ott/sg/en-us/vod/3421/The%20Prime%20Minister%20and%20I',
        'info_dict': {
            'id': '3421',
            'ext': 'mp4',
            'title': 'The Prime Minister and I - Episode 17',
            'description': 'md5:1e7486a619b6399b25ba6a41c0fe5b2c',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
        'skip': 'Geo-restricted to Singapore',
    }, {
        'url': 'http://www.viu.com/ott/hk/zh-hk/vod/7123/%E5%A4%A7%E4%BA%BA%E5%A5%B3%E5%AD%90',
        'info_dict': {
            'id': '7123',
            'ext': 'mp4',
            'title': '大人女子 - Episode 10',
            'description': 'md5:4eb0d8b08cf04fcdc6bbbeb16043434f',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
        'skip': 'Geo-restricted to Hong Kong',
    }, {
        'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059',
        'info_dict': {
            'id': '1116705532',
            'ext': 'mp4',
            'title': 'Citizen Khan - Episode 1',
            'description': 'md5:d7ea1604f49e5ba79c212c551ce2110e',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
        'skip': 'Geo-restricted to India',
    }, {
        'url': 'https://www.viu.com/en/media/1130599965',
        'info_dict': {
            'id': '1130599965',
            'ext': 'mp4',
            'title': 'Jealousy Incarnate - Episode 1',
            'description': 'md5:d3d82375cab969415d2720b6894361e9',
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
        'skip': 'Geo-restricted to Indonesia',
    }]

    @staticmethod
    def _format_title(series, episode_number):
        """Return ``'<series> - Episode <n>'``.

        When the episode number is unknown (``None``) only the series name
        is returned, so the title never ends in "Episode None".
        """
        if episode_number is None:
            return series
        return '%s - Episode %s' % (series, episode_number)

    def _real_extract(self, url):
        """Download the video page and dispatch to the matching back end.

        Raises ExtractorError (expected) with the page's own message when
        the page contains an ``error-title`` element.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(
            url, video_id, note='Downloading video page')

        error_mobj = re.search(
            r'<div class=["\']error-title[^<>]+?>(?P<err>.+?)</div>',
            webpage, flags=re.DOTALL)
        if error_mobj:
            raise ExtractorError(
                clean_html(error_mobj.group('err')), expected=True)

        config_js_url = self._search_regex(
            r'src=(["\'])(?P<api_url>.+?/js/config\.js)(?:\?.+?)?\1', webpage, 'config_js',
            group='api_url', default=None)

        if not config_js_url:
            # content is from ID, IN, MY
            return self._extract_from_clip_api(video_id)

        # content from HK, SG
        return self._extract_from_config_js(url, video_id, config_js_url)

    def _extract_from_clip_api(self, video_id):
        """Build the info dict from the authenticated ``clip/load`` API.

        Raises ExtractorError when the API answer carries no stream URL.
        """
        response = self._download_json(
            'https://www.viu.com/api/clip/load', video_id,
            headers={'X-VIU-AUTH': self._get_viu_auth(video_id)},
            query={'appid': 'viu_desktop', 'fmt': 'json', 'id': video_id},
            note='Downloading video info').get('response') or {}
        # ``item`` may be missing, null or an empty list; indexing it
        # blindly would raise IndexError/TypeError.
        video_info = (response.get('item') or [{}])[0]

        stream_url = video_info.get('href')
        if not stream_url:
            raise ExtractorError('Unable to find the stream URL in the clip info')

        formats = self._extract_m3u8_formats(
            stream_url, video_id, 'mp4',
            m3u8_id='hls', fatal=False)
        self._sort_formats(formats)

        # Subtitle URLs are stored as flat keys named
        # "subtitle_<lang>_<ext>" next to the other metadata.
        subtitles = {}
        for key, value in video_info.items():
            mobj = re.match(r'^subtitle_(?P<lang>[^_]+?)_(?P<ext>(vtt|srt))', key)
            if not mobj or not value:
                continue
            subtitles.setdefault(mobj.group('lang'), []).append(
                {'url': value, 'ext': mobj.group('ext')})

        series = video_info['moviealbumshowname']

        return {
            'id': video_id,
            'title': self._format_title(series, video_info.get('episodeno')),
            'description': video_info.get('description'),
            'series': series,
            'episode': video_info.get('title'),
            'episode_number': int_or_none(video_info.get('episodeno')),
            'duration': int_or_none(video_info.get('duration')),
            'formats': formats,
            'subtitles': subtitles,
        }

    def _extract_from_config_js(self, url, video_id, config_js_url):
        """Build the info dict using the API endpoints found in ``config.js``.

        Raises ExtractorError (expected) when the API returns no
        ``ccs_product_id`` for the video.
        """
        # The script is normally referenced by a site-relative path; cope
        # with absolute and protocol-relative references as well instead of
        # gluing the host in front of them.
        if re.match(r'(?:https?:)?//', config_js_url):
            config_js_url = self._proto_relative_url(config_js_url, scheme='http:')
        else:
            config_js_url = 'http://www.viu.com' + config_js_url

        config_js = self._download_webpage(
            config_js_url, video_id, note='Downloading config js')

        # try to strip away commented code which contains test urls
        config_js = re.sub(r'^//.*?$', '', config_js, flags=re.MULTILINE)
        config_js = re.sub(r'/\*.*?\*/', '', config_js, flags=re.DOTALL)

        # Slightly different api_url between HK and SG config.js
        # http://www.viu.com/ott/hk/v1/js/config.js =>  '//www.viu.com/ott/hk/index.php?r='
        # http://www.viu.com/ott/sg/v1/js/config.js => 'http://www.viu.com/ott/sg/index.php?r='
        api_url = self._proto_relative_url(
            self._search_regex(
                r'var\s+api_url\s*=\s*(["\'])(?P<api_url>(?:https?:)?//.+?\?r=)\1',
                config_js, 'api_url', group='api_url'), scheme='http:')

        stream_info_url = self._proto_relative_url(
            self._search_regex(
                r'var\s+video_url\s*=\s*(["\'])(?P<video_url>(?:https?:)?//.+?\?ccs_product_id=)\1',
                config_js, 'video_url', group='video_url'), scheme='http:')

        # Query the detail API over the same scheme as the page itself.
        if url.startswith('https://'):
            api_url = re.sub('^http://', 'https://', api_url)

        # ``or {}`` (rather than a .get() default) also covers JSON nulls.
        video_info = self._download_json(
            api_url + 'vod/ajax-detail&platform_flag_label=web&product_id=' + video_id,
            video_id, note='Downloading video info').get('data') or {}

        current_product = video_info.get('current_product') or {}
        ccs_product_id = current_product.get('ccs_product_id')

        if not ccs_product_id:
            raise ExtractorError('This video is not available in your region.', expected=True)

        stream_data = self._download_json(
            stream_info_url + ccs_product_id, video_id,
            note='Downloading stream info').get('data') or {}
        stream_info = stream_data.get('stream') or {}

        stream_sizes = stream_info.get('size') or {}
        formats = []
        for vid_format, stream_url in (stream_info.get('url') or {}).items():
            # Format ids are matched as "s<N>p"; <N> is treated as the
            # picture height (e.g. 720p), not as a bitrate.
            # NOTE(review): confirm against a live API response.
            # Unrecognised ids simply get no height instead of aborting.
            height = int_or_none(self._search_regex(
                r's(?P<height>[0-9]+)p', vid_format, 'height',
                group='height', default=None))
            formats.append({
                'format_id': vid_format,
                'url': stream_url,
                'height': height,
                'ext': 'mp4',
                'filesize': int_or_none(stream_sizes.get(vid_format)),
            })
        self._sort_formats(formats)

        subtitles = {}
        for sub in current_product.get('subtitle') or []:
            sub_url = sub.get('url')
            if not sub_url:
                continue
            subtitles.setdefault(sub.get('name'), []).append({
                'url': sub_url,
                'ext': 'srt',
            })

        series_info = video_info.get('series') or {}
        products = series_info.get('product') or []
        # Fall back to an empty dict when the episode is not listed;
        # a bare next() would leak StopIteration to the caller.
        episode_info = next(
            (p for p in products if p.get('product_id') == video_id), {})

        series = series_info.get('name')

        return {
            'id': video_id,
            'title': self._format_title(series, episode_info.get('number')),
            'description': episode_info.get('description'),
            'series': series,
            'episode': episode_info.get('synopsis'),
            'episode_number': int_or_none(episode_info.get('number')),
            'duration': int_or_none(stream_info.get('duration')),
            'thumbnail': episode_info.get('cover_image_url'),
            'formats': formats,
            'subtitles': subtitles,
        }
223
224
class ViuPlaylistIE(ViuBaseIE):
    """Extractor for Viu playlist pages (``/listing/playlist-<id>``).

    The playlist is loaded from the authenticated ``container/load`` API
    and every item is handed to the single-video Viu extractor.
    """
    IE_NAME = 'viu:playlist'
    _VALID_URL = r'https?://www\.viu\.com/.+/listing/(?P<id>playlist\-[0-9]+)'
    _TEST = {
        'url': 'https://www.viu.com/en/listing/playlist-22461380',
        'info_dict': {
            'id': 'playlist-22461380',
            'title': 'The Good Wife',
        },
        'playlist_count': 16,
        'skip': 'Geo-restricted to Indonesia',
    }

    def _real_extract(self, url):
        """Return a playlist result with one URL entry per playlist item.

        Raises ExtractorError when the API answer contains no container.
        """
        playlist_id = self._match_id(url)
        response = self._download_json(
            'https://www.viu.com/api/container/load', playlist_id,
            headers={'X-VIU-AUTH': self._get_viu_auth(playlist_id)},
            query={'appid': 'viu_desktop', 'fmt': 'json', 'id': playlist_id},
            note='Downloading playlist info').get('response') or {}

        playlist_info = response.get('container')
        if not playlist_info:
            # Without this guard a missing container surfaces as an
            # unhelpful TypeError on None.
            raise ExtractorError(
                'Unable to find playlist data for %s' % playlist_id)

        entries = []
        for item in playlist_info.get('item') or []:
            item_id = item.get('id')
            # Items without an id cannot be turned into a media URL.
            if not item_id:
                continue
            entries.append(self.url_result(
                'https://www.viu.com/en/media/%s' % item_id,
                'Viu', item_id))

        return self.playlist_result(
            entries, playlist_id, playlist_info.get('title'))
252
253         return self.playlist_result(entries, playlist_id, name)