[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / medialaan.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .gigya import GigyaBaseIE
6
7 from ..compat import compat_str
8 from ..utils import (
9     int_or_none,
10     parse_duration,
11     try_get,
12     unified_timestamp,
13 )
14
15
class MedialaanIE(GigyaBaseIE):
    """Extractor for vtm.be, q2.be and vtmkzoom.be pages.

    A page is treated as a "vod" item when a vodId can be found in it and as
    a "clip" otherwise.  Clips are extracted straight from the page; vod
    items require a Gigya login before the stream is requested from
    vod.medialaan.io.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.|nieuws\.)?
                        (?:
                            (?P<site_id>vtm|q2|vtmkzoom)\.be/
                            (?:
                                video(?:/[^/]+/id/|/?\?.*?\baid=)|
                                (?:[^/]+/)*
                            )
                        )
                        (?P<id>[^/?#&]+)
                    '''
    _NETRC_MACHINE = 'medialaan'
    _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-'
    # Fallback app_id per site, used when the page itself does not expose one
    _SITE_TO_APP_ID = {
        'vtm': 'vtm_watch',
        'q2': 'q2',
        'vtmkzoom': 'vtmkzoom',
    }
    _TESTS = [{
        # vod
        'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch',
        'info_dict': {
            'id': 'vtm_20170219_VM0678361_vtmwatch',
            'ext': 'mp4',
            'title': 'Allemaal Chris afl. 6',
            'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2',
            'timestamp': 1487533280,
            'upload_date': '20170219',
            'duration': 2562,
            'series': 'Allemaal Chris',
            'season': 'Allemaal Chris',
            'season_number': 1,
            'season_id': '256936078124527',
            'episode': 'Allemaal Chris afl. 6',
            'episode_number': 6,
            'episode_id': '256936078591527',
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires account credentials',
    }, {
        # clip
        'url': 'http://vtm.be/video?aid=168332',
        'info_dict': {
            'id': '168332',
            'ext': 'mp4',
            'title': '"Veronique liegt!"',
            'description': 'md5:1385e2b743923afe54ba4adc38476155',
            'timestamp': 1489002029,
            'upload_date': '20170308',
            'duration': 96,
        },
    }, {
        # vod
        'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000',
        'only_matching': True,
    }, {
        # vod
        'url': 'http://vtm.be/video?aid=163157',
        'only_matching': True,
    }, {
        # vod
        'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2',
        'only_matching': True,
    }, {
        # clip
        'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio',
        'only_matching': True,
    }, {
        # http/s redirect
        'url': 'https://vtmkzoom.be/video?aid=45724',
        'info_dict': {
            'id': '257136373657000',
            'ext': 'mp4',
            'title': 'K3 Dansstudio Ushuaia afl.6',
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires account credentials',
    }, {
        # nieuws.vtm.be
        'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma',
        'only_matching': True,
    }]

    def _real_initialize(self):
        # Login is deferred: it is only performed once a vod item is requested
        self._logged_in = False

    def _login(self):
        """Log in through Gigya and keep the signed user identity.

        The UID, UIDSignature and signatureTimestamp fields of the login
        response are stored on the instance for later vod API requests.
        When no username is configured, raise_login_required() is called.
        """
        username, password = self._get_login_info()
        if username is None:
            self.raise_login_required()

        auth_data = {
            'APIKey': self._APIKEY,
            'sdk': 'js_6.1',
            'format': 'json',
            'loginID': username,
            'password': password,
        }

        auth_info = self._gigya_login(auth_data)

        self._uid = auth_info['UID']
        self._uid_signature = auth_info['UIDSignature']
        self._signature_timestamp = auth_info['signatureTimestamp']

        self._logged_in = True

    def _real_extract(self, url):
        """Download the page and dispatch to the clip or the vod extraction."""
        mobj = re.match(self._VALID_URL, url)
        video_id, site_id = mobj.group('id', 'site_id')

        webpage = self._download_webpage(url, video_id)

        # The config is embedded as a single-quoted JS string handed to
        # JSON.parse, so the JS-level backslash escaping has to be undone
        # before the payload is valid JSON.
        config = self._parse_json(
            self._search_regex(
                r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);',
                webpage, 'config', default='{}'), video_id,
            transform_source=lambda s: s.replace(
                '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'"))

        # Fall back to scanning the raw page: backslash-escaped JSON,
        # plain JSON, then an element id of the form vod-<digits>.
        vod_id = config.get('vodId') or self._search_regex(
            (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"',
             r'"vodId"\s*:\s*"(.+?)"',
             r'<[^>]+id=["\']vod-(\d+)'),
            webpage, 'video_id', default=None)

        if vod_id:
            # vod, authentication required
            return self._extract_vod_info(
                webpage, video_id, vod_id, site_id, config)
        # clip, no authentication required
        return self._extract_clip_info(url, webpage, video_id)

    def _fill_missing_description(self, info, webpage):
        """Set info['description'] from the page markup if it is not set yet.

        Returns the same (mutated) info dict.
        """
        if not info.get('description'):
            info['description'] = self._html_search_regex(
                r'<div[^>]+class="field-item\s+even">\s*<p>(.+?)</p>',
                webpage, 'description', default=None)
        return info

    def _extract_clip_info(self, url, webpage, video_id):
        """Build the info dict for a clip page (no login involved).

        Uses the arguments of the page's vmmaplayer(...) call when present
        and falls back to the HTML5 media entries of the page otherwise.
        Returns either an info dict or a url_result pointing back at this
        extractor.
        """
        # Wrapped in [...] so the captured argument text parses as a JSON
        # list (presumably the call may carry several comma-separated
        # objects; the last one is the one used below).
        player = self._parse_json(
            self._search_regex(
                r'vmmaplayer\(({.+?})\);', webpage, 'vmma player',
                default=''),
            video_id, transform_source=lambda s: '[%s]' % s, fatal=False)

        if player:
            video = player[-1]
            # NOTE(review): a videoUrl consisting of just a scheme appears
            # to mark a redirect (see the "http/s redirect" test); the real
            # page is then taken from video['url'].  Returned as is, without
            # the description fallback.
            if video['videoUrl'] in ('http', 'https'):
                return self.url_result(video['url'], MedialaanIE.ie_key())
            info = {
                'id': video_id,
                'url': video['videoUrl'],
                'title': video['title'],
                'thumbnail': video.get('imageUrl'),
                'timestamp': int_or_none(video.get('createdDate')),
                'duration': int_or_none(video.get('duration')),
            }
        else:
            info = self._parse_html5_media_entries(
                url, webpage, video_id, m3u8_id='hls')[0]
            info.update({
                'id': video_id,
                'title': self._html_search_meta('description', webpage),
                'duration': parse_duration(self._html_search_meta('duration', webpage)),
            })

        return self._fill_missing_description(info, webpage)

    def _extract_vod_info(self, webpage, video_id, vod_id, site_id, config):
        """Build the info dict for a vod item, logging in first if needed.

        webpage  -- downloaded page content
        video_id -- id taken from the URL (used for logging/requests)
        vod_id   -- vod identifier found in the page; becomes the result id
        site_id  -- site part of the URL, used to pick a default app_id
        config   -- parsed videoJSConfig dict (may be empty)
        """
        if not self._logged_in:
            self._login()

        settings = self._parse_json(
            self._search_regex(
                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
                webpage, 'drupal settings', default='{}'),
            video_id)

        def page_setting(container, item):
            # Prefer the parsed Drupal settings, otherwise look for a
            # "<item>": "<value>" pair anywhere in the page
            return try_get(
                settings, lambda x: x[container][item],
                compat_str) or self._search_regex(
                r'"%s"\s*:\s*"([^"]+)' % item, webpage, item,
                default=None)

        app_id = page_setting('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch')
        sso = page_setting('vod', 'gigyaDatabase') or 'vtm-sso'

        data = self._download_json(
            'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id,
            video_id, query={
                'app_id': app_id,
                'user_network': sso,
                'UID': self._uid,
                'UIDSignature': self._uid_signature,
                'signatureTimestamp': self._signature_timestamp,
            })

        formats = self._extract_m3u8_formats(
            data['response']['uri'], video_id, entry_protocol='m3u8_native',
            ext='mp4', m3u8_id='hls')

        self._sort_formats(formats)

        info = {
            'id': vod_id,
            'formats': formats,
        }

        api_key = page_setting('vod', 'apiKey')
        channel = page_setting('medialaanGigya', 'channel')

        # Metadata is optional: the request is non-fatal and only made when
        # the page exposes an API key
        if api_key:
            videos = self._download_json(
                'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False,
                query={
                    'channels': channel,
                    'ids': vod_id,
                    'limit': 1,
                    'apikey': api_key,
                })
            if videos:
                video = try_get(
                    videos, lambda x: x['response']['videos'][0], dict)
                if video:
                    info.update(self._parse_vod_metadata(video))

        # Title fallbacks: parsed config, escaped JSON in the page, og:title
        if not info.get('title'):
            info['title'] = try_get(
                config, lambda x: x['videoConfig']['title'],
                compat_str) or self._html_search_regex(
                r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title',
                default=None) or self._og_search_title(webpage)

        return self._fill_missing_description(info, webpage)

    @staticmethod
    def _parse_vod_metadata(video):
        """Map one video object of the vod/v2/videos response to info fields.

        Missing or wrongly typed values end up as None.
        """
        def field(container, item, expected_type=None):
            return try_get(
                video, lambda x: x[container][item], expected_type)

        def string_field(container, item):
            return field(container, item, compat_str)

        return {
            'series': string_field('program', 'title'),
            'season': string_field('season', 'title'),
            'season_number': int_or_none(field('season', 'number')),
            'season_id': string_field('season', 'id'),
            'episode': string_field('episode', 'title'),
            'episode_number': int_or_none(field('episode', 'number')),
            'episode_id': string_field('episode', 'id'),
            # durationMillis is scaled down by 1000 to match 'duration'
            'duration': int_or_none(
                video.get('duration')) or int_or_none(
                video.get('durationMillis'), scale=1000),
            'title': string_field('episode', 'title'),
            'description': string_field('episode', 'text'),
            'timestamp': unified_timestamp(string_field(
                'publication', 'begin')),
        }