[twitch] Add support for streams (Closes #893, closes #3693, closes #1884)
[youtube-dl] / youtube_dl / extractor / twitch.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import itertools
5 import re
6
7 from .common import InfoExtractor
8 from ..compat import (
9     compat_str,
10     compat_urllib_parse,
11     compat_urllib_request,
12 )
13 from ..utils import (
14     ExtractorError,
15     parse_iso8601,
16 )
17
18
class TwitchBaseIE(InfoExtractor):
    """Common plumbing for the twitch.tv extractors.

    Holds the endpoint constants, turns API error payloads into
    ExtractorError and performs the optional account login.
    """

    _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'

    _API_BASE = 'https://api.twitch.tv'
    _USHER_BASE = 'http://usher.twitch.tv'
    _LOGIN_URL = 'https://secure.twitch.tv/user/login'

    def _handle_error(self, response):
        """Raise an expected ExtractorError if *response* reports an error.

        Only dict responses are inspected; a dict with a truthy 'error'
        entry triggers the exception, everything else is ignored.
        """
        error = response.get('error') if isinstance(response, dict) else None
        if not error:
            return
        raise ExtractorError(
            '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')),
            expected=True)

    def _download_json(self, url, video_id, note='Downloading JSON metadata'):
        """Download JSON through the parent class and check it for API errors."""
        payload = super(TwitchBaseIE, self)._download_json(url, video_id, note)
        self._handle_error(payload)
        return payload

    def _real_initialize(self):
        """Log in (when credentials are available) before any extraction."""
        self._login()

    def _login(self):
        """Submit the login form when a username is configured.

        Returns silently when no username is available; raises an expected
        ExtractorError when the resulting page contains a login error message.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        form_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')

        # The form carries a hidden token that has to be posted back.
        token = self._search_regex(
            r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
            form_page, 'authenticity token')

        form_fields = {
            'utf8': '✓'.encode('utf-8'),
            'authenticity_token': token,
            'redirect_on_login': '',
            'embed_form': 'false',
            'mp_source_action': '',
            'follow': '',
            'user[login]': username,
            'user[password]': password,
        }
        post_data = compat_urllib_parse.urlencode(form_fields).encode('utf-8')

        login_request = compat_urllib_request.Request(self._LOGIN_URL, post_data)
        login_request.add_header('Referer', self._LOGIN_URL)
        result_page = self._download_webpage(
            login_request, None, 'Logging in as %s' % username)

        failure = re.search(
            r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", result_page)
        if not failure:
            return
        raise ExtractorError(
            'Unable to login: %s' % failure.group('msg').strip(), expected=True)
77
78
class TwitchItemBaseIE(TwitchBaseIE):
    """Base class for extractors that handle one item addressed by id.

    Subclasses provide ``_ITEM_TYPE`` (only used in progress messages) and
    ``_ITEM_SHORTCUT`` (prefix that is prepended to the id in API paths).
    """

    def _download_info(self, item, item_id):
        """Download the kraken metadata of an item and return it as an info dict.

        *item* is the prefix and *item_id* the id taken from the URL; both
        are concatenated to form the last path component of the request.
        """
        return self._extract_info(self._download_json(
            '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
            'Downloading %s info JSON' % self._ITEM_TYPE))

    def _extract_media(self, item_id):
        """Build a playlist with one entry per downloadable part of the item.

        The playlist JSON is expected to contain a 'chunks' mapping from a
        quality name to a list of fragments, each fragment having a 'url'.
        Every position across those lists becomes one playlist entry whose
        formats are the fragments of the individual qualities.
        """
        info = self._download_info(self._ITEM_SHORTCUT, item_id)
        response = self._download_json(
            '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id,
            'Downloading %s playlist JSON' % self._ITEM_TYPE)
        entries = []
        chunks = response['chunks']
        qualities = list(chunks.keys())
        # keys() and values() of an unmodified dict iterate in the same order,
        # so position i in every zipped tuple belongs to qualities[i].
        # zip() stops at the shortest list, i.e. only parts available in
        # every quality are emitted.
        for num, fragment in enumerate(zip(*chunks.values()), start=1):
            formats = []
            for format_id, fragment_fmt in zip(qualities, fragment):
                fmt = {
                    'url': fragment_fmt['url'],
                    'format_id': format_id,
                    # Rank the 'live' quality above all the others.
                    'quality': 1 if format_id == 'live' else 0,
                }
                # Quality names such as '720p' carry the frame height.
                m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
                if m:
                    fmt['height'] = int(m.group('height'))
                formats.append(fmt)
            self._sort_formats(formats)
            # Each part shares the item metadata but gets its own id and title.
            entry = dict(info)
            entry['id'] = '%s_%d' % (entry['id'], num)
            entry['title'] = '%s part %d' % (entry['title'], num)
            entry['formats'] = formats
            entries.append(entry)
        return self.playlist_result(entries, info['id'], info['title'])

    def _extract_info(self, info):
        """Convert a kraken video object into an info dict.

        '_id' and 'title' are mandatory (a missing key raises KeyError).
        All remaining fields are optional metadata: they are looked up
        tolerantly and come out as None when the API omits them, so that an
        incomplete response does not abort the whole extraction.
        """
        channel = info.get('channel') or {}
        return {
            'id': info['_id'],
            'title': info['title'],
            'description': info.get('description'),
            'duration': info.get('length'),
            'thumbnail': info.get('preview'),
            'uploader': channel.get('display_name'),
            'uploader_id': channel.get('name'),
            'timestamp': parse_iso8601(info.get('recorded_at')),
            'view_count': info.get('views'),
        }

    def _real_extract(self, url):
        """Extract the playlist for the item id found in *url*."""
        return self._extract_media(self._match_id(url))
129
130
class TwitchVideoIE(TwitchItemBaseIE):
    # Handles URLs of the form <base>/<channel>/b/<id>; the id is sent to the
    # API with the 'a' prefix.
    IE_NAME = 'twitch:video'
    _ITEM_TYPE = 'video'
    _ITEM_SHORTCUT = 'a'
    _VALID_URL = TwitchBaseIE._VALID_URL_BASE + r'/[^/]+/b/(?P<id>[^/]+)'

    _TEST = {
        'url': 'http://www.twitch.tv/riotgames/b/577357806',
        'playlist_mincount': 12,
        'info_dict': {
            'id': 'a577357806',
            'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
        },
    }
145
146
class TwitchChapterIE(TwitchItemBaseIE):
    # Handles URLs of the form <base>/<channel>/c/<id>; the id is sent to the
    # API with the 'c' prefix.
    IE_NAME = 'twitch:chapter'
    _ITEM_TYPE = 'chapter'
    _ITEM_SHORTCUT = 'c'
    _VALID_URL = TwitchBaseIE._VALID_URL_BASE + r'/[^/]+/c/(?P<id>[^/]+)'

    _TEST = {
        'url': 'http://www.twitch.tv/acracingleague/c/5285812',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'c5285812',
            'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
        },
    }
161
162
class TwitchVodIE(TwitchItemBaseIE):
    """Extractor for URLs of the form <base>/<channel>/v/<id>.

    Unlike the chunk based item extractors this one fetches an access token
    and reads the formats from an m3u8 playlist served by the usher host.
    """

    IE_NAME = 'twitch:vod'
    _VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
    _ITEM_TYPE = 'vod'
    _ITEM_SHORTCUT = 'v'

    _TEST = {
        'url': 'http://www.twitch.tv/ksptv/v/3622000',
        'info_dict': {
            'id': 'v3622000',
            'ext': 'mp4',
            'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 6951,
            'timestamp': 1419028564,
            'upload_date': '20141219',
            'uploader': 'KSPTV',
            'uploader_id': 'ksptv',
            'view_count': int,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Return the info dict of a VOD including its m3u8 formats.

        Raises KeyError when the access token response lacks 'token' or
        'sig'.
        """
        item_id = self._match_id(url)
        info = self._download_info(self._ITEM_SHORTCUT, item_id)
        access_token = self._download_json(
            '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
            'Downloading %s access token' % self._ITEM_TYPE)
        # Percent-encode the credentials instead of pasting them into the URL
        # verbatim: the token is server supplied and may contain characters
        # that are not valid inside a query string (presumably a JSON
        # document - confirm against the API).
        query = compat_urllib_parse.urlencode({
            'nauth': access_token['token'],
            'nauthsig': access_token['sig'],
        })
        formats = self._extract_m3u8_formats(
            '%s/vod/%s?%s' % (self._USHER_BASE, item_id, query),
            item_id, 'mp4')
        info['formats'] = formats
        return info
201
202
class TwitchPlaylistBaseIE(TwitchBaseIE):
    """Base class for extractors that list the videos of a channel.

    Subclasses provide ``_PLAYLIST_TYPE`` (only used in progress messages)
    and may override ``_PLAYLIST_URL``.
    """

    _PAGE_LIMIT = 100
    # Filled in with (channel id, offset, limit) for every page request.
    _PLAYLIST_URL = TwitchBaseIE._API_BASE + '/kraken/channels/%s/videos/?offset=%d&limit=%d'

    def _extract_playlist(self, channel_id):
        """Collect the video URLs of a channel page by page.

        Pages are requested with a growing offset until one comes back with
        an empty 'videos' list. The playlist is titled with the channel's
        display name, falling back to its plain name.
        """
        channel = self._download_json(
            '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
            channel_id, 'Downloading channel info JSON')
        channel_name = channel.get('display_name') or channel.get('name')

        page_size = self._PAGE_LIMIT
        entries = []
        for page_num in itertools.count(1):
            page_url = self._PLAYLIST_URL % (
                channel_id, (page_num - 1) * page_size, page_size)
            page = self._download_json(
                page_url, channel_id,
                'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, page_num))
            videos = page['videos']
            if not videos:
                break
            for video in videos:
                entries.append(self.url_result(video['url']))
        return self.playlist_result(entries, channel_id, channel_name)

    def _real_extract(self, url):
        """Extract the playlist for the channel id found in *url*."""
        channel_id = self._match_id(url)
        return self._extract_playlist(channel_id)
228
229
class TwitchProfileIE(TwitchPlaylistBaseIE):
    # Handles <base>/<channel>/profile (optional trailing slash and fragment)
    # and lists the channel's videos through the inherited playlist URL.
    IE_NAME = 'twitch:profile'
    _PLAYLIST_TYPE = 'profile'
    _VALID_URL = TwitchBaseIE._VALID_URL_BASE + r'/(?P<id>[^/]+)/profile/?(?:\#.*)?$'

    _TEST = {
        'url': 'http://www.twitch.tv/vanillatv/profile',
        'playlist_mincount': 412,
        'info_dict': {
            'id': 'vanillatv',
            'title': 'VanillaTV',
        },
    }
243
244
class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
    # Handles <base>/<channel>/profile/past_broadcasts (optional trailing
    # slash and fragment); the inherited playlist URL is extended with
    # '&broadcasts=true'.
    IE_NAME = 'twitch:past_broadcasts'
    _PLAYLIST_TYPE = 'past broadcasts'
    _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true'
    _VALID_URL = (
        TwitchBaseIE._VALID_URL_BASE
        + r'/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$')

    _TEST = {
        'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts',
        'playlist_mincount': 54,
        'info_dict': {
            'id': 'spamfish',
            'title': 'Spamfish',
        },
    }
259
260
class TwitchStreamIE(TwitchBaseIE):
    """Extractor for channel pages (<base>/<channel>).

    Returns the live stream when the channel is broadcasting and delegates
    to the profile extractor otherwise.
    """

    IE_NAME = 'twitch:stream'
    _VALID_URL = r'%s/(?P<id>[^/]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE

    # Picks the pixel size out of names ending in '<width>x<height>.jpg'.
    _THUMBNAIL_SIZE_RE = r'(?P<width>\d+)x(?P<height>\d+)\.jpg$'

    _TEST = {
        'url': 'http://www.twitch.tv/shroomztv',
        'info_dict': {
            'id': '12772022048',
            'display_id': 'shroomztv',
            'ext': 'mp4',
            'title': 're:^ShroomzTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': 'H1Z1 - lonewolfing with ShroomzTV | A3 Battle Royale later - @ShroomzTV',
            'is_live': True,
            'timestamp': 1421928037,
            'upload_date': '20150122',
            'uploader': 'ShroomzTV',
            'uploader_id': 'shroomztv',
            'view_count': int,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _extract_thumbnails(self, preview):
        """Turn the stream's 'preview' mapping into a list of thumbnail dicts.

        Only entries whose size can be determined are returned. The size is
        looked for in the thumbnail URL first, since the pattern ends in
        '.jpg'; the mapping key is tried afterwards so a response that puts
        the size into the key keeps working. A missing or non-dict *preview*
        yields an empty list.
        """
        thumbnails = []
        if not isinstance(preview, dict):
            return thumbnails
        for thumbnail_key, thumbnail_url in preview.items():
            if not thumbnail_url:
                continue
            m = (re.search(self._THUMBNAIL_SIZE_RE, thumbnail_url)
                 or re.search(self._THUMBNAIL_SIZE_RE, thumbnail_key))
            if not m:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int(m.group('width')),
                'height': int(m.group('height')),
            })
        return thumbnails

    def _real_extract(self, url):
        """Return the info dict of the live stream of a channel.

        When the API reports no running stream, a url_result pointing at the
        channel's profile page is returned instead. Raises KeyError when the
        access token response lacks 'sig' or 'token', or when the stream
        object lacks 'channel' or '_id'.
        """
        channel_id = self._match_id(url)

        stream = self._download_json(
            '%s/kraken/streams/%s' % (self._API_BASE, channel_id), channel_id,
            'Downloading stream JSON').get('stream')

        # Fallback on profile extraction if stream is offline
        if not stream:
            return self.url_result(
                'http://www.twitch.tv/%s/profile' % channel_id,
                'TwitchProfile', channel_id)

        access_token = self._download_json(
            '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id,
            'Downloading channel access token')

        query = {
            'allow_source': 'true',
            'p': '9386337',
            'player': 'twitchweb',
            'segment_preference': '4',
            'sig': access_token['sig'],
            'token': access_token['token'],
        }

        # The urlencode() result is interpolated as is. Encoding it to bytes
        # first would make Python 3 format it as "b'...'" and corrupt the URL.
        formats = self._extract_m3u8_formats(
            '%s/api/channel/hls/%s.m3u8?%s'
            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)),
            channel_id, 'mp4')

        view_count = stream.get('viewers')
        timestamp = parse_iso8601(stream.get('created_at'))

        channel = stream['channel']
        title = self._live_title(channel.get('display_name') or channel.get('name'))
        description = channel.get('status')

        thumbnails = self._extract_thumbnails(stream.get('preview'))

        return {
            'id': compat_str(stream['_id']),
            'display_id': channel_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'uploader': channel.get('display_name'),
            'uploader_id': channel.get('name'),
            'timestamp': timestamp,
            'view_count': view_count,
            'formats': formats,
            'is_live': True,
        }