git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/twitch.py
Merge remote-tracking branch 'yan12125/download-dash-segments' (#5886)
[youtube-dl] / youtube_dl / extractor / twitch.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import itertools
5 import re
6 import random
7
8 from .common import InfoExtractor
9 from ..compat import (
10     compat_str,
11     compat_urllib_parse,
12     compat_urllib_request,
13 )
14 from ..utils import (
15     ExtractorError,
16     parse_iso8601,
17 )
18
19
class TwitchBaseIE(InfoExtractor):
    """Common base for the Twitch extractors: API requests, error checks and login."""

    _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'

    _API_BASE = 'https://api.twitch.tv'
    _USHER_BASE = 'http://usher.twitch.tv'
    _LOGIN_URL = 'https://secure.twitch.tv/login'
    _LOGIN_POST_URL = 'https://passport.twitch.tv/authorize'
    _NETRC_MACHINE = 'twitch'

    def _handle_error(self, response):
        """Raise an expected ExtractorError if a dict response has a truthy 'error'.

        Non-dict responses and dicts without an error are silently accepted.
        """
        if not isinstance(response, dict):
            return
        error = response.get('error')
        if not error:
            return
        details = (self.IE_NAME, error, response.get('message'))
        raise ExtractorError(
            '%s returned error: %s - %s' % details,
            expected=True)

    def _download_json(self, url, video_id, note='Downloading JSON metadata'):
        """Fetch JSON from `url` with Twitch-specific headers and check it for errors.

        Returns the decoded response after it has passed `_handle_error`.
        """
        api_headers = {
            'Referer': 'http://api.twitch.tv/crossdomain/receiver.html?v=2',
            'X-Requested-With': 'XMLHttpRequest',
        }
        # Forward the api_token cookie (if any) as a header. There is no early
        # exit, so when several such cookies exist the last one wins.
        for jar_cookie in self._downloader.cookiejar:
            if jar_cookie.name != 'api_token':
                continue
            api_headers['Twitch-Api-Token'] = jar_cookie.value
        json_request = compat_urllib_request.Request(url, headers=api_headers)
        payload = super(TwitchBaseIE, self)._download_json(json_request, video_id, note)
        self._handle_error(payload)
        return payload

    def _real_initialize(self):
        """Attempt to log in when the extractor is initialized."""
        self._login()

    def _login(self):
        """Log in with the configured credentials; do nothing when no username is set.

        Raises an expected ExtractorError when the response page carries a
        login error notice, and warns when a password reset is requested.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')

        # Start from the hidden inputs of the login page and add the credentials.
        form_fields = self._hidden_inputs(login_page)
        form_fields['login'] = username.encode('utf-8')
        form_fields['password'] = password.encode('utf-8')

        post_data = compat_urllib_parse.urlencode(form_fields).encode('utf-8')
        login_request = compat_urllib_request.Request(self._LOGIN_POST_URL, post_data)
        login_request.add_header('Referer', self._LOGIN_URL)
        result_page = self._download_webpage(
            login_request, None, 'Logging in as %s' % username)

        error_message = self._search_regex(
            r'<div[^>]+class="subwindow_notice"[^>]*>([^<]+)</div>',
            result_page, 'error message', default=None)
        if error_message:
            raise ExtractorError(
                'Unable to login. Twitch said: %s' % error_message, expected=True)

        if '>Reset your password<' in result_page:
            self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit')

    def _prefer_source(self, formats):
        """Give the first 'Source' format (if any) preference 10, then sort `formats`."""
        source = next((f for f in formats if f['format_id'] == 'Source'), None)
        if source is not None:
            source['preference'] = 10
        self._sort_formats(formats)
92
93
class TwitchItemBaseIE(TwitchBaseIE):
    """Base for single-item extractors (videos, chapters, VODs).

    Subclasses are expected to define `_ITEM_TYPE` (label used in progress
    notes) and `_ITEM_SHORTCUT` (prefix prepended to the item id in API paths).
    """

    def _download_info(self, item, item_id):
        """Download the kraken metadata for an item and normalize it via `_extract_info`."""
        return self._extract_info(self._download_json(
            '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
            'Downloading %s info JSON' % self._ITEM_TYPE))

    def _extract_media(self, item_id):
        """Build a playlist with one entry per chunk of the item.

        Every entry copies the item's metadata and lists that chunk in all
        available qualities as its formats.
        """
        info = self._download_info(self._ITEM_SHORTCUT, item_id)
        response = self._download_json(
            '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id,
            'Downloading %s playlist JSON' % self._ITEM_TYPE)
        entries = []
        chunks = response['chunks']
        # keys() and values() of an unmodified dict iterate in the same order,
        # so position N of every zipped fragment belongs to qualities[N].
        qualities = list(chunks.keys())
        for num, fragment in enumerate(zip(*chunks.values()), start=1):
            formats = []
            for fmt_num, fragment_fmt in enumerate(fragment):
                format_id = qualities[fmt_num]
                fmt = {
                    'url': fragment_fmt['url'],
                    'format_id': format_id,
                    'quality': 1 if format_id == 'live' else 0,
                }
                # Quality names such as "720p" carry the frame height.
                m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
                if m:
                    fmt['height'] = int(m.group('height'))
                formats.append(fmt)
            self._sort_formats(formats)
            entry = dict(info)
            entry['id'] = '%s_%d' % (entry['id'], num)
            entry['title'] = '%s part %d' % (entry['title'], num)
            entry['formats'] = formats
            entries.append(entry)
        return self.playlist_result(entries, info['id'], info['title'])

    def _extract_info(self, info):
        """Map a kraken video object onto an info dict.

        Only `_id` and `title` are treated as mandatory (a missing key raises
        KeyError). All other fields are optional metadata and come back as
        None when absent, so a missing field no longer aborts extraction.
        """
        channel = info.get('channel') or {}
        return {
            'id': info['_id'],
            'title': info['title'],
            'description': info.get('description'),
            'duration': info.get('length'),
            'thumbnail': info.get('preview'),
            'uploader': channel.get('display_name'),
            'uploader_id': channel.get('name'),
            # NOTE(review): relies on parse_iso8601 accepting None, as the
            # stream extractor in this file already does - confirm.
            'timestamp': parse_iso8601(info.get('recorded_at')),
            'view_count': info.get('views'),
        }

    def _real_extract(self, url):
        """Extract the item addressed by `url` as a playlist of chunks."""
        return self._extract_media(self._match_id(url))
144
145
class TwitchVideoIE(TwitchItemBaseIE):
    """Extractor for twitch.tv/<channel>/b/<id> URLs."""

    IE_NAME = 'twitch:video'
    # Read by TwitchItemBaseIE: the label used in progress notes and the
    # prefix prepended to the numeric id in API paths ('a', although the
    # page URL itself uses /b/).
    _ITEM_TYPE = 'video'
    _ITEM_SHORTCUT = 'a'
    _VALID_URL = r'%s/[^/]+/b/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE

    _TEST = {
        'url': 'http://www.twitch.tv/riotgames/b/577357806',
        'info_dict': {
            'id': 'a577357806',
            'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
        },
        'playlist_mincount': 12,
    }
160
161
class TwitchChapterIE(TwitchItemBaseIE):
    """Extractor for twitch.tv/<channel>/c/<id> URLs."""

    IE_NAME = 'twitch:chapter'
    # Read by TwitchItemBaseIE: the label used in progress notes and the
    # prefix prepended to the numeric id in API paths.
    _ITEM_TYPE = 'chapter'
    _ITEM_SHORTCUT = 'c'
    _VALID_URL = r'%s/[^/]+/c/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE

    _TESTS = [
        {
            'url': 'http://www.twitch.tv/acracingleague/c/5285812',
            'info_dict': {
                'id': 'c5285812',
                'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
            },
            'playlist_mincount': 3,
        },
        {
            'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',
            'only_matching': True,
        },
    ]
179
180
class TwitchVodIE(TwitchItemBaseIE):
    """Extractor for twitch.tv/<channel>/v/<id> URLs, delivered as an m3u8 playlist."""

    IE_NAME = 'twitch:vod'
    _VALID_URL = r'%s/[^/]+/v/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE
    _ITEM_TYPE = 'vod'
    _ITEM_SHORTCUT = 'v'

    _TEST = {
        'url': 'http://www.twitch.tv/riotgames/v/6528877',
        'info_dict': {
            'id': 'v6528877',
            'ext': 'mp4',
            'title': 'LCK Summer Split - Week 6 Day 1',
            # Raw string: same value as before, without the invalid '\.' escape.
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 17208,
            'timestamp': 1435131709,
            'upload_date': '20150624',
            'uploader': 'Riot Games',
            'uploader_id': 'riotgames',
            'view_count': int,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Return the VOD's info dict with formats read from the usher m3u8 playlist."""
        item_id = self._match_id(url)
        info = self._download_info(self._ITEM_SHORTCUT, item_id)
        access_token = self._download_json(
            '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
            'Downloading %s access token' % self._ITEM_TYPE)
        # Percent-encode the token and signature instead of pasting them raw
        # into the URL, mirroring how the live stream extractor in this file
        # builds its query string.
        query = {
            'allow_source': 'true',
            'nauth': access_token['token'].encode('utf-8'),
            'nauthsig': access_token['sig'].encode('utf-8'),
        }
        formats = self._extract_m3u8_formats(
            '%s/vod/%s?%s'
            % (self._USHER_BASE, item_id, compat_urllib_parse.urlencode(query)),
            item_id, 'mp4')
        self._prefer_source(formats)
        info['formats'] = formats
        return info
220
221
class TwitchPlaylistBaseIE(TwitchBaseIE):
    """Base for channel-level playlists that are fetched page by page.

    Subclasses are expected to define `_PLAYLIST_TYPE` (label used in progress
    notes). They may override `_PLAYLIST_URL` and `_extract_playlist_page`.
    """

    _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
    _PAGE_LIMIT = 100

    def _extract_playlist(self, channel_id):
        """Collect every video URL of a channel into one playlist result.

        Pages are requested until one yields no entries. Duplicate URLs are
        dropped while the order in which the API returned them is preserved.
        """
        info = self._download_json(
            '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
            channel_id, 'Downloading channel info JSON')
        channel_name = info.get('display_name') or info.get('name')
        entries = []
        offset = 0
        limit = self._PAGE_LIMIT
        for counter in itertools.count(1):
            response = self._download_json(
                self._PLAYLIST_URL % (channel_id, offset, limit),
                channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter))
            page_entries = self._extract_playlist_page(response)
            if not page_entries:
                break
            entries.extend(page_entries)
            offset += limit

        # De-duplicate without losing the API ordering: a plain set() would
        # return the playlist entries in arbitrary order.
        seen = set()
        unique_entries = []
        for entry in entries:
            if entry in seen:
                continue
            seen.add(entry)
            unique_entries.append(entry)

        return self.playlist_result(
            [self.url_result(entry) for entry in unique_entries],
            channel_id, channel_name)

    def _extract_playlist_page(self, response):
        """Return the video URLs listed in one page of the API response."""
        videos = response.get('videos')
        return [video['url'] for video in videos] if videos else []

    def _real_extract(self, url):
        """Extract the playlist of the channel addressed by `url`."""
        return self._extract_playlist(self._match_id(url))
253
254
class TwitchProfileIE(TwitchPlaylistBaseIE):
    """Playlist extractor for twitch.tv/<channel>/profile URLs."""

    IE_NAME = 'twitch:profile'
    # Label interpolated into the paging progress notes of the base class.
    _PLAYLIST_TYPE = 'profile'
    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE

    _TEST = {
        'url': 'http://www.twitch.tv/vanillatv/profile',
        'info_dict': {
            'id': 'vanillatv',
            'title': 'VanillaTV',
        },
        'playlist_mincount': 412,
    }
268
269
class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
    """Playlist extractor for twitch.tv/<channel>/profile/past_broadcasts URLs."""

    IE_NAME = 'twitch:past_broadcasts'
    # Label interpolated into the paging progress notes of the base class.
    _PLAYLIST_TYPE = 'past broadcasts'
    # Same endpoint as the base class, with the broadcasts flag appended.
    _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true'
    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE

    _TEST = {
        'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts',
        'info_dict': {
            'id': 'spamfish',
            'title': 'Spamfish',
        },
        'playlist_mincount': 54,
    }
284
285
class TwitchBookmarksIE(TwitchPlaylistBaseIE):
    """Playlist extractor for twitch.tv/<user>/profile/bookmarks URLs."""

    IE_NAME = 'twitch:bookmarks'
    # Label interpolated into the paging progress notes of the base class.
    _PLAYLIST_TYPE = 'bookmarks'
    _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE

    _TEST = {
        'url': 'http://www.twitch.tv/ognos/profile/bookmarks',
        'info_dict': {
            'id': 'ognos',
            'title': 'Ognos',
        },
        'playlist_mincount': 3,
    }

    def _extract_playlist_page(self, response):
        """Return the URLs of all bookmarks on this page that carry a video."""
        bookmarked_videos = (
            bookmark.get('video') for bookmark in response.get('bookmarks', []))
        # Bookmarks without a (truthy) video object are skipped.
        return [video['url'] for video in bookmarked_videos if video]
309
310
class TwitchStreamIE(TwitchBaseIE):
    """Extractor for live channel pages (twitch.tv/<channel>).

    Falls back to the channel's profile playlist when the channel is offline.
    """

    IE_NAME = 'twitch:stream'
    _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE

    _TESTS = [{
        'url': 'http://www.twitch.tv/shroomztv',
        'info_dict': {
            'id': '12772022048',
            'display_id': 'shroomztv',
            'ext': 'mp4',
            'title': 're:^ShroomzTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': 'H1Z1 - lonewolfing with ShroomzTV | A3 Battle Royale later - @ShroomzTV',
            'is_live': True,
            'timestamp': 1421928037,
            'upload_date': '20150122',
            'uploader': 'ShroomzTV',
            'uploader_id': 'shroomztv',
            'view_count': int,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.twitch.tv/miracle_doto#profile-0',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict of the live stream, or a profile URL result if offline."""
        channel_id = self._match_id(url)

        stream = self._download_json(
            '%s/kraken/streams/%s' % (self._API_BASE, channel_id), channel_id,
            'Downloading stream JSON').get('stream')

        # Fallback on profile extraction if stream is offline
        if not stream:
            return self.url_result(
                'http://www.twitch.tv/%s/profile' % channel_id,
                'TwitchProfile', channel_id)

        # The channel name may have been typed in a different case than the
        # original channel name (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON),
        # which would lead to constructing an invalid m3u8 URL. Work around
        # this by using the original channel name from the stream JSON, falling
        # back to the lowercased name if it is not available.
        channel_id = stream.get('channel', {}).get('name') or channel_id.lower()

        access_token = self._download_json(
            '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id,
            'Downloading channel access token')

        query = {
            'allow_source': 'true',
            'p': random.randint(1000000, 10000000),
            'player': 'twitchweb',
            'segment_preference': '4',
            'sig': access_token['sig'].encode('utf-8'),
            'token': access_token['token'].encode('utf-8'),
        }
        formats = self._extract_m3u8_formats(
            '%s/api/channel/hls/%s.m3u8?%s'
            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)),
            channel_id, 'mp4')
        self._prefer_source(formats)

        view_count = stream.get('viewers')
        timestamp = parse_iso8601(stream.get('created_at'))

        channel = stream['channel']
        title = self._live_title(channel.get('display_name') or channel.get('name'))
        description = channel.get('status')

        # Thumbnails are optional metadata: a missing or null 'preview' gives
        # an empty list instead of aborting the extraction.
        thumbnails = []
        size_re = r'(?P<width>\d+)x(?P<height>\d+)\.jpg$'
        for thumbnail_key, thumbnail_url in (stream.get('preview') or {}).items():
            m = re.search(size_re, thumbnail_key)
            # NOTE(review): the preview keys appear to be size names (e.g.
            # "small") with the WxH suffix carried by the URL instead - confirm
            # against the API. Falling back to the URL keeps the old behavior
            # whenever the key does match.
            if not m and isinstance(thumbnail_url, compat_str):
                m = re.search(size_re, thumbnail_url)
            if not m:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int(m.group('width')),
                'height': int(m.group('height')),
            })

        return {
            'id': compat_str(stream['_id']),
            'display_id': channel_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'uploader': channel.get('display_name'),
            'uploader_id': channel.get('name'),
            'timestamp': timestamp,
            'view_count': view_count,
            'formats': formats,
            'is_live': True,
        }
406         }