[twitch] Add basic support for two-factor authentication
[youtube-dl] / youtube_dl / extractor / twitch.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import itertools
5 import re
6 import random
7
8 from .common import InfoExtractor
9 from ..compat import (
10     compat_HTTPError,
11     compat_parse_qs,
12     compat_str,
13     compat_urllib_parse_urlencode,
14     compat_urllib_parse_urlparse,
15 )
16 from ..utils import (
17     clean_html,
18     ExtractorError,
19     int_or_none,
20     js_to_json,
21     orderedSet,
22     parse_duration,
23     parse_iso8601,
24     update_url_query,
25     urlencode_postdata,
26     urljoin,
27 )
28
29
class TwitchBaseIE(InfoExtractor):
    """Common base for the Twitch extractors.

    Provides the endpoint constants, an API-call helper with error
    checking, the login flow (including the two-factor step) and a helper
    that boosts the 'Source' format before sorting.
    """

    _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'

    _API_BASE = 'https://api.twitch.tv'
    _USHER_BASE = 'https://usher.ttvnw.net'
    _LOGIN_URL = 'https://www.twitch.tv/login'
    _CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6'
    _NETRC_MACHINE = 'twitch'

    def _handle_error(self, response):
        """Raise an expected ExtractorError when *response* is a dict with a truthy 'error'.

        Non-dict responses and dicts without an error are silently accepted.
        """
        error = response.get('error') if isinstance(response, dict) else None
        if not error:
            return
        message = '%s returned error: %s - %s' % (
            self.IE_NAME, error, response.get('message'))
        raise ExtractorError(message, expected=True)

    def _call_api(self, path, item_id, note):
        """Download JSON from ``_API_BASE/path`` with the Client-ID header.

        The response is passed through _handle_error before being returned.
        """
        api_url = '%s/%s' % (self._API_BASE, path)
        response = self._download_json(
            api_url, item_id, note,
            headers={'Client-ID': self._CLIENT_ID})
        self._handle_error(response)
        return response

    def _real_initialize(self):
        """Log in during extractor initialization."""
        self._login()

    def _login(self):
        """Log in with the configured credentials; do nothing without a username.

        Posts the login form and, if the redirect page contains the
        two-factor form, asks for a token and posts that form as well.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        def report_failure(message):
            # Always raises; used for errors reported by Twitch itself.
            raise ExtractorError(
                'Unable to login. Twitch said: %s' % message, expected=True)

        def submit_form(page, urlh, note, data):
            # Merge the page's hidden inputs with *data*, post them to the
            # form's action URL and follow the JSON 'redirect', if any.
            # Returns the (page, handle) pair of the redirect or None.
            fields = self._hidden_inputs(page)
            fields.update(data)

            page_url = urlh.geturl()
            action = self._search_regex(
                r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page,
                'post url', default=page_url, group='url')
            post_url = urljoin(page_url, action)
            headers = {'Referer': page_url}

            try:
                response = self._download_json(
                    post_url, None, note,
                    data=urlencode_postdata(fields),
                    headers=headers)
            except ExtractorError as err:
                cause = err.cause
                if isinstance(cause, compat_HTTPError) and cause.code == 400:
                    # The body of a 400 response is parsed as JSON and its
                    # 'message' is reported to the user.
                    error_body = self._parse_json(
                        cause.read().decode('utf-8'), None)
                    report_failure(error_body['message'])
                raise

            redirect = response.get('redirect')
            if not redirect:
                return None
            return self._download_webpage_handle(
                urljoin(post_url, redirect), None,
                'Downloading login redirect page', headers=headers)

        login_page, handle = self._download_webpage_handle(
            self._LOGIN_URL, None, 'Downloading login page')

        # Some TOR nodes and public proxies are blocked completely
        if 'blacklist_message' in login_page:
            report_failure(clean_html(login_page))

        credentials = {
            'username': username,
            'password': password,
        }
        redirect_res = submit_form(
            login_page, handle, 'Logging in as %s' % username, credentials)
        if not redirect_res:
            return
        redirect_page, handle = redirect_res

        tfa_form = re.search(
            r'(?i)<form[^>]+id="two-factor-submit"', redirect_page)
        if tfa_form is None:
            return

        # TODO: Add mechanism to request an SMS or phone call
        tfa_token = self._get_tfa_info('two-factor authentication token')
        submit_form(redirect_page, handle, 'Submitting TFA token', {
            'authy_token': tfa_token,
            'remember_2fa': 'true',
        })

    def _prefer_source(self, formats):
        """Give the first 'Source' format a preference of 10, then sort *formats*."""
        for fmt in formats:
            if fmt['format_id'] == 'Source':
                fmt['preference'] = 10
                break
        # When no Source stream is present the formats are sorted unchanged.
        self._sort_formats(formats)
131
132
class TwitchItemBaseIE(TwitchBaseIE):
    """Base for single-item extractors (video, chapter, vod).

    Subclasses supply ``_ITEM_TYPE`` (used in progress notes) and
    ``_ITEM_SHORTCUT`` (the one-letter prefix put before the numeric id).
    """

    def _download_info(self, item, item_id):
        """Fetch ``kraken/videos/<item><item_id>`` and convert it with _extract_info."""
        return self._extract_info(self._call_api(
            'kraken/videos/%s%s' % (item, item_id), item_id,
            'Downloading %s info JSON' % self._ITEM_TYPE))

    def _extract_media(self, item_id):
        """Build a playlist with one entry per chunk of the item.

        The ``api/videos`` response maps each quality name to a list of
        fragments; fragments at the same position across qualities become
        the formats of one playlist entry.
        """
        info = self._download_info(self._ITEM_SHORTCUT, item_id)
        response = self._call_api(
            'api/videos/%s%s' % (self._ITEM_SHORTCUT, item_id), item_id,
            'Downloading %s playlist JSON' % self._ITEM_TYPE)
        entries = []
        chunks = response['chunks']
        # keys() and values() of an unmodified dict iterate in the same
        # order, so each quality name lines up with its fragment list.
        qualities = list(chunks.keys())
        for num, fragment in enumerate(zip(*chunks.values()), start=1):
            formats = []
            for format_id, fragment_fmt in zip(qualities, fragment):
                fmt = {
                    'url': fragment_fmt['url'],
                    'format_id': format_id,
                    'quality': 1 if format_id == 'live' else 0,
                }
                # Quality names such as '720p' carry the height.
                m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
                if m:
                    fmt['height'] = int(m.group('height'))
                formats.append(fmt)
            self._sort_formats(formats)
            entry = dict(info)
            entry['id'] = '%s_%d' % (entry['id'], num)
            entry['title'] = '%s part %d' % (entry['title'], num)
            entry['formats'] = formats
            entries.append(entry)
        return self.playlist_result(entries, info['id'], info['title'])

    def _extract_info(self, info):
        """Convert a kraken video JSON object into an info dict.

        Only '_id' is required; the title falls back to
        'Untitled Broadcast' and all other fields may be None.
        """
        # 'channel' may be absent or explicitly null; a plain
        # info.get('channel', {}) would hand back None in the latter case
        # and the chained .get() would raise AttributeError.
        channel = info.get('channel') or {}
        return {
            'id': info['_id'],
            'title': info.get('title') or 'Untitled Broadcast',
            'description': info.get('description'),
            'duration': int_or_none(info.get('length')),
            'thumbnail': info.get('preview'),
            'uploader': channel.get('display_name'),
            'uploader_id': channel.get('name'),
            'timestamp': parse_iso8601(info.get('recorded_at')),
            'view_count': int_or_none(info.get('views')),
        }

    def _real_extract(self, url):
        """Extract the item identified by the id matched from *url*."""
        return self._extract_media(self._match_id(url))
183
184
class TwitchVideoIE(TwitchItemBaseIE):
    """Extractor for ``/<channel>/b/<id>`` URLs (item shortcut 'a')."""

    IE_NAME = 'twitch:video'
    _VALID_URL = TwitchBaseIE._VALID_URL_BASE + r'/[^/]+/b/(?P<id>\d+)'
    _ITEM_TYPE = 'video'
    _ITEM_SHORTCUT = 'a'

    _TEST = {
        'url': 'http://www.twitch.tv/riotgames/b/577357806',
        'info_dict': {
            'id': 'a577357806',
            'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
        },
        'playlist_mincount': 12,
        'skip': 'HTTP Error 404: Not Found',
    }
200
201
class TwitchChapterIE(TwitchItemBaseIE):
    """Extractor for ``/<channel>/c/<id>`` URLs (item shortcut 'c')."""

    IE_NAME = 'twitch:chapter'
    _VALID_URL = TwitchBaseIE._VALID_URL_BASE + r'/[^/]+/c/(?P<id>\d+)'
    _ITEM_TYPE = 'chapter'
    _ITEM_SHORTCUT = 'c'

    _TESTS = [
        {
            'url': 'http://www.twitch.tv/acracingleague/c/5285812',
            'info_dict': {
                'id': 'c5285812',
                'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
            },
            'playlist_mincount': 3,
            'skip': 'HTTP Error 404: Not Found',
        },
        {
            'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',
            'only_matching': True,
        },
    ]
220
221
class TwitchVodIE(TwitchItemBaseIE):
    """Extractor for VODs (item shortcut 'v').

    Matches ``/<channel>/v/<id>``, ``/videos/<id>`` and the embedded
    player URL carrying ``video=v<id>``.
    """

    IE_NAME = 'twitch:vod'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:www\.)?twitch\.tv/(?:[^/]+/v|videos)/|
                            player\.twitch\.tv/\?.*?\bvideo=v
                        )
                        (?P<id>\d+)
                    '''
    _ITEM_TYPE = 'vod'
    _ITEM_SHORTCUT = 'v'

    _TESTS = [
        {
            'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s',
            'info_dict': {
                'id': 'v6528877',
                'ext': 'mp4',
                'title': 'LCK Summer Split - Week 6 Day 1',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 17208,
                'timestamp': 1435131709,
                'upload_date': '20150624',
                'uploader': 'Riot Games',
                'uploader_id': 'riotgames',
                'view_count': int,
                'start_time': 310,
            },
            'params': {
                # m3u8 download
                'skip_download': True,
            },
        },
        {
            # Untitled broadcast (title is None)
            'url': 'http://www.twitch.tv/belkao_o/v/11230755',
            'info_dict': {
                'id': 'v11230755',
                'ext': 'mp4',
                'title': 'Untitled Broadcast',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 1638,
                'timestamp': 1439746708,
                'upload_date': '20150816',
                'uploader': 'BelkAO_o',
                'uploader_id': 'belkao_o',
                'view_count': int,
            },
            'params': {
                # m3u8 download
                'skip_download': True,
            },
            'skip': 'HTTP Error 404: Not Found',
        },
        {
            'url': 'http://player.twitch.tv/?t=5m10s&video=v6528877',
            'only_matching': True,
        },
        {
            'url': 'https://www.twitch.tv/videos/6528877',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Return the info dict for a VOD.

        Downloads the item info and an access token, extracts the HLS
        formats from the usher endpoint, honours a ``t=`` start offset in
        the URL query and, when a timestamp is known, attaches the rechat
        messages URL as a 'rechat' subtitle track.
        """
        item_id = self._match_id(url)

        info = self._download_info(self._ITEM_SHORTCUT, item_id)
        access_token = self._call_api(
            'api/vods/%s/access_token' % item_id, item_id,
            'Downloading %s access token' % self._ITEM_TYPE)

        usher_query = compat_urllib_parse_urlencode({
            'allow_source': 'true',
            'allow_audio_only': 'true',
            'allow_spectre': 'true',
            'player': 'twitchweb',
            'nauth': access_token['token'],
            'nauthsig': access_token['sig'],
        })
        playlist_url = '%s/vod/%s?%s' % (
            self._USHER_BASE, item_id, usher_query)
        formats = self._extract_m3u8_formats(
            playlist_url, item_id, 'mp4', entry_protocol='m3u8_native')

        self._prefer_source(formats)
        info['formats'] = formats

        # A ``t`` parameter in the page URL selects the start position.
        url_query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        if 't' in url_query:
            info['start_time'] = parse_duration(url_query['t'][0])

        if info.get('timestamp') is not None:
            rechat_url = update_url_query(
                'https://rechat.twitch.tv/rechat-messages', {
                    'video_id': 'v%s' % item_id,
                    'start': info['timestamp'],
                })
            info['subtitles'] = {
                'rechat': [{
                    'url': rechat_url,
                    'ext': 'json',
                }],
            }

        return info
324
325
class TwitchPlaylistBaseIE(TwitchBaseIE):
    """Base for channel playlist extractors.

    Subclasses supply ``_PLAYLIST_TYPE`` (used in progress notes) and may
    override ``_PLAYLIST_PATH``.
    """

    _PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d'
    _PAGE_LIMIT = 100

    def _extract_playlist(self, channel_id):
        """Page through the channel's videos and return them as a playlist.

        The playlist title is the channel's display name, falling back to
        its name. Video URLs are de-duplicated with orderedSet before
        being wrapped with url_result.
        """
        channel = self._call_api(
            'kraken/channels/%s' % channel_id,
            channel_id, 'Downloading channel info JSON')
        channel_name = channel.get('display_name') or channel.get('name')

        limit = self._PAGE_LIMIT
        offset = 0
        video_urls = []
        all_at_once = False
        page_label = None
        for page_num in itertools.count(1):
            page = self._call_api(
                self._PLAYLIST_PATH % (channel_id, offset, limit),
                channel_id,
                'Downloading %s JSON page %s'
                % (self._PLAYLIST_TYPE, page_label or page_num))
            page_urls = self._extract_playlist_page(page)
            if not page_urls:
                break
            total = int_or_none(page.get('_total'))
            # Since the beginning of March 2016 twitch's paging mechanism
            # was completely broken on the twitch side: it ignored the
            # limit and returned the whole offset number of videos.
            # The workaround is to request all videos at once by using the
            # total as the offset and discarding the oversized page.
            # Upd: pagination bug was fixed by twitch on 15.03.2016.
            if not all_at_once and total and len(page_urls) > limit:
                self.report_warning(
                    'Twitch pagination is broken on twitch side, requesting all videos at once',
                    channel_id)
                all_at_once = True
                offset = total
                page_label = '(all at once)'
                continue
            video_urls.extend(page_urls)
            if all_at_once or (total and len(page_urls) >= total):
                break
            offset += limit

        results = [self.url_result(video_url)
                   for video_url in orderedSet(video_urls)]
        return self.playlist_result(results, channel_id, channel_name)

    def _extract_playlist_page(self, response):
        """Return the 'url' of every video in *response*, or [] when there are none."""
        videos = response.get('videos')
        if not videos:
            return []
        return [video['url'] for video in videos]

    def _real_extract(self, url):
        """Extract the playlist of the channel id matched from *url*."""
        return self._extract_playlist(self._match_id(url))
377
378
class TwitchProfileIE(TwitchPlaylistBaseIE):
    """Playlist extractor for ``/<channel>/profile`` URLs."""

    IE_NAME = 'twitch:profile'
    _VALID_URL = (
        TwitchBaseIE._VALID_URL_BASE
        + r'/(?P<id>[^/]+)/profile/?(?:\#.*)?$')
    _PLAYLIST_TYPE = 'profile'

    _TEST = {
        'url': 'http://www.twitch.tv/vanillatv/profile',
        'info_dict': {
            'id': 'vanillatv',
            'title': 'VanillaTV',
        },
        'playlist_mincount': 412,
    }
392
393
class TwitchVideosBaseIE(TwitchPlaylistBaseIE):
    """Base for the ``/<channel>/videos/...`` playlist extractors."""

    # Common URL prefix; subclasses append their own section name.
    _VALID_URL_VIDEOS_BASE = (
        TwitchBaseIE._VALID_URL_BASE + r'/(?P<id>[^/]+)/videos')
    # Subclasses append the broadcast type value(s) to this path.
    _PLAYLIST_PATH = TwitchPlaylistBaseIE._PLAYLIST_PATH + '&broadcast_type='
397
398
class TwitchAllVideosIE(TwitchVideosBaseIE):
    """Playlist extractor for ``/<channel>/videos/all`` URLs."""

    IE_NAME = 'twitch:videos:all'
    _VALID_URL = TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + r'/all'
    # Requests the archive, upload and highlight broadcast types together.
    _PLAYLIST_PATH = (
        TwitchVideosBaseIE._PLAYLIST_PATH + 'archive,upload,highlight')
    _PLAYLIST_TYPE = 'all videos'

    _TEST = {
        'url': 'https://www.twitch.tv/spamfish/videos/all',
        'info_dict': {
            'id': 'spamfish',
            'title': 'Spamfish',
        },
        'playlist_mincount': 869,
    }
413
414
class TwitchUploadsIE(TwitchVideosBaseIE):
    """Playlist extractor for ``/<channel>/videos/uploads`` URLs."""

    IE_NAME = 'twitch:videos:uploads'
    _VALID_URL = TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + r'/uploads'
    # Restricts the listing to the 'upload' broadcast type.
    _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'upload'
    _PLAYLIST_TYPE = 'uploads'

    _TEST = {
        'url': 'https://www.twitch.tv/spamfish/videos/uploads',
        'info_dict': {
            'id': 'spamfish',
            'title': 'Spamfish',
        },
        'playlist_mincount': 0,
    }
429
430
class TwitchPastBroadcastsIE(TwitchVideosBaseIE):
    """Playlist extractor for ``/<channel>/videos/past-broadcasts`` URLs."""

    IE_NAME = 'twitch:videos:past-broadcasts'
    _VALID_URL = (
        TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + r'/past-broadcasts')
    # Restricts the listing to the 'archive' broadcast type.
    _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive'
    _PLAYLIST_TYPE = 'past broadcasts'

    _TEST = {
        'url': 'https://www.twitch.tv/spamfish/videos/past-broadcasts',
        'info_dict': {
            'id': 'spamfish',
            'title': 'Spamfish',
        },
        'playlist_mincount': 0,
    }
445
446
class TwitchHighlightsIE(TwitchVideosBaseIE):
    """Playlist extractor for ``/<channel>/videos/highlights`` URLs."""

    IE_NAME = 'twitch:videos:highlights'
    _VALID_URL = TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + r'/highlights'
    # Restricts the listing to the 'highlight' broadcast type.
    _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'highlight'
    _PLAYLIST_TYPE = 'highlights'

    _TEST = {
        'url': 'https://www.twitch.tv/spamfish/videos/highlights',
        'info_dict': {
            'id': 'spamfish',
            'title': 'Spamfish',
        },
        'playlist_mincount': 805,
    }
461
462
class TwitchStreamIE(TwitchBaseIE):
    """Extractor for live channel streams.

    Matches ``twitch.tv/<channel>`` and the embedded player URL carrying
    ``channel=<channel>``; URLs claimed by the more specific Twitch
    extractors are rejected in suitable().
    """

    IE_NAME = 'twitch:stream'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:www\.)?twitch\.tv/|
                            player\.twitch\.tv/\?.*?\bchannel=
                        )
                        (?P<id>[^/#?]+)
                    '''

    _TESTS = [
        {
            'url': 'http://www.twitch.tv/shroomztv',
            'info_dict': {
                'id': '12772022048',
                'display_id': 'shroomztv',
                'ext': 'mp4',
                'title': 're:^ShroomzTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
                'description': 'H1Z1 - lonewolfing with ShroomzTV | A3 Battle Royale later - @ShroomzTV',
                'is_live': True,
                'timestamp': 1421928037,
                'upload_date': '20150122',
                'uploader': 'ShroomzTV',
                'uploader_id': 'shroomztv',
                'view_count': int,
            },
            'params': {
                # m3u8 download
                'skip_download': True,
            },
        },
        {
            'url': 'http://www.twitch.tv/miracle_doto#profile-0',
            'only_matching': True,
        },
        {
            'url': 'https://player.twitch.tv/?channel=lotsofs',
            'only_matching': True,
        },
    ]

    @classmethod
    def suitable(cls, url):
        """Return False for URLs another Twitch extractor handles, else defer to the base check."""
        more_specific = (
            TwitchVideoIE,
            TwitchChapterIE,
            TwitchVodIE,
            TwitchProfileIE,
            TwitchAllVideosIE,
            TwitchUploadsIE,
            TwitchPastBroadcastsIE,
            TwitchHighlightsIE,
        )
        if any(ie.suitable(url) for ie in more_specific):
            return False
        return super(TwitchStreamIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return the info dict of the channel's live stream.

        Raises an expected ExtractorError when the API reports no stream
        for the channel.
        """
        channel_id = self._match_id(url)

        stream = self._call_api(
            'kraken/streams/%s?stream_type=all' % channel_id, channel_id,
            'Downloading stream JSON').get('stream')
        if not stream:
            raise ExtractorError('%s is offline' % channel_id, expected=True)

        # The channel name may be typed in a different case than the
        # original channel name (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON),
        # which would lead to constructing an invalid m3u8 URL. Use the
        # original channel name from the stream JSON and fall back to the
        # lowercased id if it is not available.
        channel_id = stream.get('channel', {}).get('name') or channel_id.lower()

        access_token = self._call_api(
            'api/channels/%s/access_token' % channel_id, channel_id,
            'Downloading channel access token')

        query = {
            'allow_source': 'true',
            'allow_audio_only': 'true',
            'allow_spectre': 'true',
            'p': random.randint(1000000, 10000000),
            'player': 'twitchweb',
            'segment_preference': '4',
            'sig': access_token['sig'].encode('utf-8'),
            'token': access_token['token'].encode('utf-8'),
        }
        playlist_url = '%s/api/channel/hls/%s.m3u8?%s' % (
            self._USHER_BASE, channel_id,
            compat_urllib_parse_urlencode(query))
        formats = self._extract_m3u8_formats(playlist_url, channel_id, 'mp4')
        self._prefer_source(formats)

        view_count = stream.get('viewers')
        timestamp = parse_iso8601(stream.get('created_at'))

        channel = stream['channel']
        title = self._live_title(
            channel.get('display_name') or channel.get('name'))
        description = channel.get('status')

        # Only preview entries whose key ends in '<width>x<height>.jpg'
        # are kept; the size is taken from the key.
        thumbnails = []
        for preview_key, preview_url in stream['preview'].items():
            size = re.search(
                r'(?P<width>\d+)x(?P<height>\d+)\.jpg$', preview_key)
            if size:
                thumbnails.append({
                    'url': preview_url,
                    'width': int(size.group('width')),
                    'height': int(size.group('height')),
                })

        return {
            'id': compat_str(stream['_id']),
            'display_id': channel_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'uploader': channel.get('display_name'),
            'uploader_id': channel.get('name'),
            'timestamp': timestamp,
            'view_count': view_count,
            'formats': formats,
            'is_live': True,
        }
582
583
class TwitchClipsIE(InfoExtractor):
    """Extractor for ``clips.twitch.tv`` clip pages.

    Clip metadata is read from the ``clipInfo`` JavaScript object embedded
    in the page.
    """

    IE_NAME = 'twitch:clips'
    _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'

    _TESTS = [{
        'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound',
        'md5': '761769e1eafce0ffebfb4089cb3847cd',
        'info_dict': {
            'id': 'AggressiveCobraPoooound',
            'ext': 'mp4',
            'title': 'EA Play 2016 Live from the Novo Theatre',
            'thumbnail': r're:^https?://.*\.jpg',
            'creator': 'EA',
            'uploader': 'stereotype_',
            'uploader_id': 'stereotype_',
        },
    }, {
        # multiple formats
        'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for a clip.

        Formats come from the clip's 'quality_options' entries that have
        a 'source'; when there are none, 'clip_video_url' is used as the
        single format.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        clip = self._parse_json(
            self._search_regex(
                r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'),
            video_id, transform_source=js_to_json)

        title = clip.get('channel_title') or self._og_search_title(webpage)

        formats = [{
            'url': option['source'],
            'format_id': option.get('quality'),
            'height': int_or_none(option.get('quality')),
        } for option in clip.get('quality_options', []) if option.get('source')]

        if not formats:
            formats = [{
                'url': clip['clip_video_url'],
            }]

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage),
            'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'),
            # As in the other Twitch extractors of this file, 'uploader'
            # is the display name and 'uploader_id' the login name; the
            # two curator fields used to be assigned the other way round.
            'uploader': clip.get('curator_display_name'),
            'uploader_id': clip.get('curator_login'),
            'formats': formats,
        }