Fix imports and general cleanup
youtube_dl/extractor/twitch.py
# coding: utf-8
from __future__ import unicode_literals

import itertools
import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse,
    compat_urllib_request,
)
from ..utils import (
    ExtractorError,
    parse_iso8601,
)


class TwitchIE(InfoExtractor):
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?
    _VALID_URL = r"""(?x)^(?:https?://)?(?:www\.)?twitch\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
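    # The three URL forms map to the named groups above:
    #   twitch.tv/<channel>        -> channelid (all videos of a channel)
    #   twitch.tv/<channel>/b/<id> -> videoid (archived broadcast)
    #   twitch.tv/<channel>/c/<id> -> chapterid (chapter)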
    _PAGE_LIMIT = 100
    _API_BASE = 'https://api.twitch.tv'
    _LOGIN_URL = 'https://secure.twitch.tv/user/login'
    _TESTS = [{
        'url': 'http://www.twitch.tv/riotgames/b/577357806',
        'info_dict': {
            'id': 'a577357806',
            'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
        },
        'playlist_mincount': 12,
    }, {
        'url': 'http://www.twitch.tv/acracingleague/c/5285812',
        'info_dict': {
            'id': 'c5285812',
            'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
        },
        'playlist_mincount': 3,
    }, {
        'url': 'http://www.twitch.tv/vanillatv',
        'info_dict': {
            'id': 'vanillatv',
            'title': 'VanillaTV',
        },
        'playlist_mincount': 412,
    }]

    def _handle_error(self, response):
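        # Kraken error payloads are assumed to look roughly like
        # {"error": "...", "status": ..., "message": "..."} (hypothetical
        # shape); only 'error' and 'message' are read here.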
        if not isinstance(response, dict):
            return
        error = response.get('error')
        if error:
            raise ExtractorError(
                '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')),
                expected=True)

    def _download_json(self, url, video_id, note='Downloading JSON metadata', *args, **kwargs):
        # Forward any extra arguments (errnote, fatal, ...) so the override
        # stays compatible with the base class signature.
        response = super(TwitchIE, self)._download_json(url, video_id, note, *args, **kwargs)
        self._handle_error(response)
        return response

    def _extract_media(self, item, item_id):
        ITEMS = {
            'a': 'video',
            'c': 'chapter',
        }
        info = self._extract_info(self._download_json(
            '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
            'Downloading %s info JSON' % ITEMS[item]))
        response = self._download_json(
            '%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
            'Downloading %s playlist JSON' % ITEMS[item])
        entries = []
        chunks = response['chunks']
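        # Assumed response shape: 'chunks' maps a quality name to a list of
        # fragment dicts, e.g. {'live': [{'url': ...}, ...], '240p': [...]}.
        # zip(*chunks.values()) below therefore yields one tuple per part,
        # holding that part's fragment in every available quality.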
        qualities = list(chunks.keys())
        for num, fragment in enumerate(zip(*chunks.values()), start=1):
            formats = []
            for fmt_num, fragment_fmt in enumerate(fragment):
                format_id = qualities[fmt_num]
                fmt = {
                    'url': fragment_fmt['url'],
                    'format_id': format_id,
                    'quality': 1 if format_id == 'live' else 0,
                }
                m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
                if m:
                    fmt['height'] = int(m.group('height'))
                formats.append(fmt)
            self._sort_formats(formats)
            entry = dict(info)
            entry['id'] = '%s_%d' % (entry['id'], num)
            entry['title'] = '%s part %d' % (entry['title'], num)
            entry['formats'] = formats
            entries.append(entry)
        return self.playlist_result(entries, info['id'], info['title'])

    def _extract_info(self, info):
        return {
            'id': info['_id'],
            'title': info['title'],
            'description': info['description'],
            'duration': info['length'],
            'thumbnail': info['preview'],
            'uploader': info['channel']['display_name'],
            'uploader_id': info['channel']['name'],
            'timestamp': parse_iso8601(info['recorded_at']),
            'view_count': info['views'],
        }
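
    # A kraken video object is assumed to carry at least the fields read
    # above, e.g. (hypothetical, abridged):
    #   {'_id': 'a577357806', 'title': '...', 'length': 5694,
    #    'preview': 'http://...', 'recorded_at': '2014-10-03T12:09:16Z',
    #    'views': 1234,
    #    'channel': {'name': 'riotgames', 'display_name': 'Riot Games'}}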

    def _real_initialize(self):
        self._login()

    def _login(self):
        (username, password) = self._get_login_info()
        if username is None:
            return

        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')

        authenticity_token = self._search_regex(
            r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
            login_page, 'authenticity token')

        login_form = {
            'utf8': '✓'.encode('utf-8'),
            'authenticity_token': authenticity_token,
            'redirect_on_login': '',
            'embed_form': 'false',
            'mp_source_action': '',
            'follow': '',
            'user[login]': username,
            'user[password]': password,
        }
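        # These field names are assumed to mirror the hidden inputs of the
        # Twitch login form; only authenticity_token is scraped dynamically,
        # the rest are sent verbatim.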

        request = compat_urllib_request.Request(
            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
        request.add_header('Referer', self._LOGIN_URL)
        response = self._download_webpage(
            request, None, 'Logging in as %s' % username)

        m = re.search(
            r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
        if m:
            raise ExtractorError(
                'Unable to login: %s' % m.group('msg').strip(), expected=True)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj.group('chapterid'):
            return self._extract_media('c', mobj.group('chapterid'))

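            # The legacy chapter extraction below is unreachable dead code,
            # kept as a string literal for reference only.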
            """
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError('Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            doc = self._download_xml(
                api, chapter_id,
                note='Downloading chapter information',
                errnote='Chapter information download failed')
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError('Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or 'flv'

            chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info = self._download_json(
                chapter_api_url, 'c' + chapter_id,
                note='Downloading chapter metadata',
                errnote='Download of chapter metadata failed')

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += '?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning('Chapter detected, but we can just download the whole file. '
                                            'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': 'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return info
            """
        elif mobj.group('videoid'):
            return self._extract_media('a', mobj.group('videoid'))
        elif mobj.group('channelid'):
            channel_id = mobj.group('channelid')
            info = self._download_json(
                '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
                channel_id, 'Downloading channel info JSON')
            channel_name = info.get('display_name') or info.get('name')
            entries = []
            offset = 0
            limit = self._PAGE_LIMIT
            for counter in itertools.count(1):
                response = self._download_json(
                    '%s/kraken/channels/%s/videos/?offset=%d&limit=%d'
                    % (self._API_BASE, channel_id, offset, limit),
                    channel_id, 'Downloading channel videos JSON page %d' % counter)
                videos = response['videos']
                if not videos:
                    break
                entries.extend([self.url_result(video['url'], 'Twitch') for video in videos])
                offset += limit
            return self.playlist_result(entries, channel_id, channel_name)
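

# Minimal smoke test (hypothetical usage, run from a youtube-dl checkout):
#
#   from youtube_dl import YoutubeDL
#   with YoutubeDL({'skip_download': True}) as ydl:
#       ydl.extract_info('http://www.twitch.tv/riotgames/b/577357806')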