[justintv] Add new extractor
[youtube-dl] youtube_dl/extractor/justintv.py
from __future__ import unicode_literals

import json
import os
import re

from .common import InfoExtractor
from ..utils import (
    compat_str,
    ExtractorError,
    formatSeconds,
)


class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
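    # The alternatives above match, in order, a channel archive page
    # (http://www.twitch.tv/<channel>), a single broadcast
    # (http://www.twitch.tv/<channel>/b/<id>) and a chapter
    # (http://www.twitch.tv/<channel>/c/<id>).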
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = 'justin.tv'
    IE_DESC = 'justin.tv and twitch.tv'
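    # The test exercises the single-broadcast (/b/) branch; channel URLs
    # are paged through the archive API and returned as a playlist.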
    _TEST = {
        'url': 'http://www.twitch.tv/thegamedevhub/b/296128360',
        'md5': 'ecaa8a790c22a40770901460af191c9a',
        'info_dict': {
            'id': '296128360',
            'ext': 'flv',
            'upload_date': '20110927',
            'uploader_id': 25114803,
            'uploader': 'thegamedevhub',
            'title': 'Beginner Series - Scripting With Python Pt.1'
        }
    }

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen('%s: Downloading video information from %d to %d' %
                       (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Returns the number of items on the page and a list of the *valid*
    # items (clips without a video URL are skipped).
    def _parse_page(self, url, video_id):
        info_json = self._download_webpage(
            url, video_id,
            'Downloading video info JSON',
            'unable to download video info JSON')

        response = json.loads(info_json)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            raise ExtractorError('Justin.tv API: %s' % error_text)
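        # A successful response is a JSON list of clip objects. The shape
        # assumed from the fields read below is roughly:
        #   [{"id": ..., "video_file_url": "...", "start_time": "YYYY-MM-DD...",
        #     "user_id": ..., "channel_name": "...", "title": "..."}, ...]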
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = clip['start_time'][:10].replace('-', '')
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': compat_str(video_id),
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError('Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            doc = self._download_xml(
                api, chapter_id,
                note='Downloading chapter information',
                errnote='Chapter information download failed')
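            # Locate the <archive> element whose <id> matches the id
            # scraped from the chapter page; the for/else raises if none does.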
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError('Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = os.path.splitext(video_url)[1][1:] or 'flv'

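            # Fetch human-readable metadata (title, preview image, channel)
            # for the chapter from Twitch's newer Kraken API.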
            chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info = self._download_json(
                chapter_api_url, 'c' + chapter_id,
                note='Downloading chapter metadata',
                errnote='Download of chapter metadata failed')

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += '?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(
                'Chapter detected, but we can just download the whole file. '
                'Chapter starts at %s and ends at %s' % (
                    formatSeconds(bracket_start), formatSeconds(bracket_end)))
122
123             info = {
124                 'id': 'c' + chapter_id,
125                 'url': video_url,
126                 'ext': video_ext,
127                 'title': chapter_info['title'],
128                 'thumbnail': chapter_info['preview'],
129                 'description': chapter_info['description'],
130                 'uploader': chapter_info['channel']['display_name'],
131                 'uploader_id': chapter_info['channel']['name'],
132             }
133             return info
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        entries = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
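        # Page through the archive API in chunks of `limit` items; a short
        # page (fewer than `limit` results) marks the end of the archive.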
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            entries.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return {
            '_type': 'playlist',
            'id': video_id,
            'entries': entries,
        }