[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / douyutv.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import time
5 import hashlib
6 import re
7
8 from .common import InfoExtractor
9 from ..utils import (
10     ExtractorError,
11     unescapeHTML,
12     unified_strdate,
13     urljoin,
14 )
15
16
class DouyuTVIE(InfoExtractor):
    """Extractor for Douyu live rooms on douyu.com / douyutv.com.

    The last path component of the URL is either a numeric room id or a
    name; names are resolved to a room id by scraping the room page.
    Metadata is read from the mobile API and the stream URL from the
    PC client API.
    """
    IE_DESC = '斗鱼'
    _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?:[^/]+/)*(?P<id>[A-Za-z0-9]+)'
    _TESTS = [{
        'url': 'http://www.douyutv.com/iseven',
        'info_dict': {
            'id': '17732',
            'display_id': 'iseven',
            'ext': 'flv',
            'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': r're:.*m7show@163\.com.*',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': '7师傅',
            'is_live': True,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.douyutv.com/85982',
        'info_dict': {
            'id': '85982',
            'display_id': '85982',
            'ext': 'flv',
            'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': 'md5:746a2f7a253966a06755a912f0acc0d2',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'douyu小漠',
            'is_live': True,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Room not found',
    }, {
        'url': 'http://www.douyutv.com/17732',
        'info_dict': {
            'id': '17732',
            'display_id': '17732',
            'ext': 'flv',
            'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': r're:.*m7show@163\.com.*',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': '7师傅',
            'is_live': True,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.douyu.com/xiaocang',
        'only_matching': True,
    }, {
        # URL with an extra path segment; the room id regex in
        # _real_extract also accepts the escaped form \"room_id\"
        'url': 'http://www.douyu.com/t/lpl',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for the live room addressed by *url*.

        Raises ExtractorError (expected) when the room reports itself
        as offline.
        """
        display_id = self._match_id(url)

        # A purely numeric id is used as the room id directly; anything
        # else has to be looked up in the room page.
        room_id = display_id
        if not display_id.isdigit():
            webpage = self._download_webpage(url, display_id)
            room_id = self._html_search_regex(
                r'"room_id\\?"\s*:\s*(\d+),', webpage, 'room id')

        # Room metadata is taken from the mobile API
        room_response = self._download_json(
            'http://m.douyu.com/html5/live?roomId=%s' % room_id, display_id,
            note='Downloading room info')
        room = room_response['data']

        # show_status: 1 = live, 2 = offline
        if room.get('show_status') == '2':
            raise ExtractorError('Live stream is offline', expected=True)

        # The stream URL is taken from the PC client API instead, because
        # the m3u8 url from the mobile API requires re-authentication
        # every 5 minutes.
        # The request is signed with the MD5 of the API path, its
        # parameters, the current timestamp and a fixed suffix.
        timestamp = int(time.time())
        sign_payload = 'lapi/live/thirdPart/getPlay/%s?aid=pcclient&rate=0&time=%d9TUk5fjjUjg9qIMH3sdnh' % (room_id, timestamp)
        auth_headers = {
            'auth': hashlib.md5(sign_payload.encode('ascii')).hexdigest(),
            'time': str(timestamp),
            'aid': 'pcclient',
        }
        play_response = self._download_json(
            'http://coapi.douyucdn.cn/lapi/live/thirdPart/getPlay/' + room_id,
            display_id, note='Downloading video URL info',
            query={'rate': 0}, headers=auth_headers)
        stream_url = play_response['data']['live_url']

        return {
            'id': room_id,
            'display_id': display_id,
            'url': stream_url,
            'title': self._live_title(unescapeHTML(room['room_name'])),
            'description': room.get('show_details'),
            'thumbnail': room.get('room_src'),
            'uploader': room.get('nickname'),
            'is_live': True,
        }
123
124
class DouyuShowIE(InfoExtractor):
    """Extractor for Douyu videos under v.douyu.com/show/<id>.

    vmobile.douyu.com URLs are accepted too; they are rewritten to
    v.douyu.com before the page is downloaded. Title, duration and
    thumbnail come from the ``$ROOM`` object embedded in the page, the
    m3u8 URL from the ``video/getInfo`` endpoint, and upload date and
    uploader details are scraped from the page HTML.
    """
    _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'

    _TESTS = [{
        'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
        'md5': '0c2cfd068ee2afe657801269b2d86214',
        'info_dict': {
            'id': 'rjNBdvnVXNzvE2yw',
            'ext': 'mp4',
            'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场',
            'duration': 7150.08,
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': '陈一发儿',
            'uploader_id': 'XrZwYelr5wbK',
            'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
            'upload_date': '20170402',
        },
    }, {
        'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
        'only_matching': True,
    }]

    # How many times the getInfo endpoint is queried before giving up
    _GET_INFO_TRIES = 5

    def _real_extract(self, url):
        """Return the info dict for the video addressed by *url*.

        Raises ExtractorError when the video info could not be fetched
        after ``_GET_INFO_TRIES`` attempts.
        """
        url = url.replace('vmobile.', 'v.')
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        room_info = self._parse_json(self._search_regex(
            r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)

        video_info = None
        last_error = None

        for attempt in range(1, self._GET_INFO_TRIES + 1):
            # Sometimes Douyu rejects our request. Let's try it more times
            try:
                video_info = self._download_json(
                    'https://vmobile.douyu.com/video/getInfo', video_id,
                    query={'vid': video_id},
                    headers={
                        'Referer': url,
                        'x-requested-with': 'XMLHttpRequest',
                    })
                break
            except ExtractorError as e:
                # Keep the failure so it can be reported as the cause if
                # every attempt fails (``e`` is unbound after the except
                # block on Python 3).
                last_error = e
                # Only wait between attempts; sleeping after the final
                # failure would merely delay the error.
                if attempt < self._GET_INFO_TRIES:
                    self._sleep(1, video_id)

        if not video_info:
            raise ExtractorError('Can\'t fetch video info', cause=last_error)

        formats = self._extract_m3u8_formats(
            video_info['data']['video_url'], video_id,
            entry_protocol='m3u8_native', ext='mp4')

        # Optional: a missing upload date must not abort the extraction
        upload_date = unified_strdate(self._html_search_regex(
            r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
            'upload date', fatal=False))

        # Uploader details are optional as well and stay None when the
        # author link is not found in the page.
        uploader = uploader_id = uploader_url = None
        mobj = re.search(
            r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
            webpage)
        if mobj:
            uploader_id, uploader = mobj.groups()
            uploader_url = urljoin(url, '/author/' + uploader_id)

        return {
            'id': video_id,
            'title': room_info['name'],
            'formats': formats,
            'duration': room_info.get('duration'),
            'thumbnail': room_info.get('pic'),
            'upload_date': upload_date,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        }