[youtube] Skip unsupported adaptive stream type (#18804)
[youtube-dl] / youtube_dl / extractor / bilibili.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import hashlib
5 import re
6
7 from .common import InfoExtractor
8 from ..compat import (
9     compat_parse_qs,
10     compat_urlparse,
11 )
12 from ..utils import (
13     ExtractorError,
14     int_or_none,
15     float_or_none,
16     parse_iso8601,
17     smuggle_url,
18     strip_jsonp,
19     unified_timestamp,
20     unsmuggle_url,
21     urlencode_postdata,
22 )
23
24
class BiliBiliIE(InfoExtractor):
    """Extractor for single bilibili videos and single bangumi episodes.

    Handles ``/video/av<ID>`` pages as well as
    ``bangumi.bilibili.com/anime/<anime_id>/play#<episode_id>`` pages.
    """

    _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\d+)/play#)(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://www.bilibili.tv/video/av1074402/',
        'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
        'info_dict': {
            'id': '1074402',
            'ext': 'flv',
            'title': '【金坷垃】金泡沫',
            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
            'duration': 308.067,
            'timestamp': 1398012678,
            'upload_date': '20140420',
            'thumbnail': r're:^https?://.+\.jpg',
            'uploader': '菊子桑',
            'uploader_id': '156160',
        },
    }, {
        # Tested in BiliBiliBangumiIE
        'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
        'only_matching': True,
    }, {
        'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
        'md5': '3f721ad1e75030cc06faf73587cfec57',
        'info_dict': {
            'id': '100643',
            'ext': 'mp4',
            'title': 'CHAOS;CHILD',
            'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
        },
        'skip': 'Geo-restricted to China',
    }, {
        # Title with double quotes
        'url': 'http://www.bilibili.com/video/av8903802/',
        'info_dict': {
            'id': '8903802',
            'title': '阿滴英文|英文歌分享#6 "Closer',
            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
        },
        'playlist': [{
            'info_dict': {
                'id': '8903802_part1',
                'ext': 'flv',
                'title': '阿滴英文|英文歌分享#6 "Closer',
                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
                'uploader': '阿滴英文',
                'uploader_id': '65880958',
                'timestamp': 1488382634,
                'upload_date': '20170301',
            },
            'params': {
                'skip_download': True,  # Test metadata only
            },
        }, {
            'info_dict': {
                'id': '8903802_part2',
                'ext': 'flv',
                'title': '阿滴英文|英文歌分享#6 "Closer',
                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
                'uploader': '阿滴英文',
                'uploader_id': '65880958',
                'timestamp': 1488382634,
                'upload_date': '20170301',
            },
            'params': {
                'skip_download': True,  # Test metadata only
            },
        }]
    }]

    # Credentials used to sign requests to the playurl interface
    # (see the md5 signature computed in _real_extract).
    _APP_KEY = '84956560bc028eb7'
    _BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e'

    def _report_error(self, result):
        """Raise an ExtractorError built from an API error payload.

        ``result`` is the decoded JSON response. Its ``message`` is
        preferred, then its ``code``; if neither is present a generic
        (unexpected) error is raised. This method never returns.
        """
        if 'message' in result:
            raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True)
        elif 'code' in result:
            # %s instead of %d: identical output for integers, and no
            # TypeError if the API delivers the code as a string.
            raise ExtractorError('%s returns error %s' % (self.IE_NAME, result['code']), expected=True)
        else:
            raise ExtractorError('Can\'t extract Bangumi episode ID')

    def _real_extract(self, url):
        """Extract one video (possibly split into several parts).

        Returns a single info dict when the video has exactly one part,
        otherwise a ``multi_video`` result whose entries are the parts.
        """
        url, smuggled_data = unsmuggle_url(url, {})

        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        anime_id = mobj.group('anime_id')
        webpage = self._download_webpage(url, video_id)

        if 'anime/' not in url:
            # Regular video page: the cid is either inlined in the page or
            # has to be dug out of the embedded player parameters.
            cid = self._search_regex(
                r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
                default=None
            ) or compat_parse_qs(self._search_regex(
                [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
                 r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)',
                 r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
                webpage, 'player parameters'))['cid'][0]
        else:
            # Bangumi episode: the cid comes from the get_source web API.
            if 'no_bangumi_tip' not in smuggled_data:
                self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dl with %s' % (
                    video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id)))
            headers = {
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Referer': url
            }
            headers.update(self.geo_verification_headers())

            js = self._download_json(
                'http://bangumi.bilibili.com/web_api/get_source', video_id,
                data=urlencode_postdata({'episode_id': video_id}),
                headers=headers)
            if 'result' not in js:
                self._report_error(js)
            cid = js['result']['cid']

        headers = {
            'Referer': url
        }
        headers.update(self.geo_verification_headers())

        entries = []

        # Renditions are tried in order; the first one that yields a
        # 'durl' list wins. Only the last attempt is fatal.
        RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
        for num, rendition in enumerate(RENDITIONS, start=1):
            payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
            sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()

            video_info = self._download_json(
                'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
                video_id, note='Downloading video info page',
                headers=headers, fatal=num == len(RENDITIONS))

            if not video_info:
                continue

            if 'durl' not in video_info:
                if num < len(RENDITIONS):
                    continue
                self._report_error(video_info)

            for idx, durl in enumerate(video_info['durl']):
                formats = [{
                    'url': durl['url'],
                    # .get(): a missing size must not abort extraction,
                    # int_or_none already copes with None.
                    'filesize': int_or_none(durl.get('size')),
                }]
                for backup_url in durl.get('backup_url', []):
                    formats.append({
                        'url': backup_url,
                        # backup URLs have lower priorities
                        'preference': -2 if 'hd.mp4' in backup_url else -3,
                    })

                for a_format in formats:
                    a_format.setdefault('http_headers', {}).update({
                        'Referer': url,
                    })

                self._sort_formats(formats)

                # NOTE(review): 'id' and 'duration' set here are overwritten
                # by entry.update(info) below; for multi-part videos the id
                # is then reassigned as 1-based '<video_id>_part<N>'.
                entries.append({
                    'id': '%s_part%s' % (video_id, idx),
                    'duration': float_or_none(durl.get('length'), 1000),
                    'formats': formats,
                })
            break

        # Raw strings are required here: in a plain literal '\b' is a
        # backspace and '\1' is chr(1), which made the title-attribute
        # pattern impossible to match.
        title = self._html_search_regex(
            (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
            group='title')
        description = self._html_search_meta('description', webpage)
        timestamp = unified_timestamp(self._html_search_regex(
            r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
            default=None) or self._html_search_meta(
            'uploadDate', webpage, 'timestamp', default=None))
        thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)

        # TODO 'view_count' requires deobfuscating Javascript
        info = {
            'id': video_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'thumbnail': thumbnail,
            'duration': float_or_none(video_info.get('timelength'), scale=1000),
        }

        uploader_mobj = re.search(
            r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)',
            webpage)
        if uploader_mobj:
            info.update({
                'uploader': uploader_mobj.group('name'),
                'uploader_id': uploader_mobj.group('id'),
            })
        if not info.get('uploader'):
            info['uploader'] = self._html_search_meta(
                'author', webpage, 'uploader', default=None)

        # Every part shares the page-level metadata.
        for entry in entries:
            entry.update(info)

        if len(entries) == 1:
            return entries[0]
        else:
            for idx, entry in enumerate(entries):
                entry['id'] = '%s_part%d' % (video_id, (idx + 1))

            return {
                '_type': 'multi_video',
                'id': video_id,
                'title': title,
                'description': description,
                'entries': entries,
            }
242
243
class BiliBiliBangumiIE(InfoExtractor):
    """Playlist extractor for a whole bangumi (anime season) page.

    Each episode is delegated to BiliBiliIE through a ``url_transparent``
    entry.
    """

    _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)'

    IE_NAME = 'bangumi.bilibili.com'
    IE_DESC = 'BiliBili番剧'

    _TESTS = [{
        'url': 'http://bangumi.bilibili.com/anime/1869',
        'info_dict': {
            'id': '1869',
            'title': '混沌武士',
            'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
        },
        'playlist_count': 26,
    }, {
        'url': 'http://bangumi.bilibili.com/anime/1869',
        'info_dict': {
            'id': '1869',
            'title': '混沌武士',
            'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
        },
        'playlist': [{
            'md5': '91da8621454dd58316851c27c68b0c13',
            'info_dict': {
                'id': '40062',
                'ext': 'mp4',
                'title': '混沌武士',
                'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...',
                'timestamp': 1414538739,
                'upload_date': '20141028',
                'episode': '疾风怒涛 Tempestuous Temperaments',
                'episode_number': 1,
            },
        }],
        'params': {
            'playlist_items': '1',
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that BiliBiliIE does not already claim.

        Episode URLs (``.../anime/<id>/play#<episode>``) also match this
        class's _VALID_URL, so they are explicitly left to BiliBiliIE.
        """
        return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist of all episodes, ordered by episode number."""
        bangumi_id = self._match_id(url)

        # Sometimes this API returns a JSONP response
        season_info = self._download_json(
            'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id,
            bangumi_id, transform_source=strip_jsonp)['result']

        entries = [{
            '_type': 'url_transparent',
            # The smuggled flag suppresses BiliBiliIE's "download the whole
            # anime" hint for every single episode.
            'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}),
            'ie_key': BiliBiliIE.ie_key(),
            'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '),
            'episode': episode.get('index_title'),
            'episode_number': int_or_none(episode.get('index')),
        } for episode in season_info['episodes']]

        def _episode_order(entry):
            # int_or_none may yield None; None is not orderable against int
            # on Python 3. Unnumbered episodes sort first, as on Python 2.
            number = entry.get('episode_number')
            return (number is not None, number or 0)

        entries = sorted(entries, key=_episode_order)

        return self.playlist_result(
            entries, bangumi_id,
            season_info.get('bangumi_title'), season_info.get('evaluate'))