Merge branch 'bilibili' of https://github.com/PeterDing/youtube-dl into PeterDing...
[youtube-dl] / youtube_dl / extractor / bilibili.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import hashlib
5 import re
6
7 from .common import InfoExtractor
8 from ..compat import compat_parse_qs
9 from ..utils import (
10     int_or_none,
11     float_or_none,
12     unified_timestamp,
13 )
14
# Headers sent with the form-encoded POST request that BiliBiliIE makes to
# the bangumi `get_source` endpoint.
HEADERS = {
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
}
18
19
class BiliBiliIE(InfoExtractor):
    """Information extractor for bilibili video pages and bangumi episodes.

    Handles ``/video/av<ID>`` pages on ``www.bilibili.(tv|com)`` and
    ``/anime/v/<ID>`` episode pages on ``bangumi.bilibili.com``.
    """

    # Dots in the host prefixes are escaped so they only match a literal '.',
    # and the helper groups are non-capturing; only `id` is a real group.
    _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/v/)(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://www.bilibili.tv/video/av1074402/',
        'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e',
        'info_dict': {
            'id': '1074402',
            'ext': 'mp4',
            'title': '【金坷垃】金泡沫',
            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
            'duration': 308.315,
            'timestamp': 1398012660,
            'upload_date': '20140420',
            'thumbnail': r're:^https?://.+\.jpg',
            'uploader': '菊子桑',
            'uploader_id': '156160',
        },
    }, {
        'url': 'http://www.bilibili.com/video/av1041170/',
        'info_dict': {
            'id': '1041170',
            'ext': 'mp4',
            'title': '【BD1080P】刀语【诸神&异域】',
            'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~',
            'duration': 3382.259,
            'timestamp': 1396530060,
            'upload_date': '20140403',
            'thumbnail': r're:^https?://.+\.jpg',
            'uploader': '枫叶逝去',
            'uploader_id': '520116',
        },
    }, {
        'url': 'http://www.bilibili.com/video/av4808130/',
        'info_dict': {
            'id': '4808130',
            'ext': 'mp4',
            'title': '【长篇】哆啦A梦443【钉铛】',
            'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
            'duration': 1493.995,
            'timestamp': 1464564180,
            'upload_date': '20160529',
            'thumbnail': r're:^https?://.+\.jpg',
            'uploader': '喜欢拉面',
            'uploader_id': '151066',
        },
    }, {
        # Missing upload time
        'url': 'http://www.bilibili.com/video/av1867637/',
        'info_dict': {
            'id': '1867637',
            'ext': 'mp4',
            'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】',
            'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】',
            'duration': 5760.0,
            'uploader': '黑夜为猫',
            'uploader_id': '610729',
            'thumbnail': r're:^https?://.+\.jpg',
        },
        'params': {
            # Just to test metadata extraction
            'skip_download': True,
        },
        'expected_warnings': ['upload time'],
    }, {
        'url': 'http://bangumi.bilibili.com/anime/v/40068',
        'md5': '08d539a0884f3deb7b698fb13ba69696',
        'info_dict': {
            'id': '40068',
            'ext': 'mp4',
            'duration': 1402.357,
            'title': '混沌武士 : 第7集 四面楚歌 A Risky Racket',
            'description': "故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子”无幻,说以50个丸子帮她搞定这群人,风觉得他莫名其妙,也就没多搭理他。而在这时,风因为一个意外而将茶水泼在了恶霸头领——龙次郎身上。愤怒的恶霸们欲将风的手指砍掉,风在无奈中大喊道:“丸子100个!”……   另一方面,龙次郎的父亲也就是当地的代官,依仗自己有着雄厚的保镖实力,在当地欺压穷人,当看到一穷人无法交齐足够的钱过桥时,欲下令将其杀死,武士仁看不惯这一幕,于是走上前,与代官的保镖交手了……   酒馆内,因为风答应给无幻100个团子,无幻将恶霸们打败了,就在这时,仁进来了。好战的无幻立刻向仁发了战书,最后两败俱伤,被代官抓入牢房,预计第二天斩首……   得知该状况的风,为报救命之恩,来到了刑场,利用烟花救出了无幻和仁。而风则以救命恩人的身份,命令二人和她一起去寻找带着向日葵香味的武士……(by百科)",
            # Was 're:^http?://', which matches "htt"/"http" rather than
            # "http"/"https".
            'thumbnail': r're:^https?://.+\.jpg',
        },
    }]

    # Key pair used to build and sign the playurl API query in _real_extract.
    _APP_KEY = '6f90a59ac58a4123'
    _BILIBILI_KEY = '0bfd84cc3940035173f35e6777508326'

    def _real_extract(self, url):
        """Extract video information for a bilibili video or bangumi episode.

        Resolves the player's ``cid`` (from the web page for regular videos,
        from the bangumi ``get_source`` endpoint for episodes), queries the
        signed playurl interface for the media URLs, and scrapes the
        remaining metadata from the web page.

        Returns a single info dict when the video has one part, otherwise a
        ``multi_video`` result with one entry per part.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        is_episode = 'anime/v' in url
        if is_episode:
            source = self._download_json(
                'http://bangumi.bilibili.com/web_api/get_source', video_id,
                # POST bodies must be bytes on Python 3; with
                # unicode_literals the formatted string is text.
                data=('episode_id=%s' % video_id).encode('utf-8'),
                headers=HEADERS)
            cid = source['result']['cid']
        else:
            cid = compat_parse_qs(self._search_regex(
                [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
                 r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
                webpage, 'player parameters'))['cid'][0]

        # The signature is the MD5 of the query string followed by the key.
        payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid)
        sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()

        video_info = self._download_json(
            'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign),
            video_id, note='Downloading video info page')

        entries = []
        for durl in video_info['durl']:
            formats = [{
                'url': durl['url'],
                'filesize': int_or_none(durl.get('size')),
            }]
            # 'backup_url' may be absent or null; treat both as "no backups".
            for backup_url in durl.get('backup_url') or []:
                formats.append({
                    'url': backup_url,
                    # backup URLs have lower priorities
                    'preference': -2 if 'hd.mp4' in backup_url else -3,
                })

            self._sort_formats(formats)

            entries.append({
                'duration': float_or_none(durl.get('length'), 1000),
                'formats': formats,
            })

        title = self._html_search_regex(r'<h1[^>]+title="([^"]+)">', webpage, 'title')
        description = self._html_search_meta('description', webpage)
        timestamp = unified_timestamp(self._html_search_regex(
            r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False))
        thumbnail = self._html_search_meta(
            'og:image' if is_episode else 'thumbnailUrl', webpage)

        # TODO 'view_count' requires deobfuscating Javascript
        info = {
            'id': video_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'thumbnail': thumbnail,
            'duration': float_or_none(video_info.get('timelength'), scale=1000),
        }

        uploader_mobj = re.search(
            r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"',
            webpage)
        if uploader_mobj:
            info.update({
                'uploader': uploader_mobj.group('name'),
                'uploader_id': uploader_mobj.group('id'),
            })

        if len(entries) == 1:
            # Single part: the page-level metadata (including the total
            # duration) describes the one and only entry.
            entries[0].update(info)
            return entries[0]

        for part_number, entry in enumerate(entries, 1):
            part_duration = entry.get('duration')
            entry.update(info)
            entry['id'] = '%s_part%d' % (video_id, part_number)
            # Keep each part's own length rather than the total duration
            # that `info` carries for the whole video.
            if part_duration is not None:
                entry['duration'] = part_duration

        return {
            '_type': 'multi_video',
            'id': video_id,
            'title': title,
            'description': description,
            'entries': entries,
        }