_ Git - youtube-dl/blob - youtube_dl/extractor/bilibili.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import hashlib
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..compat import compat_parse_qs
   9 from ..utils import (
  10     int_or_none,
  11     float_or_none,
  12     unified_timestamp,
  13     urlencode_postdata,
  14 )
  15
  16
  17 class BiliBiliIE(InfoExtractor):
  18     _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/v/)(?P<id>\d+)'
  19
  20     _TEST = {
  21         'url': 'http://www.bilibili.tv/video/av1074402/',
  22         'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e',
  23         'info_dict': {
  24             'id': '1074402',
  25             'ext': 'mp4',
  26             'title': '【金坷垃】金泡沫',
  27             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
  28             'duration': 308.315,
  29             'timestamp': 1398012660,
  30             'upload_date': '20140420',
  31             'thumbnail': r're:^https?://.+\.jpg',
  32             'uploader': '菊子桑',
  33             'uploader_id': '156160',
  34         },
  35     }
  36
  37     _APP_KEY = '6f90a59ac58a4123'
  38     _BILIBILI_KEY = '0bfd84cc3940035173f35e6777508326'
  39
  40     def _real_extract(self, url):
  41         video_id = self._match_id(url)
  42         webpage = self._download_webpage(url, video_id)
  43
  44         if 'anime/v' not in url:
  45             cid = compat_parse_qs(self._search_regex(
  46                 [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
  47                  r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
  48                 webpage, 'player parameters'))['cid'][0]
  49         else:
  50             js = self._download_json(
  51                 'http://bangumi.bilibili.com/web_api/get_source', video_id,
  52                 data=urlencode_postdata({'episode_id': video_id}),
  53                 headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'})
  54             cid = js['result']['cid']
  55
  56         payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid)
  57         sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
  58
  59         video_info = self._download_json(
  60             'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign),
  61             video_id, note='Downloading video info page')
  62
  63         entries = []
  64
  65         for idx, durl in enumerate(video_info['durl']):
  66             formats = [{
  67                 'url': durl['url'],
  68                 'filesize': int_or_none(durl['size']),
  69             }]
  70             for backup_url in durl.get('backup_url', []):
  71                 formats.append({
  72                     'url': backup_url,
  73                     # backup URLs have lower priorities
  74                     'preference': -2 if 'hd.mp4' in backup_url else -3,
  75                 })
  76
  77             self._sort_formats(formats)
  78
  79             entries.append({
  80                 'id': '%s_part%s' % (video_id, idx),
  81                 'duration': float_or_none(durl.get('length'), 1000),
  82                 'formats': formats,
  83             })
  84
  85         title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title')
  86         description = self._html_search_meta('description', webpage)
  87         timestamp = unified_timestamp(self._html_search_regex(
  88             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False))
  89         thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
  90
  91         # TODO 'view_count' requires deobfuscating Javascript
  92         info = {
  93             'id': video_id,
  94             'title': title,
  95             'description': description,
  96             'timestamp': timestamp,
  97             'thumbnail': thumbnail,
  98             'duration': float_or_none(video_info.get('timelength'), scale=1000),
  99         }
 100
 101         uploader_mobj = re.search(
 102             r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"',
 103             webpage)
 104         if uploader_mobj:
 105             info.update({
 106                 'uploader': uploader_mobj.group('name'),
 107                 'uploader_id': uploader_mobj.group('id'),
 108             })
 109
 110         for entry in entries:
 111             entry.update(info)
 112
 113         if len(entries) == 1:
 114             return entries[0]
 115         else:
 116             for idx, entry in enumerate(entries):
 117                 entry['id'] = '%s_part%d' % (video_id, (idx + 1))
 118
 119             return {
 120                 '_type': 'multi_video',
 121                 'id': video_id,
 122                 'title': title,
 123                 'description': description,
 124                 'entries': entries,
 125             }