_ Git - youtube-dl/blob - youtube_dl/extractor/bilibili.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5 import json
   6
   7 from .common import InfoExtractor
   8 from ..compat import (
   9     compat_etree_fromstring,
  10 )
  11 from ..utils import (
  12     int_or_none,
  13     unescapeHTML,
  14     ExtractorError,
  15 )
  16
  17
  18 class BiliBiliIE(InfoExtractor):
  19     _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?'
  20
  21     _TESTS = [{
  22         'url': 'http://www.bilibili.tv/video/av1074402/',
  23         'md5': '2c301e4dab317596e837c3e7633e7d86',
  24         'info_dict': {
  25             'id': '1554319',
  26             'ext': 'flv',
  27             'title': '【金坷垃】金泡沫',
  28             'duration': 308313,
  29             'upload_date': '20140420',
  30             'thumbnail': 're:^https?://.+\.jpg',
  31             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
  32             'timestamp': 1397983878,
  33             'uploader': '菊子桑',
  34         },
  35     }, {
  36         'url': 'http://www.bilibili.com/video/av1041170/',
  37         'info_dict': {
  38             'id': '1041170',
  39             'title': '【BD1080P】刀语【诸神&异域】',
  40             'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦！~',
  41             'uploader': '枫叶逝去',
  42             'timestamp': 1396501299,
  43         },
  44         'playlist_count': 9,
  45     }]
  46
  47     def _real_extract(self, url):
  48         mobj = re.match(self._VALID_URL, url)
  49         video_id = mobj.group('id')
  50         page_num = mobj.group('page_num') or '1'
  51
  52         view_data = self._download_json(
  53             'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num),
  54             video_id)
  55         if 'error' in view_data:
  56             raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True)
  57
  58         cid = view_data['cid']
  59         title = unescapeHTML(view_data['title'])
  60
  61         page = self._download_webpage(
  62             'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid,
  63             cid,
  64             'Downloading page %s/%s' % (page_num, view_data['pages'])
  65         )
  66         try:
  67             err_info = json.loads(page)
  68             raise ExtractorError(
  69                 'BiliBili said: ' + err_info['error_text'], expected=True)
  70         except ValueError:
  71             pass
  72
  73         doc = compat_etree_fromstring(page)
  74
  75         entries = []
  76
  77         for durl in doc.findall('./durl'):
  78             size = durl.find('./filesize|./size')
  79             formats = [{
  80                 'url': durl.find('./url').text,
  81                 'filesize': int_or_none(size.text) if size else None,
  82                 'ext': 'flv',
  83             }]
  84             backup_urls = durl.find('./backup_url')
  85             if backup_urls is not None:
  86                 for backup_url in backup_urls.findall('./url'):
  87                     formats.append({'url': backup_url.text})
  88             formats.reverse()
  89
  90             entries.append({
  91                 'id': '%s_part%s' % (cid, durl.find('./order').text),
  92                 'title': title,
  93                 'duration': int_or_none(durl.find('./length').text) // 1000,
  94                 'formats': formats,
  95             })
  96
  97         info = {
  98             'id': str(cid),
  99             'title': title,
 100             'description': view_data.get('description'),
 101             'thumbnail': view_data.get('pic'),
 102             'uploader': view_data.get('author'),
 103             'timestamp': int_or_none(view_data.get('created')),
 104             'view_count': view_data.get('play'),
 105             'duration': int_or_none(doc.find('./timelength').text),
 106         }
 107
 108         if len(entries) == 1:
 109             entries[0].update(info)
 110             return entries[0]
 111         else:
 112             info.update({
 113                 '_type': 'multi_video',
 114                 'id': video_id,
 115                 'entries': entries,
 116             })
 117             return info