_ Git - youtube-dl/blob - youtube_dl/extractor/kika.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5 from ..utils import ExtractorError
   6
   7
   8 class KikaIE(InfoExtractor):
   9     _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|(?:einzel)?sendung)(?P<id>\d+).*'
  10
  11     _TESTS = [
  12         {
  13             'url': 'http://www.kika.de/baumhaus/videos/video19636.html',
  14             'md5': '4930515e36b06c111213e80d1e4aad0e',
  15             'info_dict': {
  16                 'id': '19636',
  17                 'ext': 'mp4',
  18                 'title': 'Baumhaus vom 30. Oktober 2015',
  19                 'description': None
  20             }
  21         },
  22         {
  23             'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html',
  24             'md5': '5fe9c4dd7d71e3b238f04b8fdd588357',
  25             'info_dict': {
  26                 'id': '8182',
  27                 'ext': 'mp4',
  28                 'title': 'Beutolomäus und der geheime Weihnachtswunsch',
  29                 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd'
  30             }
  31         },
  32         {
  33             'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html',
  34             'md5': '4930515e36b06c111213e80d1e4aad0e',
  35             'info_dict': {
  36                 'id': '19636',
  37                 'ext': 'mp4',
  38                 'title': 'Baumhaus vom 30. Oktober 2015',
  39                 'description': None
  40             }
  41         },
  42         {
  43             'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html',
  44             'md5': '5fe9c4dd7d71e3b238f04b8fdd588357',
  45             'info_dict': {
  46                 'id': '8182',
  47                 'ext': 'mp4',
  48                 'title': 'Beutolomäus und der geheime Weihnachtswunsch',
  49                 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd'
  50             }
  51         }
  52     ]
  53
  54     def _real_extract(self, url):
  55         # broadcast_id may be the same as the video_id
  56         broadcast_id = self._match_id(url)
  57         webpage = self._download_webpage(url, broadcast_id)
  58
  59         xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml'
  60         video_id = self._search_regex(xml_re, webpage, "xml_url", default=None)
  61         if not video_id:
  62             # Video is not available online
  63             err_msg = 'Video %s is not available online' % broadcast_id
  64             raise ExtractorError(err_msg, expected=True)
  65
  66         xml_url = 'http://www.kika.de/video%s-avCustom.xml' % (video_id)
  67         xml_tree = self._download_xml(xml_url, video_id)
  68
  69         title = xml_tree.find('title').text
  70         webpage_url = xml_tree.find('htmlUrl').text
  71
  72         # Try to get the description, not available for all videos
  73         try:
  74             broadcast_elem = xml_tree.find('broadcast')
  75             description = broadcast_elem.find('broadcastDescription').text
  76         except AttributeError:
  77             # No description available
  78             description = None
  79
  80         # duration string format is mm:ss (even if it is >= 1 hour, e.g. 78:42)
  81         tmp = xml_tree.find('duration').text.split(':')
  82         duration = int(tmp[0]) * 60 + int(tmp[1])
  83
  84         formats_list = []
  85         for elem in xml_tree.find('assets'):
  86             format_dict = {}
  87             format_dict['url'] = elem.find('progressiveDownloadUrl').text
  88             format_dict['ext'] = elem.find('mediaType').text.lower()
  89             format_dict['format'] = elem.find('profileName').text
  90             format_dict['width'] = int(elem.find('frameWidth').text)
  91             format_dict['height'] = int(elem.find('frameHeight').text)
  92             format_dict['resolution'] = '%dx%d' % (format_dict['width'],
  93                                                    format_dict['height'])
  94             format_dict['abr'] = int(elem.find('bitrateAudio').text)
  95             format_dict['vbr'] = int(elem.find('bitrateVideo').text)
  96             format_dict['tbr'] = format_dict['abr'] + format_dict['vbr']
  97             format_dict['filesize'] = int(elem.find('fileSize').text)
  98
  99             formats_list.append(format_dict)
 100
 101         # Sort by resolution (=quality)
 102         formats_list.sort(key=lambda x: x['width'] * x['height'])
 103
 104         return {
 105             'id': video_id,
 106             'title': title,
 107             'description': description,
 108             'formats': formats_list,
 109             'duration': duration,
 110             'webpage_url': webpage_url
 111         }