git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/mdr.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8 )
   9
  10
  11 class MDRIE(InfoExtractor):
  12     _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)'
  13
  14     # No tests, MDR regularily deletes its videos
  15     _TEST = {
  16         'url': 'http://www.mdr.de/fakt/video189002.html',
  17         'only_matching': True,
  18     }
  19
  20     def _real_extract(self, url):
  21         m = re.match(self._VALID_URL, url)
  22         video_id = m.group('video_id')
  23         domain = m.group('domain')
  24
  25         # determine title and media streams from webpage
  26         html = self._download_webpage(url, video_id)
  27
  28         title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title')
  29         xmlurl = self._search_regex(
  30             r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL')
  31
  32         doc = self._download_xml(domain + xmlurl, video_id)
  33         formats = []
  34         for a in doc.findall('./assets/asset'):
  35             url_el = a.find('.//progressiveDownloadUrl')
  36             if url_el is None:
  37                 continue
  38             abr = int(a.find('bitrateAudio').text) // 1000
  39             media_type = a.find('mediaType').text
  40             format = {
  41                 'abr': abr,
  42                 'filesize': int(a.find('fileSize').text),
  43                 'url': url_el.text,
  44             }
  45
  46             vbr_el = a.find('bitrateVideo')
  47             if vbr_el is None:
  48                 format.update({
  49                     'vcodec': 'none',
  50                     'format_id': '%s-%d' % (media_type, abr),
  51                 })
  52             else:
  53                 vbr = int(vbr_el.text) // 1000
  54                 format.update({
  55                     'vbr': vbr,
  56                     'width': int(a.find('frameWidth').text),
  57                     'height': int(a.find('frameHeight').text),
  58                     'format_id': '%s-%d' % (media_type, vbr),
  59                 })
  60             formats.append(format)
  61         self._sort_formats(formats)
  62
  63         return {
  64             'id': video_id,
  65             'title': title,
  66             'formats': formats,
  67         }