_ Git - youtube-dl/blob - youtube_dl/extractor/tagesschau.py

   1 # -*- coding: utf-8 -*-
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7
   8
   9 class TagesschauIE(InfoExtractor):
  10     _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/video/video(?P<id>-?\d+)\.html'
  11
  12     _TESTS = [{
  13         'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
  14         'md5': 'bcdeac2194fb296d599ce7929dfa4009',
  15         'info_dict': {
  16             'id': '1399128',
  17             'ext': 'mp4',
  18             'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen',
  19             'description': 'md5:69da3c61275b426426d711bde96463ab',
  20             'thumbnail': 're:^http:.*\.jpg$',
  21         },
  22     }, {
  23         'url': 'http://www.tagesschau.de/multimedia/video/video-196.html',
  24         'md5': '8aaa8bf3ae1ca2652309718c03019128',
  25         'info_dict': {
  26             'id': '196',
  27             'ext': 'mp4',
  28             'title': 'Ukraine-Konflikt: Klitschko in Kiew als B\xfcrgermeister vereidigt',
  29             'description': 'md5:f22e4af75821d174fa6c977349682691',
  30             'thumbnail': 're:http://.*\.jpg',
  31         },
  32     }]
  33
  34     def _real_extract(self, url):
  35         mobj = re.match(self._VALID_URL, url)
  36         video_id = mobj.group('id')
  37
  38         if video_id.startswith('-'):
  39             display_id = video_id.strip('-')
  40         else:
  41             display_id = video_id
  42
  43         webpage = self._download_webpage(url, display_id)
  44
  45         playerpage = self._download_webpage(
  46             'http://www.tagesschau.de/multimedia/video/video%s~player_autoplay-true.html' % video_id, display_id, 'Downloading player page')
  47
  48         medias = re.findall(r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"', playerpage)
  49
  50         formats = []
  51         for url, ext, res in medias:
  52
  53             if res == 's':
  54                 res = 'small'
  55                 quality = 0
  56             elif res == 'm':
  57                 res = 'medium'
  58                 quality = 1
  59             elif res == 'l':
  60                 res = 'large'
  61                 quality = 2
  62
  63             formats.append({
  64                 'format_id': res+'_'+ext,
  65                 'url': url,
  66                 'quality': quality,
  67                 'ext': ext,
  68             })
  69
  70         self._sort_formats(formats)
  71
  72         thumbnail = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
  73
  74         return {
  75             'id': display_id,
  76             'title': self._og_search_title(webpage).strip(),
  77             'thumbnail': 'http://www.tagesschau.de'+thumbnail,
  78             'formats': formats,
  79             'description': self._og_search_description(webpage).strip(),
  80         }