_ Git - youtube-dl/blob - youtube_dl/extractor/tagesschau.py

   1 # -*- coding: utf-8 -*-
   2 from __future__ import unicode_literals
   3
   4 import re
   5 import json
   6
   7 from .common import InfoExtractor
   8
   9
  10 class TagesschauIE(InfoExtractor):
  11     _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/video/video(?P<id>-?\d+)\.html'
  12
  13     _TESTS = [{
  14         'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
  15         'md5': 'bcdeac2194fb296d599ce7929dfa4009',
  16         'info_dict': {
  17             'id': '1399128',
  18             'ext': 'mp4',
  19             'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen',
  20             'description': 'md5:69da3c61275b426426d711bde96463ab',
  21             'thumbnail': 're:^http:.*\.jpg$',
  22         },
  23     }, {
  24         'url': 'http://www.tagesschau.de/multimedia/video/video-196.html',
  25         'md5': '8aaa8bf3ae1ca2652309718c03019128',
  26         'info_dict': {
  27             'id': '196',
  28             'ext': 'mp4',
  29             'title': 'Ukraine-Konflikt: Klitschko in Kiew als B\xfcrgermeister vereidigt',
  30             'description': 'md5:f22e4af75821d174fa6c977349682691',
  31             'thumbnail': 're:http://.*\.jpg',
  32         },
  33     }]
  34
  35     def _real_extract(self, url):
  36         mobj = re.match(self._VALID_URL, url)
  37         video_id = mobj.group('id')
  38
  39         if video_id.startswith('-'):
  40             display_id = video_id.strip('-')
  41         else:
  42             display_id = video_id
  43
  44         webpage = self._download_webpage(url, display_id)
  45
  46         playerpage = self._download_webpage(
  47             'http://www.tagesschau.de/multimedia/video/video%s~player_autoplay-true.html' % video_id, display_id)
  48
  49         medias = re.findall(r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"', playerpage)
  50
  51         formats = []
  52         for url, ext, res in medias:
  53
  54             if res == 's':
  55                 res = 'small'
  56                 quality = 0
  57             elif res == 'm':
  58                 res = 'medium'
  59                 quality = 1
  60             elif res == 'l':
  61                 res = 'large'
  62                 quality = 2
  63
  64             formats.append({
  65                 'format_id': res+'_'+ext,
  66                 'url': url,
  67                 'quality': quality,
  68                 'ext': ext,
  69             })
  70
  71         self._sort_formats(formats)
  72
  73         thumbnail = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
  74
  75         return {
  76             'id': display_id,
  77             'title': self._og_search_title(webpage).strip(),
  78             'thumbnail': 'http://www.tagesschau.de'+thumbnail,
  79             'formats': formats,
  80             'description': self._og_search_description(webpage).strip(),
  81         }