X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Ftagesschau.py;h=73e7657d4bec7b1bc37753923744d92b769d8843;hb=55801fc76e2813de9a84eaa830d70ed73cb44463;hp=25b9864add9dc8422a5948111d25ea8243e10441;hpb=c09cbf0ed91ed54882abe6633b1e70e8a8b7db2d;p=youtube-dl diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 25b9864ad..73e7657d4 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -4,31 +4,63 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import parse_filesize class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/video/video(?P-?[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_[^/#?]+?)?\.html' _TESTS = [{ - 'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html', - 'md5': 'bcdeac2194fb296d599ce7929dfa4009', + 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', + 'md5': '917a228bc7df7850783bc47979673a09', 'info_dict': { - 'id': '1399128', + 'id': '102143', 'ext': 'mp4', - 'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen', - 'description': 'md5:69da3c61275b426426d711bde96463ab', - 'thumbnail': 're:^http:.*\.jpg$', + 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', + 'description': 'md5:171feccd9d9b3dd54d05d501568f6359', + 'thumbnail': 're:^https?:.*\.jpg$', }, }, { - 'url': 'http://www.tagesschau.de/multimedia/video/video-5964.html', - 'md5': '66652566900963a3f962333579eeffcf', + 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', + 'md5': '3c54c1f6243d279b706bde660ceec633', 'info_dict': { - 'id': '5964', + 'id': '5727', 'ext': 'mp4', - 'title': 'Nahost-Konflikt: Israel bombadiert Ziele im Gazastreifen und Westjordanland', - 'description': 'md5:07bfc78c48eec3145ed4805299a1900a', - 'thumbnail': 're:http://.*\.jpg', + 'description': 'md5:695c01bfd98b7e313c501386327aea59', + 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', + 'thumbnail': 're:^https?:.*\.jpg$', }, + }, { + 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html', + 'md5': 'aef45de271c4bf0a5db834aa40bf774c', + 'info_dict': { + 'id': '18407', + 'ext': 'mp3', + 'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', + 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', + 'thumbnail': 're:^https?:.*\.jpg$', + }, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', + 'only_matching': True, }] _FORMATS = { @@ -38,42 +70,86 @@ class TagesschauIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - if video_id.startswith('-'): - display_id = video_id.strip('-') - else: - display_id = video_id - + video_id = self._match_id(url) + display_id = video_id.lstrip('-') webpage = self._download_webpage(url, display_id) - playerpage = self._download_webpage( - 'http://www.tagesschau.de/multimedia/video/video%s~player_autoplay-true.html' % video_id, - display_id, 'Downloading player page') - - medias = re.findall( - r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"', - playerpage) + player_url = self._html_search_meta( + 'twitter:player', webpage, 'player URL', default=None) + if player_url: + playerpage = self._download_webpage( + player_url, display_id, 'Downloading player page') - formats = [] - for url, ext, res in medias: - f = { - 'format_id': res + '_' + ext, - 'url': url, - 'ext': ext, - } - f.update(self._FORMATS.get(res, {})) - formats.append(f) + formats = [] + for media in re.finditer( + r'''(?x) + (?P["\'])(?Phttp://media.+?)(?P=q_url) + ,\s*type:(?P["\'])(?Pvideo|audio)/(?P.+?)(?P=q_type) + (?:,\s*quality:(?P["\'])(?P.+?)(?P=q_quality))? + ''', playerpage): + url = media.group('url') + type_ = media.group('type') + ext = media.group('ext') + res = media.group('quality') + f = { + 'format_id': '%s_%s' % (res, ext) if res else ext, + 'url': url, + 'ext': ext, + 'vcodec': 'none' if type_ == 'audio' else None, + } + f.update(self._FORMATS.get(res, {})) + formats.append(f) + thumbnail = self._og_search_thumbnail(playerpage) + title = self._og_search_title(webpage).strip() + description = self._og_search_description(webpage).strip() + else: + download_text = self._search_regex( + r'(?s)

Wir bieten dieses Video in folgenden Formaten zum Download an:

\s*
(.*?)
\s*

', + webpage, 'download links') + links = re.finditer( + r'

', + download_text) + formats = [] + for l in links: + format_id = self._search_regex( + r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') + format = { + 'format_id': format_id, + 'url': l.group('url'), + 'format_name': l.group('name'), + } + m = re.match( + r'''(?x) + Video:\s*(?P[a-zA-Z0-9/._-]+)\s*&\#10; + (?P[0-9]+)x(?P[0-9]+)px&\#10; + (?P[0-9]+)kbps&\#10; + Audio:\s*(?P[0-9]+)kbps,\s*(?P[A-Za-z\.0-9]+)&\#10; + Größe:\s*(?P[0-9.,]+\s+[a-zA-Z]*B)''', + l.group('title')) + if m: + format.update({ + 'format_note': m.group('audio_desc'), + 'vcodec': m.group('vcodec'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + 'abr': int(m.group('abr')), + 'vbr': int(m.group('vbr')), + 'filesize_approx': parse_filesize(m.group('filesize_approx')), + }) + formats.append(format) + thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_regex( + r'(?s)

(.*?)

', + webpage, 'description', default=None) + title = self._html_search_regex( + r'(.*?)', webpage, 'title') self._sort_formats(formats) - thumbnail = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1] - return { 'id': display_id, - 'title': self._og_search_title(webpage).strip(), - 'thumbnail': 'http://www.tagesschau.de' + thumbnail, + 'title': title, + 'thumbnail': thumbnail, 'formats': formats, - 'description': self._og_search_description(webpage).strip(), + 'description': description, }